[med-svn] [salmon] 01/02: Imported Upstream version 0.4.2

Sat Sep 19 06:22:53 UTC 2015

This is an automated email from the git hooks/post-receive script.

misterc-guest pushed a commit to branch master
in repository salmon.

commit c7eb293c9e8a95ecbeee5ffd4f3cbdcb396c05c3
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date:   Fri Sep 18 18:35:38 2015 -0700

    Imported Upstream version 0.4.2
---
 .gitignore                                         |   28 +
 .travis.yml                                        |   40 +
 CMakeLists.txt                                     |  573 ++++
 LICENSE                                            |  674 ++++
 README.md                                          |   27 +
 cmake/Modules/FindJemalloc.cmake                   |   46 +
 cmake/Modules/FindTBB.cmake                        |  282 ++
 cmake/Modules/FindTcmalloc.cmake                   |   47 +
 cmake/PostInstall.cmake                            |   19 +
 cmake/SimpleTest.cmake                             |   35 +
 cmake/TestSalmon.cmake                             |   35 +
 doc/FileFormats.md                                 |   14 +
 doc/Makefile                                       |  177 ++
 doc/build/doctrees/building.doctree                |  Bin 0 -> 35989 bytes
 doc/build/doctrees/environment.pickle              |  Bin 0 -> 6327 bytes
 doc/build/doctrees/index.doctree                   |  Bin 0 -> 5516 bytes
 doc/build/html/.buildinfo                          |    4 +
 doc/build/html/_sources/building.txt               |  203 ++
 doc/build/html/_sources/index.txt                  |   22 +
 doc/build/html/_static/ajax-loader.gif             |  Bin 0 -> 673 bytes
 doc/build/html/_static/basic.css                   |  537 ++++
 doc/build/html/_static/comment-bright.png          |  Bin 0 -> 3500 bytes
 doc/build/html/_static/comment-close.png           |  Bin 0 -> 3578 bytes
 doc/build/html/_static/comment.png                 |  Bin 0 -> 3445 bytes
 doc/build/html/_static/default.css                 |  256 ++
 doc/build/html/_static/doctools.js                 |  238 ++
 doc/build/html/_static/down-pressed.png            |  Bin 0 -> 368 bytes
 doc/build/html/_static/down.png                    |  Bin 0 -> 363 bytes
 doc/build/html/_static/file.png                    |  Bin 0 -> 392 bytes
 doc/build/html/_static/jquery.js                   |    2 +
 doc/build/html/_static/minus.png                   |  Bin 0 -> 199 bytes
 doc/build/html/_static/plus.png                    |  Bin 0 -> 199 bytes
 doc/build/html/_static/pygments.css                |   62 +
 doc/build/html/_static/searchtools.js              |  622 ++++
 doc/build/html/_static/sidebar.js                  |  159 +
 doc/build/html/_static/underscore.js               |   31 +
 doc/build/html/_static/up-pressed.png              |  Bin 0 -> 372 bytes
 doc/build/html/_static/up.png                      |  Bin 0 -> 363 bytes
 doc/build/html/_static/websupport.js               |  808 +++++
 doc/build/html/building.html                       |  284 ++
 doc/build/html/genindex.html                       |   93 +
 doc/build/html/index.html                          |  129 +
 doc/build/html/objects.inv                         |  Bin 0 -> 206 bytes
 doc/build/html/search.html                         |  100 +
 doc/build/html/searchindex.js                      |    1 +
 doc/license-header.txt                             |   22 +
 doc/source/ReadLibraryIllustration.png             |  Bin 0 -> 43418 bytes
 doc/source/building.rst                            |  113 +
 doc/source/conf.py                                 |  263 ++
 doc/source/index.rst                               |   24 +
 doc/source/library_type.rst                        |  105 +
 doc/source/license.rst                             |   16 +
 doc/source/sailfish.rst                            |   89 +
 doc/source/salmon.rst                              |  322 ++
 external/.gitignore                                |   28 +
 include/AlignmentGroup.hpp                         |   65 +
 include/AlignmentLibrary.hpp                       |  290 ++
 include/AlignmentModel.hpp                         |  111 +
 include/AtomicMatrix.hpp                           |  118 +
 include/BAMQueue.hpp                               |  147 +
 include/BAMQueue.tpp                               |  819 +++++
 include/BiasIndex.hpp                              |  109 +
 include/BinaryLiteral.hpp                          |   77 +
 include/ClusterForest.hpp                          |  158 +
 include/CollapsedEMOptimizer.hpp                   |   28 +
 include/CollapsedGibbsSampler.hpp                  |   27 +
 include/CollapsedIterativeOptimizer.hpp.bak        | 2069 ++++++++++++
 include/CommonTypes.hpp                            |   48 +
 include/EquivalenceClassBuilder.hpp                |  143 +
 include/ErrorModel.hpp                             |   74 +
 include/FASTAParser.hpp                            |   16 +
 include/ForgettingMassCalculator.hpp               |  138 +
 include/FragmentLengthDistribution.hpp             |  153 +
 include/FragmentList.hpp                           |   42 +
 include/FragmentStartPositionDistribution.hpp      |  120 +
 include/GenomicFeature.hpp                         |  215 ++
 include/HeptamerIndex.hpp                          |   41 +
 include/IOUtils.hpp                                |   15 +
 include/LibraryFormat.hpp                          |  101 +
 include/MatrixTools.hpp                            |  573 ++++
 include/MicroOpts.hpp                              |   16 +
 include/MiniBatchInfo.hpp                          |   74 +
 include/MultinomialSampler.hpp                     |   53 +
 include/MultithreadedBAMParser.hpp                 |   68 +
 include/NullFragmentFilter.hpp                     |   10 +
 include/OutputUnmappedFilter.hpp                   |   46 +
 include/PCA.hpp                                    |  108 +
 include/PairSequenceParser.hpp                     |  193 ++
 include/PartitionRefiner.hpp                       |   42 +
 include/PerfectHashIndex.hpp                       |  174 +
 include/ReadExperiment.hpp                         |  458 +++
 include/ReadLibrary.hpp                            |  238 ++
 include/ReadPair.hpp                               |  137 +
 include/ReadProducer.hpp                           |   92 +
 include/SailfishUtils.hpp                          |  153 +
 include/SalmonConfig.hpp                           |   35 +
 include/SalmonMath.hpp                             |   48 +
 include/SalmonOpts.hpp                             |  110 +
 include/SalmonStringUtils.hpp                      |  137 +
 include/SalmonUtils.hpp                            |  162 +
 include/Sampler.hpp                                |  489 +++
 include/SequenceBiasModel.hpp                      |   97 +
 include/SpinLock.hpp                               |   38 +
 include/StadenUtils.hpp                            |   20 +
 include/SufficientStatisticsQueue.hpp              |  103 +
 include/Transcript.hpp                             |  235 ++
 include/TranscriptAlignments.hpp                   |   20 +
 include/TranscriptCluster.hpp                      |  114 +
 include/TranscriptGeneMap.hpp                      |  148 +
 include/TranscriptGroup.hpp                        |   51 +
 include/UnpairedRead.hpp                           |   90 +
 include/VersionChecker.hpp                         |   52 +
 include/atomicops.h                                |  276 ++
 include/btree.h                                    | 2394 ++++++++++++++
 include/btree_container.h                          |  350 ++
 include/btree_map.h                                |  130 +
 include/btree_set.h                                |  121 +
 include/concurrentqueue.h                          | 3363 ++++++++++++++++++++
 include/count_main_cmdline.hpp                     |  711 +++++
 include/cuckoohash_config.h                        |   16 +
 include/cuckoohash_map.hh                          | 2145 +++++++++++++
 include/cuckoohash_util.h                          |   23 +
 include/eigen3/Eigen/Array                         |   11 +
 include/eigen3/Eigen/Cholesky                      |   32 +
 include/eigen3/Eigen/CholmodSupport                |   45 +
 include/eigen3/Eigen/Core                          |  376 +++
 include/eigen3/Eigen/Dense                         |    7 +
 include/eigen3/Eigen/Eigen                         |    2 +
 include/eigen3/Eigen/Eigen2Support                 |   95 +
 include/eigen3/Eigen/Eigenvalues                   |   48 +
 include/eigen3/Eigen/Geometry                      |   63 +
 include/eigen3/Eigen/Householder                   |   23 +
 include/eigen3/Eigen/IterativeLinearSolvers        |   40 +
 include/eigen3/Eigen/Jacobi                        |   26 +
 include/eigen3/Eigen/LU                            |   41 +
 include/eigen3/Eigen/LeastSquares                  |   32 +
 include/eigen3/Eigen/MetisSupport                  |   28 +
 include/eigen3/Eigen/OrderingMethods               |   66 +
 include/eigen3/Eigen/PaStiXSupport                 |   46 +
 include/eigen3/Eigen/PardisoSupport                |   30 +
 include/eigen3/Eigen/QR                            |   45 +
 include/eigen3/Eigen/QtAlignedMalloc               |   34 +
 include/eigen3/Eigen/SPQRSupport                   |   29 +
 include/eigen3/Eigen/SVD                           |   37 +
 include/eigen3/Eigen/Sparse                        |   27 +
 include/eigen3/Eigen/SparseCholesky                |   47 +
 include/eigen3/Eigen/SparseCore                    |   64 +
 include/eigen3/Eigen/SparseLU                      |   49 +
 include/eigen3/Eigen/SparseQR                      |   33 +
 include/eigen3/Eigen/StdDeque                      |   27 +
 include/eigen3/Eigen/StdList                       |   26 +
 include/eigen3/Eigen/StdVector                     |   27 +
 include/eigen3/Eigen/SuperLUSupport                |   59 +
 include/eigen3/Eigen/UmfPackSupport                |   36 +
 include/eigen3/Eigen/src/Cholesky/LDLT.h           |  604 ++++
 include/eigen3/Eigen/src/Cholesky/LLT.h            |  490 +++
 include/eigen3/Eigen/src/Cholesky/LLT_MKL.h        |  102 +
 .../Eigen/src/CholmodSupport/CholmodSupport.h      |  607 ++++
 include/eigen3/Eigen/src/Core/Array.h              |  308 ++
 include/eigen3/Eigen/src/Core/ArrayBase.h          |  228 ++
 include/eigen3/Eigen/src/Core/ArrayWrapper.h       |  264 ++
 include/eigen3/Eigen/src/Core/Assign.h             |  583 ++++
 include/eigen3/Eigen/src/Core/Assign_MKL.h         |  224 ++
 include/eigen3/Eigen/src/Core/BandMatrix.h         |  334 ++
 include/eigen3/Eigen/src/Core/Block.h              |  405 +++
 include/eigen3/Eigen/src/Core/BooleanRedux.h       |  154 +
 include/eigen3/Eigen/src/Core/CommaInitializer.h   |  154 +
 include/eigen3/Eigen/src/Core/CoreIterators.h      |   61 +
 include/eigen3/Eigen/src/Core/CwiseBinaryOp.h      |  229 ++
 include/eigen3/Eigen/src/Core/CwiseNullaryOp.h     |  864 +++++
 include/eigen3/Eigen/src/Core/CwiseUnaryOp.h       |  126 +
 include/eigen3/Eigen/src/Core/CwiseUnaryView.h     |  139 +
 include/eigen3/Eigen/src/Core/DenseBase.h          |  523 +++
 include/eigen3/Eigen/src/Core/DenseCoeffsBase.h    |  754 +++++
 include/eigen3/Eigen/src/Core/DenseStorage.h       |  339 ++
 include/eigen3/Eigen/src/Core/Diagonal.h           |  237 ++
 include/eigen3/Eigen/src/Core/DiagonalMatrix.h     |  313 ++
 include/eigen3/Eigen/src/Core/DiagonalProduct.h    |  130 +
 include/eigen3/Eigen/src/Core/Dot.h                |  263 ++
 include/eigen3/Eigen/src/Core/EigenBase.h          |  131 +
 include/eigen3/Eigen/src/Core/Flagged.h            |  140 +
 include/eigen3/Eigen/src/Core/ForceAlignedAccess.h |  146 +
 include/eigen3/Eigen/src/Core/Functors.h           |  985 ++++++
 include/eigen3/Eigen/src/Core/Fuzzy.h              |  150 +
 include/eigen3/Eigen/src/Core/GeneralProduct.h     |  635 ++++
 include/eigen3/Eigen/src/Core/GenericPacketMath.h  |  350 ++
 include/eigen3/Eigen/src/Core/GlobalFunctions.h    |   92 +
 include/eigen3/Eigen/src/Core/IO.h                 |  250 ++
 include/eigen3/Eigen/src/Core/Map.h                |  192 ++
 include/eigen3/Eigen/src/Core/MapBase.h            |  247 ++
 include/eigen3/Eigen/src/Core/MathFunctions.h      |  768 +++++
 include/eigen3/Eigen/src/Core/Matrix.h             |  405 +++
 include/eigen3/Eigen/src/Core/MatrixBase.h         |  556 ++++
 include/eigen3/Eigen/src/Core/NestByValue.h        |  111 +
 include/eigen3/Eigen/src/Core/NoAlias.h            |  134 +
 include/eigen3/Eigen/src/Core/NumTraits.h          |  150 +
 include/eigen3/Eigen/src/Core/PermutationMatrix.h  |  692 ++++
 include/eigen3/Eigen/src/Core/PlainObjectBase.h    |  790 +++++
 include/eigen3/Eigen/src/Core/ProductBase.h        |  290 ++
 include/eigen3/Eigen/src/Core/Random.h             |  152 +
 include/eigen3/Eigen/src/Core/Redux.h              |  408 +++
 include/eigen3/Eigen/src/Core/Ref.h                |  263 ++
 include/eigen3/Eigen/src/Core/Replicate.h          |  177 ++
 include/eigen3/Eigen/src/Core/ReturnByValue.h      |   88 +
 include/eigen3/Eigen/src/Core/Reverse.h            |  224 ++
 include/eigen3/Eigen/src/Core/Select.h             |  162 +
 include/eigen3/Eigen/src/Core/SelfAdjointView.h    |  314 ++
 include/eigen3/Eigen/src/Core/SelfCwiseBinaryOp.h  |  197 ++
 include/eigen3/Eigen/src/Core/SolveTriangular.h    |  260 ++
 include/eigen3/Eigen/src/Core/StableNorm.h         |  203 ++
 include/eigen3/Eigen/src/Core/Stride.h             |  108 +
 include/eigen3/Eigen/src/Core/Swap.h               |  126 +
 include/eigen3/Eigen/src/Core/Transpose.h          |  419 +++
 include/eigen3/Eigen/src/Core/Transpositions.h     |  436 +++
 include/eigen3/Eigen/src/Core/TriangularMatrix.h   |  839 +++++
 include/eigen3/Eigen/src/Core/VectorBlock.h        |   95 +
 include/eigen3/Eigen/src/Core/VectorwiseOp.h       |  642 ++++
 include/eigen3/Eigen/src/Core/Visitor.h            |  237 ++
 .../eigen3/Eigen/src/Core/arch/AltiVec/Complex.h   |  217 ++
 .../Eigen/src/Core/arch/AltiVec/PacketMath.h       |  501 +++
 .../eigen3/Eigen/src/Core/arch/Default/Settings.h  |   49 +
 include/eigen3/Eigen/src/Core/arch/NEON/Complex.h  |  253 ++
 .../eigen3/Eigen/src/Core/arch/NEON/PacketMath.h   |  419 +++
 include/eigen3/Eigen/src/Core/arch/SSE/Complex.h   |  442 +++
 .../eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h |  475 +++
 .../eigen3/Eigen/src/Core/arch/SSE/PacketMath.h    |  649 ++++
 .../Eigen/src/Core/products/CoeffBasedProduct.h    |  439 +++
 .../src/Core/products/GeneralBlockPanelKernel.h    | 1341 ++++++++
 .../Eigen/src/Core/products/GeneralMatrixMatrix.h  |  427 +++
 .../Core/products/GeneralMatrixMatrixTriangular.h  |  278 ++
 .../products/GeneralMatrixMatrixTriangular_MKL.h   |  146 +
 .../src/Core/products/GeneralMatrixMatrix_MKL.h    |  118 +
 .../Eigen/src/Core/products/GeneralMatrixVector.h  |  566 ++++
 .../src/Core/products/GeneralMatrixVector_MKL.h    |  131 +
 .../eigen3/Eigen/src/Core/products/Parallelizer.h  |  159 +
 .../src/Core/products/SelfadjointMatrixMatrix.h    |  436 +++
 .../Core/products/SelfadjointMatrixMatrix_MKL.h    |  295 ++
 .../src/Core/products/SelfadjointMatrixVector.h    |  281 ++
 .../Core/products/SelfadjointMatrixVector_MKL.h    |  114 +
 .../Eigen/src/Core/products/SelfadjointProduct.h   |  123 +
 .../src/Core/products/SelfadjointRank2Update.h     |   93 +
 .../src/Core/products/TriangularMatrixMatrix.h     |  427 +++
 .../src/Core/products/TriangularMatrixMatrix_MKL.h |  309 ++
 .../src/Core/products/TriangularMatrixVector.h     |  348 ++
 .../src/Core/products/TriangularMatrixVector_MKL.h |  247 ++
 .../src/Core/products/TriangularSolverMatrix.h     |  329 ++
 .../src/Core/products/TriangularSolverMatrix_MKL.h |  155 +
 .../src/Core/products/TriangularSolverVector.h     |  139 +
 include/eigen3/Eigen/src/Core/util/BlasUtil.h      |  264 ++
 include/eigen3/Eigen/src/Core/util/Constants.h     |  438 +++
 .../Eigen/src/Core/util/DisableStupidWarnings.h    |   40 +
 .../Eigen/src/Core/util/ForwardDeclarations.h      |  299 ++
 include/eigen3/Eigen/src/Core/util/MKL_support.h   |  126 +
 include/eigen3/Eigen/src/Core/util/Macros.h        |  431 +++
 include/eigen3/Eigen/src/Core/util/Memory.h        |  977 ++++++
 include/eigen3/Eigen/src/Core/util/Meta.h          |  243 ++
 include/eigen3/Eigen/src/Core/util/NonMPL2.h       |    3 +
 .../Eigen/src/Core/util/ReenableStupidWarnings.h   |   14 +
 include/eigen3/Eigen/src/Core/util/StaticAssert.h  |  208 ++
 include/eigen3/Eigen/src/Core/util/XprHelper.h     |  469 +++
 include/eigen3/Eigen/src/Eigen2Support/Block.h     |  126 +
 include/eigen3/Eigen/src/Eigen2Support/Cwise.h     |  192 ++
 .../Eigen/src/Eigen2Support/CwiseOperators.h       |  298 ++
 .../Eigen/src/Eigen2Support/Geometry/AlignedBox.h  |  159 +
 .../eigen3/Eigen/src/Eigen2Support/Geometry/All.h  |  115 +
 .../Eigen/src/Eigen2Support/Geometry/AngleAxis.h   |  214 ++
 .../Eigen/src/Eigen2Support/Geometry/Hyperplane.h  |  254 ++
 .../src/Eigen2Support/Geometry/ParametrizedLine.h  |  141 +
 .../Eigen/src/Eigen2Support/Geometry/Quaternion.h  |  495 +++
 .../Eigen/src/Eigen2Support/Geometry/Rotation2D.h  |  145 +
 .../src/Eigen2Support/Geometry/RotationBase.h      |  123 +
 .../Eigen/src/Eigen2Support/Geometry/Scaling.h     |  167 +
 .../Eigen/src/Eigen2Support/Geometry/Transform.h   |  786 +++++
 .../Eigen/src/Eigen2Support/Geometry/Translation.h |  184 ++
 include/eigen3/Eigen/src/Eigen2Support/LU.h        |  120 +
 include/eigen3/Eigen/src/Eigen2Support/Lazy.h      |   71 +
 .../eigen3/Eigen/src/Eigen2Support/LeastSquares.h  |  169 +
 include/eigen3/Eigen/src/Eigen2Support/Macros.h    |   20 +
 .../eigen3/Eigen/src/Eigen2Support/MathFunctions.h |   57 +
 include/eigen3/Eigen/src/Eigen2Support/Memory.h    |   45 +
 include/eigen3/Eigen/src/Eigen2Support/Meta.h      |   75 +
 include/eigen3/Eigen/src/Eigen2Support/Minor.h     |  117 +
 include/eigen3/Eigen/src/Eigen2Support/QR.h        |   67 +
 include/eigen3/Eigen/src/Eigen2Support/SVD.h       |  637 ++++
 .../Eigen/src/Eigen2Support/TriangularSolver.h     |   42 +
 .../eigen3/Eigen/src/Eigen2Support/VectorBlock.h   |   94 +
 .../Eigen/src/Eigenvalues/ComplexEigenSolver.h     |  333 ++
 .../eigen3/Eigen/src/Eigenvalues/ComplexSchur.h    |  456 +++
 .../Eigen/src/Eigenvalues/ComplexSchur_MKL.h       |   94 +
 include/eigen3/Eigen/src/Eigenvalues/EigenSolver.h |  598 ++++
 .../Eigen/src/Eigenvalues/GeneralizedEigenSolver.h |  341 ++
 .../GeneralizedSelfAdjointEigenSolver.h            |  227 ++
 .../src/Eigenvalues/HessenbergDecomposition.h      |  373 +++
 .../Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h  |  160 +
 include/eigen3/Eigen/src/Eigenvalues/RealQZ.h      |  624 ++++
 include/eigen3/Eigen/src/Eigenvalues/RealSchur.h   |  529 +++
 .../eigen3/Eigen/src/Eigenvalues/RealSchur_MKL.h   |   83 +
 .../Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h |  812 +++++
 .../src/Eigenvalues/SelfAdjointEigenSolver_MKL.h   |   92 +
 .../Eigen/src/Eigenvalues/Tridiagonalization.h     |  557 ++++
 include/eigen3/Eigen/src/Geometry/AlignedBox.h     |  375 +++
 include/eigen3/Eigen/src/Geometry/AngleAxis.h      |  233 ++
 include/eigen3/Eigen/src/Geometry/EulerAngles.h    |  104 +
 include/eigen3/Eigen/src/Geometry/Homogeneous.h    |  307 ++
 include/eigen3/Eigen/src/Geometry/Hyperplane.h     |  280 ++
 include/eigen3/Eigen/src/Geometry/OrthoMethods.h   |  218 ++
 .../eigen3/Eigen/src/Geometry/ParametrizedLine.h   |  195 ++
 include/eigen3/Eigen/src/Geometry/Quaternion.h     |  778 +++++
 include/eigen3/Eigen/src/Geometry/Rotation2D.h     |  160 +
 include/eigen3/Eigen/src/Geometry/RotationBase.h   |  206 ++
 include/eigen3/Eigen/src/Geometry/Scaling.h        |  166 +
 include/eigen3/Eigen/src/Geometry/Transform.h      | 1455 +++++++++
 include/eigen3/Eigen/src/Geometry/Translation.h    |  206 ++
 include/eigen3/Eigen/src/Geometry/Umeyama.h        |  177 ++
 .../eigen3/Eigen/src/Geometry/arch/Geometry_SSE.h  |  115 +
 .../Eigen/src/Householder/BlockHouseholder.h       |   68 +
 include/eigen3/Eigen/src/Householder/Householder.h |  171 +
 .../Eigen/src/Householder/HouseholderSequence.h    |  441 +++
 .../IterativeLinearSolvers/BasicPreconditioners.h  |  149 +
 .../Eigen/src/IterativeLinearSolvers/BiCGSTAB.h    |  275 ++
 .../src/IterativeLinearSolvers/ConjugateGradient.h |  265 ++
 .../src/IterativeLinearSolvers/IncompleteLUT.h     |  467 +++
 .../IterativeLinearSolvers/IterativeSolverBase.h   |  254 ++
 include/eigen3/Eigen/src/Jacobi/Jacobi.h           |  433 +++
 include/eigen3/Eigen/src/LU/Determinant.h          |  101 +
 include/eigen3/Eigen/src/LU/FullPivLU.h            |  743 +++++
 include/eigen3/Eigen/src/LU/Inverse.h              |  400 +++
 include/eigen3/Eigen/src/LU/PartialPivLU.h         |  501 +++
 include/eigen3/Eigen/src/LU/PartialPivLU_MKL.h     |   85 +
 include/eigen3/Eigen/src/LU/arch/Inverse_SSE.h     |  329 ++
 .../eigen3/Eigen/src/MetisSupport/MetisSupport.h   |  137 +
 include/eigen3/Eigen/src/OrderingMethods/Amd.h     |  435 +++
 .../Eigen/src/OrderingMethods/Eigen_Colamd.h       | 1850 +++++++++++
 .../eigen3/Eigen/src/OrderingMethods/Ordering.h    |  154 +
 .../eigen3/Eigen/src/PaStiXSupport/PaStiXSupport.h |  721 +++++
 .../Eigen/src/PardisoSupport/PardisoSupport.h      |  592 ++++
 include/eigen3/Eigen/src/QR/ColPivHouseholderQR.h  |  580 ++++
 .../eigen3/Eigen/src/QR/ColPivHouseholderQR_MKL.h  |   99 +
 include/eigen3/Eigen/src/QR/FullPivHouseholderQR.h |  614 ++++
 include/eigen3/Eigen/src/QR/HouseholderQR.h        |  374 +++
 include/eigen3/Eigen/src/QR/HouseholderQR_MKL.h    |   69 +
 .../Eigen/src/SPQRSupport/SuiteSparseQRSupport.h   |  314 ++
 include/eigen3/Eigen/src/SVD/JacobiSVD.h           |  969 ++++++
 include/eigen3/Eigen/src/SVD/JacobiSVD_MKL.h       |   92 +
 .../eigen3/Eigen/src/SVD/UpperBidiagonalization.h  |  148 +
 .../Eigen/src/SparseCholesky/SimplicialCholesky.h  |  671 ++++
 .../src/SparseCholesky/SimplicialCholesky_impl.h   |  199 ++
 include/eigen3/Eigen/src/SparseCore/AmbiVector.h   |  373 +++
 .../Eigen/src/SparseCore/CompressedStorage.h       |  233 ++
 .../SparseCore/ConservativeSparseSparseProduct.h   |  245 ++
 .../Eigen/src/SparseCore/MappedSparseMatrix.h      |  181 ++
 include/eigen3/Eigen/src/SparseCore/SparseBlock.h  |  499 +++
 .../eigen3/Eigen/src/SparseCore/SparseColEtree.h   |  206 ++
 .../Eigen/src/SparseCore/SparseCwiseBinaryOp.h     |  325 ++
 .../Eigen/src/SparseCore/SparseCwiseUnaryOp.h      |  163 +
 .../Eigen/src/SparseCore/SparseDenseProduct.h      |  311 ++
 .../Eigen/src/SparseCore/SparseDiagonalProduct.h   |  196 ++
 include/eigen3/Eigen/src/SparseCore/SparseDot.h    |  101 +
 include/eigen3/Eigen/src/SparseCore/SparseFuzzy.h  |   26 +
 include/eigen3/Eigen/src/SparseCore/SparseMatrix.h | 1259 ++++++++
 .../eigen3/Eigen/src/SparseCore/SparseMatrixBase.h |  452 +++
 .../Eigen/src/SparseCore/SparsePermutation.h       |  148 +
 .../eigen3/Eigen/src/SparseCore/SparseProduct.h    |  188 ++
 include/eigen3/Eigen/src/SparseCore/SparseRedux.h  |   45 +
 .../Eigen/src/SparseCore/SparseSelfAdjointView.h   |  507 +++
 .../SparseCore/SparseSparseProductWithPruning.h    |  150 +
 .../eigen3/Eigen/src/SparseCore/SparseTranspose.h  |   63 +
 .../Eigen/src/SparseCore/SparseTriangularView.h    |  179 ++
 include/eigen3/Eigen/src/SparseCore/SparseUtil.h   |  173 +
 include/eigen3/Eigen/src/SparseCore/SparseVector.h |  447 +++
 include/eigen3/Eigen/src/SparseCore/SparseView.h   |   99 +
 .../eigen3/Eigen/src/SparseCore/TriangularSolver.h |  334 ++
 include/eigen3/Eigen/src/SparseLU/SparseLU.h       |  757 +++++
 include/eigen3/Eigen/src/SparseLU/SparseLUImpl.h   |   64 +
 .../eigen3/Eigen/src/SparseLU/SparseLU_Memory.h    |  227 ++
 .../eigen3/Eigen/src/SparseLU/SparseLU_Structs.h   |  111 +
 .../Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h |  298 ++
 include/eigen3/Eigen/src/SparseLU/SparseLU_Utils.h |   80 +
 .../Eigen/src/SparseLU/SparseLU_column_bmod.h      |  180 ++
 .../Eigen/src/SparseLU/SparseLU_column_dfs.h       |  177 ++
 .../Eigen/src/SparseLU/SparseLU_copy_to_ucol.h     |  106 +
 .../Eigen/src/SparseLU/SparseLU_gemm_kernel.h      |  279 ++
 .../Eigen/src/SparseLU/SparseLU_heap_relax_snode.h |  127 +
 .../Eigen/src/SparseLU/SparseLU_kernel_bmod.h      |  130 +
 .../Eigen/src/SparseLU/SparseLU_panel_bmod.h       |  223 ++
 .../eigen3/Eigen/src/SparseLU/SparseLU_panel_dfs.h |  258 ++
 .../eigen3/Eigen/src/SparseLU/SparseLU_pivotL.h    |  134 +
 .../eigen3/Eigen/src/SparseLU/SparseLU_pruneL.h    |  135 +
 .../Eigen/src/SparseLU/SparseLU_relax_snode.h      |   83 +
 include/eigen3/Eigen/src/SparseQR/SparseQR.h       |  714 +++++
 include/eigen3/Eigen/src/StlSupport/StdDeque.h     |  134 +
 include/eigen3/Eigen/src/StlSupport/StdList.h      |  114 +
 include/eigen3/Eigen/src/StlSupport/StdVector.h    |  126 +
 include/eigen3/Eigen/src/StlSupport/details.h      |   84 +
 .../Eigen/src/SuperLUSupport/SuperLUSupport.h      | 1026 ++++++
 .../Eigen/src/UmfPackSupport/UmfPackSupport.h      |  474 +++
 include/eigen3/Eigen/src/misc/Image.h              |   84 +
 include/eigen3/Eigen/src/misc/Kernel.h             |   81 +
 include/eigen3/Eigen/src/misc/Solve.h              |   76 +
 include/eigen3/Eigen/src/misc/SparseSolve.h        |  128 +
 include/eigen3/Eigen/src/misc/blas.h               |  658 ++++
 .../eigen3/Eigen/src/plugins/ArrayCwiseBinaryOps.h |  211 ++
 .../eigen3/Eigen/src/plugins/ArrayCwiseUnaryOps.h  |  203 ++
 include/eigen3/Eigen/src/plugins/BlockMethods.h    |  935 ++++++
 .../Eigen/src/plugins/CommonCwiseBinaryOps.h       |   46 +
 .../eigen3/Eigen/src/plugins/CommonCwiseUnaryOps.h |  172 +
 .../Eigen/src/plugins/MatrixCwiseBinaryOps.h       |  126 +
 .../eigen3/Eigen/src/plugins/MatrixCwiseUnaryOps.h |   67 +
 include/eigen3/signature_of_eigen3_matrix_library  |    1 +
 include/eigen3/unsupported/Eigen/AdolcForward      |  156 +
 include/eigen3/unsupported/Eigen/AlignedVector3    |  190 ++
 include/eigen3/unsupported/Eigen/AutoDiff          |   40 +
 include/eigen3/unsupported/Eigen/BVH               |   95 +
 include/eigen3/unsupported/Eigen/FFT               |  418 +++
 include/eigen3/unsupported/Eigen/IterativeSolvers  |   45 +
 include/eigen3/unsupported/Eigen/KroneckerProduct  |   34 +
 .../eigen3/unsupported/Eigen/LevenbergMarquardt    |   45 +
 include/eigen3/unsupported/Eigen/MPRealSupport     |  203 ++
 include/eigen3/unsupported/Eigen/MatrixFunctions   |  447 +++
 include/eigen3/unsupported/Eigen/MoreVectorization |   24 +
 .../eigen3/unsupported/Eigen/NonLinearOptimization |  134 +
 include/eigen3/unsupported/Eigen/NumericalDiff     |   56 +
 include/eigen3/unsupported/Eigen/OpenGLSupport     |  322 ++
 include/eigen3/unsupported/Eigen/Polynomials       |  138 +
 include/eigen3/unsupported/Eigen/Skyline           |   39 +
 include/eigen3/unsupported/Eigen/SparseExtra       |   56 +
 include/eigen3/unsupported/Eigen/Splines           |   31 +
 .../Eigen/src/AutoDiff/AutoDiffJacobian.h          |   83 +
 .../Eigen/src/AutoDiff/AutoDiffScalar.h            |  642 ++++
 .../Eigen/src/AutoDiff/AutoDiffVector.h            |  220 ++
 .../unsupported/Eigen/src/BVH/BVAlgorithms.h       |  293 ++
 include/eigen3/unsupported/Eigen/src/BVH/KdBVH.h   |  222 ++
 .../unsupported/Eigen/src/FFT/ei_fftw_impl.h       |  261 ++
 .../unsupported/Eigen/src/FFT/ei_kissfft_impl.h    |  420 +++
 .../src/IterativeSolvers/ConstrainedConjGrad.h     |  189 ++
 .../Eigen/src/IterativeSolvers/DGMRES.h            |  542 ++++
 .../unsupported/Eigen/src/IterativeSolvers/GMRES.h |  383 +++
 .../src/IterativeSolvers/IncompleteCholesky.h      |  278 ++
 .../Eigen/src/IterativeSolvers/IncompleteLU.h      |  113 +
 .../src/IterativeSolvers/IterationController.h     |  154 +
 .../Eigen/src/IterativeSolvers/MINRES.h            |  302 ++
 .../Eigen/src/IterativeSolvers/Scaling.h           |  185 ++
 .../src/KroneckerProduct/KroneckerTensorProduct.h  |  244 ++
 .../Eigen/src/MatrixFunctions/MatrixExponential.h  |  451 +++
 .../Eigen/src/MatrixFunctions/MatrixFunction.h     |  591 ++++
 .../src/MatrixFunctions/MatrixFunctionAtomic.h     |  131 +
 .../Eigen/src/MatrixFunctions/MatrixLogarithm.h    |  486 +++
 .../Eigen/src/MatrixFunctions/MatrixPower.h        |  508 +++
 .../Eigen/src/MatrixFunctions/MatrixSquareRoot.h   |  466 +++
 .../Eigen/src/MatrixFunctions/StemFunction.h       |  105 +
 .../Eigen/src/MoreVectorization/MathFunctions.h    |   95 +
 .../NonLinearOptimization/HybridNonLinearSolver.h  |  601 ++++
 .../src/NonLinearOptimization/LevenbergMarquardt.h |  650 ++++
 .../Eigen/src/NonLinearOptimization/chkder.h       |   66 +
 .../Eigen/src/NonLinearOptimization/covar.h        |   70 +
 .../Eigen/src/NonLinearOptimization/dogleg.h       |  107 +
 .../Eigen/src/NonLinearOptimization/fdjac1.h       |   79 +
 .../Eigen/src/NonLinearOptimization/lmpar.h        |  298 ++
 .../Eigen/src/NonLinearOptimization/qrsolv.h       |   91 +
 .../Eigen/src/NonLinearOptimization/r1mpyq.h       |   30 +
 .../Eigen/src/NonLinearOptimization/r1updt.h       |   99 +
 .../Eigen/src/NonLinearOptimization/rwupdt.h       |   49 +
 .../Eigen/src/NumericalDiff/NumericalDiff.h        |  130 +
 .../unsupported/Eigen/src/Polynomials/Companion.h  |  276 ++
 .../Eigen/src/Polynomials/PolynomialSolver.h       |  389 +++
 .../Eigen/src/Polynomials/PolynomialUtils.h        |  143 +
 .../Eigen/src/Skyline/SkylineInplaceLU.h           |  352 ++
 .../unsupported/Eigen/src/Skyline/SkylineMatrix.h  |  862 +++++
 .../Eigen/src/Skyline/SkylineMatrixBase.h          |  212 ++
 .../unsupported/Eigen/src/Skyline/SkylineProduct.h |  295 ++
 .../unsupported/Eigen/src/Skyline/SkylineStorage.h |  259 ++
 .../unsupported/Eigen/src/Skyline/SkylineUtil.h    |   89 +
 .../src/SparseExtra/BlockOfDynamicSparseMatrix.h   |  122 +
 .../Eigen/src/SparseExtra/DynamicSparseMatrix.h    |  357 +++
 .../unsupported/Eigen/src/SparseExtra/MarketIO.h   |  273 ++
 .../Eigen/src/SparseExtra/MatrixMarketIterator.h   |  232 ++
 .../Eigen/src/SparseExtra/RandomSetter.h           |  327 ++
 .../eigen3/unsupported/Eigen/src/Splines/Spline.h  |  474 +++
 .../unsupported/Eigen/src/Splines/SplineFitting.h  |  156 +
 .../unsupported/Eigen/src/Splines/SplineFwd.h      |   86 +
 include/ezETAProgressBar.hpp                       |  204 ++
 include/fastapprox.h                               | 1608 ++++++++++
 include/format.h                                   | 2762 ++++++++++++++++
 include/kseq.h                                     |  235 ++
 include/merge_files.hpp                            |   30 +
 include/obsolete/IterativeOptimizer.hpp            | 1198 +++++++
 include/obsolete/KFITF.hpp                         |  158 +
 include/obsolete/LinearSystemBuilder.hpp           |  170 +
 include/obsolete/PoissonSolver.hpp                 |  320 ++
 include/obsolete/affymetrix_utils.hpp              |   62 +
 include/obsolete/graph_utils.hpp                   |   82 +
 include/obsolete/linq.h                            |  473 +++
 include/obsolete/set_multicover_solver.hpp         |  492 +++
 include/obsolete/transcript_segmenter.hpp          | 1132 +++++++
 include/posix.h                                    |  344 ++
 include/readerwriterqueue.h                        |  630 ++++
 include/safe_btree.h                               |  395 +++
 include/safe_btree_map.h                           |   89 +
 include/safe_btree_set.h                           |   88 +
 include/tensemble/BaseForest.h                     |  513 +++
 include/tensemble/BaseGBM.h                        |  426 +++
 include/tensemble/ClassificationCriterion.h        |  201 ++
 include/tensemble/Criterion.h                      |  163 +
 include/tensemble/Estimator.h                      |   79 +
 include/tensemble/EvaluateMetric.h                 |   83 +
 include/tensemble/FeatureData.h                    |  227 ++
 include/tensemble/GBMClassifier.h                  |  400 +++
 include/tensemble/GBMRegressor.h                   |  242 ++
 include/tensemble/LossFunction.h                   |  350 ++
 include/tensemble/RandomForestClassifier.h         |  269 ++
 include/tensemble/RandomForestRegressor.h          |  279 ++
 include/tensemble/RandomGenerator.h                |   33 +
 include/tensemble/ReadData.h                       |  163 +
 include/tensemble/Tree.h                           |  731 +++++
 include/tensemble/Tree.h.backup.h                  |  391 +++
 include/tensemble/TreeClassifier.h                 |   74 +
 include/tensemble/TreeNode.h                       |   41 +
 include/tensemble/TreeRegressor.h                  |   66 +
 include/tensemble/TypeDef.h                        |   39 +
 include/tensemble/cmdline.h                        |  160 +
 sample_data.tgz                                    |  Bin 0 -> 427095 bytes
 scripts/AddHeaders.sh                              |   52 +
 scripts/ComputeMutationRate.py                     |   70 +
 scripts/MutateReference.py                         |   83 +
 scripts/cpld.bash                                  |   35 +
 scripts/make-release.sh                            |   78 +
 scripts/push-binary.sh                             |   42 +
 src/AlignmentModel.cpp                             |  621 ++++
 src/BiasCorrectionDriver.cpp                       |   52 +
 src/BuildSalmonIndex.cpp                           |  279 ++
 src/CMakeLists.txt                                 |  266 ++
 src/CollapsedEMOptimizer.cpp                       |  449 +++
 src/CollapsedGibbsSampler.cpp                      |  345 ++
 src/ComputeBiasFeatures.cpp                        |  225 ++
 src/ErrorModel.cpp                                 |  234 ++
 src/FASTAParser.cpp                                |   54 +
 src/FragmentLengthDistribution.cpp                 |  208 ++
 src/FragmentList.cpp                               |  231 ++
 src/FragmentStartPositionDistribution.cpp          |  141 +
 src/GenomicFeature.cpp                             |  104 +
 src/JellyfishMerCounter.cpp                        |  379 +++
 src/LibraryFormat.cpp                              |   99 +
 src/LookUpTableUtils.cpp                           |  255 ++
 src/MultithreadedBAMParser.cpp                     |   95 +
 src/PCA.cpp                                        |  306 ++
 src/PCAUtils.cpp                                   |  133 +
 src/PerformBiasCorrection.cpp                      |  349 ++
 src/PerformBiasCorrection_old.cpp                  |  420 +++
 src/QSufSort.c                                     |  402 +++
 src/SailfishUtils.cpp                              |  699 ++++
 src/Salmon.cpp                                     |  237 ++
 src/SalmonQuantify.cpp                             | 3209 +++++++++++++++++++
 src/SalmonQuantify.cpp.bak                         | 1314 ++++++++
 src/SalmonQuantifyAlignments.cpp                   | 1264 ++++++++
 src/SalmonStringUtils.cpp                          |   19 +
 src/SalmonUtils.cpp                                | 1236 +++++++
 src/SequenceBiasModel.cpp                          |  311 ++
 src/StadenUtils.cpp                                |   17 +
 src/TestUtils.cpp                                  |   72 +
 src/TranscriptGroup.cpp                            |   55 +
 src/VersionChecker.cpp                             |  186 ++
 src/bwt_gen.c                                      | 1632 ++++++++++
 src/bwtindex.c                                     |  295 ++
 src/cokus.cpp                                      |  196 ++
 src/format.cc                                      | 1213 +++++++
 src/is.c                                           |  223 ++
 src/merge_files.cc                                 |  149 +
 src/posix.cc                                       |  252 ++
 src/tags                                           |  429 +++
 569 files changed, 156562 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b8bd026
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,28 @@
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..43253af
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,40 @@
+#whitelist
+branches:
+    only:
+        - master
+        - develop
+
+language: cpp
+
+compiler: gcc
+
+before_install:
+    - echo "yes" | sudo apt-add-repository ppa:boost-latest
+    - echo "yes" | sudo apt-add-repository ppa:mapnik/boost-backports-1-54
+    - echo "yes" | sudo apt-add-repository ppa:h-rayflood/gcc-upper
+    - sudo apt-get clean -qq
+    - sudo apt-get update -qq
+    - sudo apt-get install -qq gcc-4.9 g++-4.9 
+    - export CC="gcc-4.9"
+    - export CXX="gcc-4.9"
+    - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 50
+    - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 50
+
+install:
+    - export CXX="g++-4.9"
+    - export CC="gcc-4.9"
+
+script: 
+    - mkdir build 
+    - cd build 
+    - cmake -DFETCH_BOOST=TRUE -DNO_RTM=TRUE .. 
+    - travis_wait 45 make 
+    - travis_wait make install
+    - travis_wait make test VERBOSE=1
+
+after_success: 
+    - cd $TRAVIS_BUILD_DIR
+    - ./scripts/push-binary.sh
+      
+after_failure:
+    - cat $TRAVIS_BUILD_DIR/build/Testing/Temporary/LastTest.log
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100755
index 0000000..c95f755
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,573 @@
+cmake_minimum_required (VERSION 2.8)
+
+enable_testing()
+
+project (Salmon)
+
+set(CPACK_PACKAGE_VERSION "0.4.2")
+set(CPACK_PACKAGE_VERSION_MAJOR "0")
+set(CPACK_PACKAGE_VERSION_MINOR 4")
+set(CPACK_PACKAGE_VERSION_PATCH 2")
+set(CPACK_GENERATOR "TGZ")
+set(CPACK_SOURCE_GENERATOR "TGZ")
+set(CPACK_PACKAGE_VENDOR Stony Brook University")
+set(CPACK_PACKAGE_DESCRIPTION_SUMMARY Salmon - Wicked-fast RNA-seq isoform quantification using lightweight alignments")
+set(CPACK_PACKAGE_NAME 
+  "${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
+set(CPACK_SOURCE_PACKAGE_FILE_NAME
+  "${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}-Source")
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
+
+set (WARNING_IGNORE_FLAGS "-Wno-deprecated-register") 
+set (BOOST_CXX_FLAGS "-Wno-deprecated-register -std=c++11")
+## Prefer static to dynamic libraries
+SET(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
+
+## Set the standard required compile flags
+# Nov 18th --- removed -DHAVE_CONFIG_H 
+set (CMAKE_CXX_FLAGS "-pthread -funroll-loops -fPIC -fomit-frame-pointer -Ofast -DHAVE_ANSI_TERM -DHAVE_SSTREAM -Wall -std=c++11 -Wreturn-type -Werror=return-type")
+
+##
+# OSX is strange (some might say, stupid in this regard).  Deal with it's quirkines here.
+##
+if (APPLE)
+    # To allow ourselves to build a dynamic library, we have to tell the compiler
+    # that, yes, the symbols will be around at runtime.
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -undefined dynamic_lookup")
+    set (LIBSALMON_LINKER_FLAGS "-all_load")
+    # In order to "think different", we also have to use non-standard suffixes
+    # for our shared libraries
+    set(SHARED_LIB_EXTENSION "dylib")
+else()
+    # We're in sane linux world
+   set (SHARED_LIB_EXTENSION "so")
+   set (LIBSALMON_LINKER_FLAGS "")
+endif()
+
+set( BOOST_EXTRA_FLAGS "--layout=tagged" )
+## this get's set differently below if we 
+## are on clang & apple
+set (NON_APPLECLANG_LIBS gomp rt)
+set (PTHREAD_LIB)
+
+##
+# Compiler-specific C++11 activation.
+# http://stackoverflow.com/questions/10984442/how-to-detect-c11-support-of-a-compiler-with-cmake
+##
+##
+# First take care of what to do if we have gcc
+##
+if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
+    execute_process(
+        COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+    # If we're on OSX
+    if (APPLE AND NOT (GCC_VERSION VERSION_GREATER 4.8.2 OR GCC_VERSION VERSION_EQUAL 4.8.2))
+        message(FATAL_ERROR "When building under OSX, ${PROJECT_NAME} requires "
+                            "either clang or g++ >= 4.8.2")
+    elseif (NOT (GCC_VERSION VERSION_GREATER 4.7 OR GCC_VERSION VERSION_EQUAL 4.7))
+        message(FATAL_ERROR "${PROJECT_NAME} requires g++ 4.7 or greater.")
+    endif ()
+
+    set (GCC TRUE)
+    
+    # Put complete static linking on hold for the time-being
+    # If we're not on OSX, make an attempt to compile everything statically
+    #if (NOT APPLE)
+    #set (CMAKE_CXX_FLAGS "-static ${CMAKE_CXX_FLAGS}")
+    #set (CMAKE_EXE_LINK_FLAGS "-static")
+    set (PTHREAD_LIB "pthread")
+    #endif()
+
+    # If we're on Linux (i.e. not OSX) and we're using 
+    # gcc, then set the -static-libstdc++ flag
+    if (NOT APPLE) 
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++")
+    endif()
+
+    set (WARNING_IGNORE_FLAGS "${WARNING_IGNORE_FLAGS} -Wno-unused-local-typedefs")
+    set (BOOST_TOOLSET "gcc")
+    set (BOOST_CONFIGURE_TOOLSET "--with-toolset=gcc")
+	set (BCXX_FLAGS "-std=c++11")
+    set (BOOST_EXTRA_FLAGS toolset=gcc cxxflags=${BCXX_FLAGS})
+# Tentatively, we support clang now
+elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+    set(CLANG TRUE)
+    # If we have libc++, then try and use it 
+    include(CheckCXXCompilerFlag)
+    check_cxx_compiler_flag(-stdlib=libc++ HAVE_LIBCPP)
+    if (HAVE_LIBCPP)
+        message ("It appears that you're compiling with clang and that libc++ is available, so I'll use that")
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
+	    set (BOOST_TOOLSET "clang")
+        set (BOOST_CONFIGURE_TOOLSET "--with-toolset=clang")
+	    set (BCXX_FLAGS "-stdlib=libc++ -DBOOST_HAS_INT128")
+	    set (BOOST_EXTRA_FLAGS toolset=clang cxxflags=${BCXX_FLAGS} linkflags="-stdlib=libc++")
+        set (JELLYFISH_CXX_FLAGS "-stdlib=libc++")
+    # Otherwise, use libstdc++ (and make it static) 
+    else()
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++")
+    endif()
+    # There's currently a bug with clang-3.4 & Boost 1.55 -- this hack fixes it
+    # but we should do something better (does this break things if CPU doesn't
+    # have 128-bit support)?
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBOOST_HAS_INT128") 
+
+    if (APPLE)
+        set (NON_APPLECLANG_LIBS "")
+    else()
+        set (PTHREAD_LIB "pthread")
+    endif()
+else ()
+    message(FATAL_ERROR "Your C++ compiler does not support C++11.")
+endif ()
+
+## TODO: Figure out how to detect this automatically
+# If the "assembler" is too old, tell TBB not to compile 
+# with -mrtm
+if (NO_RTM)
+    set (TBB_CXXFLAGS "-mno-rtm")
+endif()
+
+include(ExternalProject)
+
+##
+#  Update the CXX flags according to the system and compiler
+##
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_IGNORE_FLAGS}")
+
+if (CMAKE_BUILD_TYPE MATCHES Debug)
+    message ("Making Debug build")
+    set (CMAKE_CXX_FLAGS_DEBUG "-g ${CMAKE_CXX_FLAGS}")
+elseif (CMAKE_BUILD_TYPE MATCHES Release)
+    message ("Making Release build")
+    set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS}")
+else ()
+    message ("Making Default build type")
+endif ()
+
+##
+# Record this top-level path
+##
+set (GAT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+
+# Have CMake tell us what it's doing
+set (CMAKE_VERBOSE_MAKEFILE true)
+
+##
+# Super-secret override
+##
+if ( DEFINED CUSTOM_BOOST_PATH )
+	set (CMAKE_INCLUDE_PATH ${CUSTOM_BOOST_PATH} ${CMAKE_INCLUDE_PATH})
+    set (CMAKE_LIBRARY_PATH ${CUSTOM_BOOST_PATH}/lib ${CMAKE_LIBRARY_PATH})
+endif ( DEFINED CUSTOM_BOOST_PATH )
+
+##
+# We want static, multithreaded boost libraries
+##
+set (Boost_USE_STATIC_LIBS ON)
+set (Boost_USE_MULTITHREADED ON)
+#set (Boost_USE_STATIC_RUNTIME OFF)
+
+find_package (ZLIB)
+if (NOT ZLIB_FOUND)
+	message (FATAL_ERROR "zlib must be installed before configuration & building can proceed")
+endif()
+
+find_package (LibLZMA)
+if (NOT LIBLZMA_FOUND)
+    message ("Will attempt to fetch and build liblzma")
+    message ("=======================================")
+ExternalProject_Add(liblzma
+    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+    URL http://tukaani.org/xz/xz-5.2.0.tar.gz 
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/xz-5.2.0
+    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+    BUILD_IN_SOURCE TRUE
+    CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/external/xz-5.2.0/configure --prefix=<INSTALL_DIR> CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER}
+    BUILD_COMMAND make
+    INSTALL_COMMAND make install
+)
+# Tell cmake that the external project generated a library so we can
+# add dependencies here instead of later
+set (LIBLZMA_LIBRARIES ${GAT_SOURCE_DIR}/external/install/lib/liblzma.a)
+set (LIBSTADEN_LDFLAGS "-L${GAT_SOURCE_DIR}/external/install/lib")
+set (LIBSTADEN_CFLAGS "-I${GAT_SOURCE_DIR}/external/install/include")
+else()
+    message("Found liblzma library: ${LIBLZMA_LIBRARIES}")
+    message("===========================================")
+endif()
+
+find_package (BZip2)
+if (NOT BZIP2_FOUND)
+    message ("Will attempt to fetch and build libbz2")
+    message ("=======================================")
+ExternalProject_Add(libbz2
+    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+    URL http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/bzip2-1.0.6
+    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+    BUILD_IN_SOURCE TRUE
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND make CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER}
+    INSTALL_COMMAND make install PREFIX=<INSTALL_DIR>
+)
+# Tell cmake that the external project generated a library so we can
+# add dependencies here instead of later
+set (BZIP2_LIBRARIES ${GAT_SOURCE_DIR}/external/install/lib/libbz2.a)
+set (LIBSTADEN_LDFLAGS "-L${GAT_SOURCE_DIR}/external/install/lib -I${GAT_SOURCE_DIR}/external/install/include")
+set (LIBSTADEN_CFLAGS "-I${GAT_SOURCE_DIR}/external/install/include")
+else()
+    message("Found libbz2 library: ${BZIP2_LIBRARIES}")
+    message("===========================================")
+endif()
+
+##
+# Set the latest version and look for what we need
+##
+set(Boost_ADDITIONAL_VERSIONS "1.53" "1.53.0" "1.54" "1.55" "1.56" "1.57.0" "1.58")
+find_package(Boost 1.53.0 COMPONENTS iostreams filesystem system thread timer chrono program_options serialization)
+message("BOOST_INCLUDEDIR = ${BOOST_INCLUDEDIR}")
+message("BOOST_LIBRARYDIR = ${BOOST_LIBRARYDIR}")
+message("Boost_FOUND = ${Boost_FOUND}")
+include(ExternalProject)
+
+##
+#  If we had to fetch Boost, the reconfigure step will re-run cmake.  The second configuration
+#  pass is executed with the BOOST_RECONFIGURE flag set. This should allow our newly
+#  installed Boost to be found by CMake.
+##
+if (BOOST_RECONFIGURE)
+    message("Executing Boost Reconfiguration")
+    unset(Boost_FOUND CACHE)
+    unset(Boost_INCLUDE_DIR CACHE)
+    unset(Boost_INCLUDE_DIRS CACHE)
+    unset(Boost_LIBRARY_DIRS CACHE) 
+    unset(Boost_LIBRARIES CACHE)
+    unset(BOOST_ROOT CACHE)
+    unset(CMAKE_PREFIX_PATH CACHE)
+    
+    set(BOOST_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/external/install)
+    set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/external/install)
+    set(Boost_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/include)
+    set(Boost_LIBRARY_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/lib)
+    find_package(Boost 1.53.0 COMPONENTS filesystem system thread timer chrono program_options REQUIRED)
+    set(FETCH_BOOST FALSE)
+endif()
+
+
+
+##
+# Either inform the user of how to obtain Boost, or, if they passed in the FETCH_BOOST
+# option, go and grab it for them.
+##
+if ((NOT Boost_FOUND) AND (NOT FETCH_BOOST))
+	message(FATAL_ERROR 
+        "Salmon cannot be compiled without Boost.\n"
+        "It is recommended to visit http://www.boost.org/ and install Boost according to those instructions.\n"
+        "This build system can also download and install a local version of boost for you (this takes a lot of time).\n"
+        "To fetch and build boost locally, call cmake with -DFETCH_BOOST=TRUE"
+    )
+elseif(FETCH_BOOST)
+    ## Let the rest of the build process know we're going to be fetching boost
+    set (BOOST_LIB_SUBSET --with-atomic --with-chrono --with-container --with-date_time --with-exception 
+                          --with-filesystem --with-graph --with-graph_parallel --with-math 
+                          --with-program_options --with-system --with-thread
+                          --with-timer)
+    set (BOOST_WILL_RECONFIGURE TRUE)
+    set (FETCH_BOOST FALSE)
+    message("Build system will fetch and build Boost")
+    message("==================================================================")
+    ExternalProject_Add(libboost
+        DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+        DOWNLOAD_COMMAND curl -k -L http://fossies.org/linux/misc/boost_1_57_0.tar.gz -o boost_1_57_0.tar.gz && 
+                tar xzf boost_1_57_0.tar.gz
+        #URL http://downloads.sourceforge.net/project/boost/boost/1.57.0/boost_1_57_0.tar.gz
+        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_57_0
+        INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+        #PATCH_COMMAND patch -p2 < ${CMAKE_CURRENT_SOURCE_DIR}/external/boost156.patch
+	CONFIGURE_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_57_0/bootstrap.sh ${BOOST_CONFIGURE_TOOLSET} ${BOOST_BUILD_LIBS} --prefix=<INSTALL_DIR>
+	BUILD_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_57_0/b2 -d0 -j2 ${BOOST_LIB_SUBSET} toolset=${BOOST_TOOLSET} ${BOOST_EXTRA_FLAGS} cxxflags=${BOOST_CXX_FLAGS} install
+        BUILD_IN_SOURCE 1
+        INSTALL_COMMAND ""
+    )
+
+    ##
+    # After we've installed boost, 
+    ##
+    SET( RECONFIG_FLAGS ${RECONFIG_FLAGS} -DBOOST_WILL_RECONFIGURE=FALSE -DBOOST_RECONFIGURE=TRUE -DFETCH_BOOST=FALSE)
+    ExternalProject_Add_Step(libboost reconfigure
+        COMMAND ${CMAKE_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR} ${RECONFIG_FLAGS}
+        DEPENDEES install
+    )
+endif()
+
+##
+# If we're fetching boost and we need to have dummy paths for these variables
+# so that CMake won't complain
+##
+if (BOOST_WILL_RECONFIGURE)
+    message("Setting Temporary Boost paths")
+    set(Boost_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install/include)
+    set(Boost_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/include)
+    set(Boost_LIBRARY_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/lib)        
+    set(Boost_FOUND TRUE)
+endif()
+
+
+message("BOOST INCLUDE DIR = ${Boost_INCLUDE_DIR}")
+message("BOOST INCLUDE DIRS = ${Boost_INCLUDE_DIRS}")
+message("BOOST LIB DIR = ${Boost_LIBRARY_DIRS}")
+message("BOOST LIBRAREIS = ${Boost_LIBRARIES}")
+	
+set(EXTERNAL_LIBRARY_PATH $CMAKE_CURRENT_SOURCE_DIR/lib)
+
+message("Build system will fetch and build the Cereal serialization library")
+message("==================================================================")
+include(ExternalProject)
+ExternalProject_Add(libcereal
+    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+    URL http://www.cs.cmu.edu/~robp/files/cereal-v1.0.0.tgz
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/cereal-1.0.0
+    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+    UPDATE_COMMAND sh -c "mkdir -p <SOURCE_DIR>/build"
+    BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/cereal-1.0.0/build
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    INSTALL_COMMAND sh -c "mkdir -p <INSTALL_DIR>/include && cp -r <SOURCE_DIR>/include/cereal <INSTALL_DIR>/include"
+)
+
+message("Build system will fetch and build BWA (for Salmon)")
+message("==================================================================")
+include(ExternalProject)
+ExternalProject_Add(libbwa
+    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+    DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/bwa/archive/0.7.12.3.tar.gz -o bwa-master.tar.gz &&
+                     mkdir -p bwa-master && 
+                     tar -xzvf bwa-master.tar.gz --strip-components=1 -C bwa-master 
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/bwa-master
+    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND sh -c "make CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER}"
+    INSTALL_COMMAND sh -c "mkdir -p <INSTALL_DIR>/lib && mkdir -p <INSTALL_DIR>/include/bwa && cp libbwa.a <INSTALL_DIR>/lib && cp *.h <INSTALL_DIR>/include/bwa && cp is.c bwtindex.c bwt_gen.c QSufSort.c ${CMAKE_CURRENT_SOURCE_DIR}/src/"
+    BUILD_IN_SOURCE TRUE
+)
+
+message("Build system will fetch and build Jellyfish")
+message("==================================================================")
+ExternalProject_Add(libjellyfish
+    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+    URL ftp://ftp.genome.umd.edu/pub/jellyfish/jellyfish-2.1.3.tar.gz
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/jellyfish-2.1.3
+    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+    CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/external/jellyfish-2.1.3/configure --prefix=<INSTALL_DIR> CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=${JELLYFISH_CXX_FLAGS}
+    BUILD_COMMAND ${MAKE} CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=${JELLYFISH_CXX_FLAGS}
+    BUILD_IN_SOURCE 1
+    INSTALL_COMMAND make install && 
+                    cp config.h <INSTALL_DIR>/include/jellyfish-2.1.3/jellyfish/ &&
+                    cp config.h <INSTALL_DIR>/include/
+)
+
+find_package(TBB)
+
+##
+#
+# Fetch and build Intel's Threading Building Blocks library.
+#
+##
+if(NOT TBB_FOUND)
+
+set(TBB_WILL_RECONFIGURE TRUE)
+# Set the appropriate compiler
+if (CLANG)
+    set(TBB_COMPILER "clang")
+else()
+    set(TBB_COMPILER "gcc")
+endif()
+
+message("Build system will fetch and build Intel Threading Building Blocks")
+message("==================================================================")
+# These are useful for the custom install step we'll do later
+set(TBB_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/tbb43_20140724oss)
+set(TBB_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install)
+
+if ("${TBB_COMPILER}" STREQUAL "gcc") 
+    ## Don't know why it's a problem yet, but if we're using
+    ## GCC, get rid of the DO_ITT_NOTIFY flag
+    set(TBB_CXXFLAGS "${TBB_CXXFLAGS} -UDO_ITT_NOTIFY")  
+endif()
+
+ExternalProject_Add(libtbb
+	DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+    URL http://www.threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb43_20140724oss_src.tgz
+    DOWNLOAD_COMMAND curl -k -L  http://www.threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb43_20140724oss_src.tgz -o tbb_20140724oss_src.tgz &&
+                     tar -xzvf tbb_20140724oss_src.tgz
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/tbb43_20140724oss
+    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+    PATCH_COMMAND "${TBB_PATCH_STEP}"
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND make CXXFLAGS=${TBB_CXXFLAGS} lambdas=1 compiler=${TBB_COMPILER} cfg=release tbb_build_prefix=LIBS 
+    INSTALL_COMMAND sh -c "cp ${TBB_SOURCE_DIR}/build/LIBS_release/*.${SHARED_LIB_EXTENSION}* ${TBB_INSTALL_DIR}/lib && cp -r ${TBB_SOURCE_DIR}/include/* ${TBB_INSTALL_DIR}/include"
+    BUILD_IN_SOURCE 1
+)
+
+SET( RECONFIG_FLAGS ${RECONFIG_FLAGS} -DTBB_WILL_RECONFIGURE=FALSE -DTBB_RECONFIGURE=TRUE)
+ExternalProject_Add_Step(libtbb reconfigure
+        COMMAND ${CMAKE_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR} ${RECONFIG_FLAGS}
+        DEPENDEES install
+)
+endif()
+
+##
+# If we're fetching tbb, we need to have dummy paths for these variables
+# so that CMake won't complain
+##
+if(TBB_WILL_RECONFIGURE)
+    set(TBB_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/include)
+    set(TBB_LIBRARY_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/lib)
+    #set(TBB_LIBRARIES tbb tbbmalloc)
+    set(TBB_LIBRARIES ${CMAKE_CURRENT_SOURCE_DIR}/external/install/lib/libtbb.${SHARED_LIB_EXTENSION}
+                      ${CMAKE_CURRENT_SOURCE_DIR}/external/install/lib/libtbbmalloc.${SHARED_LIB_EXTENSION}
+    )
+endif()
+
+##
+#  Similar to the Boost trick above, the libtbb reconfigure should force this code
+#  to be run on the second configuration pass, where it should appropriately set the
+#  TBB_INSTALL_DIR variable.
+##
+if (TBB_RECONFIGURE)
+    unset(TBB_FOUND CACHE)
+    unset(TBB_INSTALL_DIR CACHE)
+    unset(CMAKE_PREFIX_PATH CACHE)
+    set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/external/install)
+    set(TBB_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install)
+    message("TBB_INSTALL_DIR = ${TBB_INSTALL_DIR}")
+    find_package(TBB)
+endif()
+
+
+message("TBB_LIBRARIES = ${TBB_LIBRARIES}")
+
+message("Build system will compile libgff")
+message("==================================================================")
+ExternalProject_Add(libgff
+    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+    URL http://www.cs.cmu.edu/~robp/files/libgff.tgz
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/libgff
+    UPDATE_COMMAND sh -c "mkdir -p <SOURCE_DIR>/build"
+    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+    BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/libgff/build
+    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_CURRENT_SOURCE_DIR}/external/install
+)
+
+message("Build system will compile Staden IOLib")
+message("==================================================================")
+ExternalProject_Add(libstadenio 
+    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+    DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/staden-io_lib/releases/download/v1.13.10/io_lib-1.13.10.tar.gz -o staden-io_lib-v1.13.10.tar.gz &&
+                     mkdir -p staden-io_lib-1.13.10 &&
+                     tar -xzf staden-io_lib-v1.13.10.tar.gz --strip-components=1 -C staden-io_lib-1.13.10 &&
+                     rm -fr staden-io_lib &&
+                     mv -f staden-io_lib-1.13.10 staden-io_lib
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/staden-io_lib
+    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+    CONFIGURE_COMMAND ./configure --enable-shared=no --without-libcurl --prefix=<INSTALL_DIR> LDFLAGS=${LIBSTADEN_LDFLAGS} CFLAGS=${LIBSTADEN_CFLAGS} CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER}
+    BUILD_COMMAND make CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER}
+    BUILD_IN_SOURCE 1
+    INSTALL_COMMAND make install
+)
+
+message("Build system will fetch SPDLOG")
+message("==================================================================")
+ExternalProject_Add(libspdlog
+    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+    DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/spdlog/archive/v1.5.tar.gz -o spdlog-v1.5.tar.gz &&
+                     tar -xzf spdlog-v1.5.tar.gz &&
+                     rm -fr spdlog &&
+                     mv -f  spdlog-1.5 spdlog
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/spdlog
+    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    BUILD_IN_SOURCE 1
+    INSTALL_COMMAND cp -r <SOURCE_DIR>/include/spdlog <INSTALL_DIR>/include
+)
+
+set (FAST_MALLOC_LIB "")
+set (HAVE_FAST_MALLOC FALSE)
+
+# See if we have Jemalloc
+find_package(Jemalloc)
+if (Jemalloc_FOUND)
+    message("Found Jemalloc library --- using this memory allocator")
+    set (FAST_MALLOC_LIB ${JEMALLOC_LIBRARIES})
+    set (HAVE_FAST_MALLOC TRUE)
+endif()
+
+if (NOT HAVE_FAST_MALLOC)
+    # See if we have Tcmalloc
+    find_package(Tcmalloc)
+    if (Tcmalloc_FOUND)
+        message("Fount TCMalloc library --- using this memory allocator")
+        set (TCMALLOC_LIB ${Tcmalloc_LIBRARIES})
+        set (FAST_MALLOC_LIB ${TCMALLOC_LIB})
+        set (HAVE_FAST_MALLOC TRUE)
+    endif()
+endif()
+
+if (NOT HAVE_FAST_MALLOC)
+    message("Build system will fetch and use JEMalloc")
+    message("==================================================================")
+    ExternalProject_Add(libjemalloc
+        DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+        DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/jemalloc/archive/3.6.0.tar.gz -o jemalloc-3.6.0.tar.gz &&
+        tar -xzf jemalloc-3.6.0.tar.gz 
+        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/jemalloc-3.6.0
+        BUILD_IN_SOURCE TRUE
+        INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+        CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} ./autogen.sh --prefix=<INSTALL_DIR>"
+        INSTALL_COMMAND cp -r lib <INSTALL_DIR>/ && cp -r include <INSTALL_DIR>/
+        )
+
+    set (FAST_MALLOC_LIB ${CMAKE_CURRENT_SOURCE_DIR}/external/install/lib/libjemalloc.a)
+    set (HAVE_FAST_MALLOC TRUE)
+endif ()
+
+###
+#
+# Done building external dependencies.
+#
+###
+
+set (CPACK_SOURCE_IGNORE_FILES 
+"/src/PCA.cpp"
+"/src/PCAUtils.cpp"
+"/build/"
+"/scripts/AggregateToGeneLevel.py"
+"/scripts/ExpressionTools.py"
+"/scripts/GenerateExpressionFiles.sh"
+"/scripts/ParseSoftFile.py"
+"/scripts/PlotCorrelation.py"
+"/scripts/junk"
+"/scripts/sfstrace.log"
+"/scripts/SFPipeline.py"
+"/bin/"
+"/lib/"
+"/sample_data/"
+"PublishREADMEToWebsite.sh"
+"/external/"
+"/src/obsolete/"
+"/include/obsolete/"
+"WebsiteHeader.txt"
+"/experimental_configs/"
+".git/")
+
+message("CPACK_SOURCE_IGNORE_FILES = ${CPACK_SOURCE_IGNORE_FILES}")
+
+# Recurse into Salmon source directory
+add_subdirectory ( src )
+
+# build a CPack driven installer package
+include (CPack)
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1e343ea
--- /dev/null
+++ b/README.md
@@ -0,0 +1,27 @@
+[![Build Status](https://travis-ci.org/COMBINE-lab/salmon.svg?branch=master)](https://travis-ci.org/COMBINE-lab/salmon)
+[![Documentation Status](https://readthedocs.org/projects/salmon/badge/?version=latest)](https://readthedocs.org/projects/salmon/?badge=latest)
+
+What is Salmon?
+===============
+
+Salmon is a **wicked**-fast program to produce a highly-accurate, transcript-level quantification estimates from 
+RNA-seq data.  Salmon achieves is accuracy and speed via a number of different innovations, including the 
+use of *lightweight* alignments (accurate but fast-to-compute proxies for traditional read alignments) and 
+massively-parallel stochastic collapsed variational inference.  The result is a versatile tool that fits nicely
+into many differnt pipelines.  For example, you can choose to make use of our *lightweight* alignments by providing Salmon with raw sequencing reads, or, if it is more convenient, you can provide Salmon with regular alignments (e.g. 
+computed with your favorite aligner), and it will use the same **wicked**-fast, state-of-the-art inference algorithm 
+to estimate transcript-level abundances for your experiment.
+
+Give salmon a try!  You can find the latest binary releases [here](https://github.com/COMBINE-lab/salmon/releases).
+
+Documentation
+==============
+
+The documentation for Salmon is available on [ReadTheDocs](http://readthedocs.org), check it out [here](http://salmon.readthedocs.org).
+
+Chat live about Salmon
+======================
+
+You can chat with the Salmon developers and other users via Gitter!
+
+[![Join the chat at https://gitter.im/COMBINE-lab/salmon](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/COMBINE-lab/salmon?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
diff --git a/cmake/Modules/FindJemalloc.cmake b/cmake/Modules/FindJemalloc.cmake
new file mode 100644
index 0000000..6eaf214
--- /dev/null
+++ b/cmake/Modules/FindJemalloc.cmake
@@ -0,0 +1,46 @@
+# From: https://raw.githubusercontent.com/STEllAR-GROUP/hpx/master/cmake/FindJemalloc.cmake
+# Copyright (c)      2014 Thomas Heller
+# Copyright (c) 2007-2012 Hartmut Kaiser
+# Copyright (c) 2010-2011 Matt Anderson
+# Copyright (c) 2011      Bryce Lelbach
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+find_package(PkgConfig)
+pkg_check_modules(PC_JEMALLOC QUIET libjemalloc)
+
+find_path(JEMALLOC_INCLUDE_DIR jemalloc/jemalloc.h
+  HINTS
+    ${JEMALLOC_ROOT} ENV JEMALLOC_ROOT
+    ${PC_JEMALLOC_MINIMAL_INCLUDEDIR}
+    ${PC_JEMALLOC_MINIMAL_INCLUDE_DIRS}
+    ${PC_JEMALLOC_INCLUDEDIR}
+    ${PC_JEMALLOC_INCLUDE_DIRS}
+  PATH_SUFFIXES include)
+
+find_library(JEMALLOC_LIBRARY NAMES jemalloc libjemalloc
+  HINTS
+    ${JEMALLOC_ROOT} ENV JEMALLOC_ROOT
+    ${PC_JEMALLOC_MINIMAL_LIBDIR}
+    ${PC_JEMALLOC_MINIMAL_LIBRARY_DIRS}
+    ${PC_JEMALLOC_LIBDIR}
+    ${PC_JEMALLOC_LIBRARY_DIRS}
+  PATH_SUFFIXES lib lib64)
+
+set(JEMALLOC_LIBRARIES ${JEMALLOC_LIBRARY})
+set(JEMALLOC_INCLUDE_DIRS ${JEMALLOC_INCLUDE_DIR})
+
+find_package_handle_standard_args(Jemalloc DEFAULT_MSG
+  JEMALLOC_LIBRARY JEMALLOC_INCLUDE_DIR)
+
+get_property(_type CACHE JEMALLOC_ROOT PROPERTY TYPE)
+if(_type)
+  set_property(CACHE JEMALLOC_ROOT PROPERTY ADVANCED 1)
+  if("x${_type}" STREQUAL "xUNINITIALIZED")
+    set_property(CACHE JEMALLOC_ROOT PROPERTY TYPE PATH)
+  endif()
+endif()
+
+mark_as_advanced(JEMALLOC_ROOT JEMALLOC_LIBRARY JEMALLOC_INCLUDE_DIR)
+
diff --git a/cmake/Modules/FindTBB.cmake b/cmake/Modules/FindTBB.cmake
new file mode 100644
index 0000000..8695dd1
--- /dev/null
+++ b/cmake/Modules/FindTBB.cmake
@@ -0,0 +1,282 @@
+# Locate Intel Threading Building Blocks include paths and libraries
+# FindTBB.cmake can be found at https://code.google.com/p/findtbb/
+# Written by Hannes Hofmann <hannes.hofmann _at_ informatik.uni-erlangen.de>
+# Improvements by Gino van den Bergen <gino _at_ dtecta.com>,
+#   Florian Uhlig <F.Uhlig _at_ gsi.de>,
+#   Jiri Marsik <jiri.marsik89 _at_ gmail.com>
+
+# The MIT License
+#
+# Copyright (c) 2011 Hannes Hofmann
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+# GvdB: This module uses the environment variable TBB_ARCH_PLATFORM which defines architecture and compiler.
+#   e.g. "ia32/vc8" or "em64t/cc4.1.0_libc2.4_kernel2.6.16.21"
+#   TBB_ARCH_PLATFORM is set by the build script tbbvars[.bat|.sh|.csh], which can be found
+#   in the TBB installation directory (TBB_INSTALL_DIR).
+#
+# GvdB: Mac OS X distribution places libraries directly in lib directory.
+#
+# For backwards compatibility, you may explicitely set the CMake variables TBB_ARCHITECTURE and TBB_COMPILER.
+# TBB_ARCHITECTURE [ ia32 | em64t | itanium ]
+#   which architecture to use
+# TBB_COMPILER e.g. vc9 or cc3.2.3_libc2.3.2_kernel2.4.21 or cc4.0.1_os10.4.9
+#   which compiler to use (detected automatically on Windows)
+
+# This module respects
+# TBB_INSTALL_DIR or $ENV{TBB21_INSTALL_DIR} or $ENV{TBB_INSTALL_DIR}
+
+# This module defines
+# TBB_INCLUDE_DIRS, where to find task_scheduler_init.h, etc.
+# TBB_LIBRARY_DIRS, where to find libtbb, libtbbmalloc
+# TBB_DEBUG_LIBRARY_DIRS, where to find libtbb_debug, libtbbmalloc_debug
+# TBB_INSTALL_DIR, the base TBB install directory
+# TBB_LIBRARIES, the libraries to link against to use TBB.
+# TBB_DEBUG_LIBRARIES, the libraries to link against to use TBB with debug symbols.
+# TBB_FOUND, If false, don't try to use TBB.
+# TBB_INTERFACE_VERSION, as defined in tbb/tbb_stddef.h
+
+
+if (WIN32)
+    # has em64t/vc8 em64t/vc9
+    # has ia32/vc7.1 ia32/vc8 ia32/vc9
+    set(_TBB_DEFAULT_INSTALL_DIR "C:/Program Files/Intel/TBB" "C:/Program Files (x86)/Intel/TBB")
+    set(_TBB_LIB_NAME "tbb")
+    set(_TBB_LIB_MALLOC_NAME "${_TBB_LIB_NAME}malloc")
+    set(_TBB_LIB_DEBUG_NAME "${_TBB_LIB_NAME}_debug")
+    set(_TBB_LIB_MALLOC_DEBUG_NAME "${_TBB_LIB_MALLOC_NAME}_debug")
+    if (MSVC71)
+        set (_TBB_COMPILER "vc7.1")
+    endif(MSVC71)
+    if (MSVC80)
+        set(_TBB_COMPILER "vc8")
+    endif(MSVC80)
+    if (MSVC90)
+        set(_TBB_COMPILER "vc9")
+    endif(MSVC90)
+    if(MSVC10)
+        set(_TBB_COMPILER "vc10")
+    endif(MSVC10)
+    # Todo: add other Windows compilers such as ICL.
+    set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE})
+endif (WIN32)
+
+if (UNIX)
+    if (APPLE)
+        # MAC
+        set(_TBB_DEFAULT_INSTALL_DIR "/Library/Frameworks/Intel_TBB.framework/Versions")
+        # libs: libtbb.dylib, libtbbmalloc.dylib, *_debug
+        set(_TBB_LIB_NAME "tbb")
+        set(_TBB_LIB_MALLOC_NAME "${_TBB_LIB_NAME}malloc")
+        set(_TBB_LIB_DEBUG_NAME "${_TBB_LIB_NAME}_debug")
+        set(_TBB_LIB_MALLOC_DEBUG_NAME "${_TBB_LIB_MALLOC_NAME}_debug")
+        # default flavor on apple: ia32/cc4.0.1_os10.4.9
+        # Jiri: There is no reason to presume there is only one flavor and
+        #       that user's setting of variables should be ignored.
+        if(NOT TBB_COMPILER)
+            set(_TBB_COMPILER "cc4.0.1_os10.4.9")
+        elseif (NOT TBB_COMPILER)
+            set(_TBB_COMPILER ${TBB_COMPILER})
+        endif(NOT TBB_COMPILER)
+        if(NOT TBB_ARCHITECTURE)
+            set(_TBB_ARCHITECTURE "ia32")
+        elseif(NOT TBB_ARCHITECTURE)
+            set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE})
+        endif(NOT TBB_ARCHITECTURE)
+    else (APPLE)
+        # LINUX
+        set(_TBB_DEFAULT_INSTALL_DIR "/opt/intel/tbb" "/usr/local/include" "/usr/include")
+        set(_TBB_LIB_NAME "tbb")
+        set(_TBB_LIB_MALLOC_NAME "${_TBB_LIB_NAME}malloc")
+        set(_TBB_LIB_DEBUG_NAME "${_TBB_LIB_NAME}_debug")
+        set(_TBB_LIB_MALLOC_DEBUG_NAME "${_TBB_LIB_MALLOC_NAME}_debug")
+        # has em64t/cc3.2.3_libc2.3.2_kernel2.4.21 em64t/cc3.3.3_libc2.3.3_kernel2.6.5 em64t/cc3.4.3_libc2.3.4_kernel2.6.9 em64t/cc4.1.0_libc2.4_kernel2.6.16.21
+        # has ia32/*
+        # has itanium/*
+        set(_TBB_COMPILER ${TBB_COMPILER})
+        set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE})
+    endif (APPLE)
+endif (UNIX)
+
+if (CMAKE_SYSTEM MATCHES "SunOS.*")
+# SUN
+# not yet supported
+# has em64t/cc3.4.3_kernel5.10
+# has ia32/*
+endif (CMAKE_SYSTEM MATCHES "SunOS.*")
+
+
+#-- Clear the public variables
+set (TBB_FOUND "NO")
+
+
+#-- Find TBB install dir and set ${_TBB_INSTALL_DIR} and cached ${TBB_INSTALL_DIR}
+# first: use CMake variable TBB_INSTALL_DIR
+if (TBB_INSTALL_DIR)
+    set (_TBB_INSTALL_DIR ${TBB_INSTALL_DIR})
+endif (TBB_INSTALL_DIR)
+# second: use environment variable
+if (NOT _TBB_INSTALL_DIR)
+    if (NOT "$ENV{TBB_INSTALL_DIR}" STREQUAL "")
+        set (_TBB_INSTALL_DIR $ENV{TBB_INSTALL_DIR})
+    endif (NOT "$ENV{TBB_INSTALL_DIR}" STREQUAL "")
+    # Intel recommends setting TBB21_INSTALL_DIR
+    if (NOT "$ENV{TBB21_INSTALL_DIR}" STREQUAL "")
+        set (_TBB_INSTALL_DIR $ENV{TBB21_INSTALL_DIR})
+    endif (NOT "$ENV{TBB21_INSTALL_DIR}" STREQUAL "")
+    if (NOT "$ENV{TBB22_INSTALL_DIR}" STREQUAL "")
+        set (_TBB_INSTALL_DIR $ENV{TBB22_INSTALL_DIR})
+    endif (NOT "$ENV{TBB22_INSTALL_DIR}" STREQUAL "")
+    if (NOT "$ENV{TBB30_INSTALL_DIR}" STREQUAL "")
+        set (_TBB_INSTALL_DIR $ENV{TBB30_INSTALL_DIR})
+    endif (NOT "$ENV{TBB30_INSTALL_DIR}" STREQUAL "")
+endif (NOT _TBB_INSTALL_DIR)
+# third: try to find path automatically
+if (NOT _TBB_INSTALL_DIR)
+    if (_TBB_DEFAULT_INSTALL_DIR)
+        set (_TBB_INSTALL_DIR ${_TBB_DEFAULT_INSTALL_DIR})
+    endif (_TBB_DEFAULT_INSTALL_DIR)
+endif (NOT _TBB_INSTALL_DIR)
+# sanity check
+if (NOT _TBB_INSTALL_DIR)
+    message ("ERROR: Unable to find Intel TBB install directory. ${_TBB_INSTALL_DIR}")
+else (NOT _TBB_INSTALL_DIR)
+# finally: set the cached CMake variable TBB_INSTALL_DIR
+if (NOT TBB_INSTALL_DIR)
+    set (TBB_INSTALL_DIR ${_TBB_INSTALL_DIR} CACHE PATH "Intel TBB install directory")
+    mark_as_advanced(TBB_INSTALL_DIR)
+endif (NOT TBB_INSTALL_DIR)
+
+#-- A macro to rewrite the paths of the library. This is necessary, because
+#   find_library() always found the em64t/vc9 version of the TBB libs
+macro(TBB_CORRECT_LIB_DIR var_name)
+#    if (NOT "${_TBB_ARCHITECTURE}" STREQUAL "em64t")
+        string(REPLACE em64t "${_TBB_ARCHITECTURE}" ${var_name} ${${var_name}})
+#    endif (NOT "${_TBB_ARCHITECTURE}" STREQUAL "em64t")
+    string(REPLACE ia32 "${_TBB_ARCHITECTURE}" ${var_name} ${${var_name}})
+    string(REPLACE vc7.1 "${_TBB_COMPILER}" ${var_name} ${${var_name}})
+    string(REPLACE vc8 "${_TBB_COMPILER}" ${var_name} ${${var_name}})
+    string(REPLACE vc9 "${_TBB_COMPILER}" ${var_name} ${${var_name}})
+    string(REPLACE vc10 "${_TBB_COMPILER}" ${var_name} ${${var_name}})
+endmacro(TBB_CORRECT_LIB_DIR var_content)
+
+
+#-- Look for include directory and set ${TBB_INCLUDE_DIR}
+set (TBB_INC_SEARCH_DIR ${_TBB_INSTALL_DIR}/include)
+# Jiri: tbbvars now sets the CPATH environment variable to the directory
+#       containing the headers.
+find_path(TBB_INCLUDE_DIR
+    tbb/task_scheduler_init.h
+    PATHS ${TBB_INC_SEARCH_DIR} ENV CPATH
+)
+mark_as_advanced(TBB_INCLUDE_DIR)
+
+
+#-- Look for libraries
+# GvdB: $ENV{TBB_ARCH_PLATFORM} is set by the build script tbbvars[.bat|.sh|.csh]
+if (NOT $ENV{TBB_ARCH_PLATFORM} STREQUAL "")
+    set (_TBB_LIBRARY_DIR 
+         ${_TBB_INSTALL_DIR}/lib/$ENV{TBB_ARCH_PLATFORM}
+         ${_TBB_INSTALL_DIR}/$ENV{TBB_ARCH_PLATFORM}/lib
+        )
+endif (NOT $ENV{TBB_ARCH_PLATFORM} STREQUAL "")
+# Jiri: This block isn't mutually exclusive with the previous one
+#       (hence no else), instead I test if the user really specified
+#       the variables in question.
+if ((NOT ${TBB_ARCHITECTURE} STREQUAL "") AND (NOT ${TBB_COMPILER} STREQUAL ""))
+    # HH: deprecated
+    message(STATUS "[Warning] FindTBB.cmake: The use of TBB_ARCHITECTURE and TBB_COMPILER is deprecated and may not be supported in future versions. Please set \$ENV{TBB_ARCH_PLATFORM} (using tbbvars.[bat|csh|sh]).")
+    # Jiri: It doesn't hurt to look in more places, so I store the hints from
+    #       ENV{TBB_ARCH_PLATFORM} and the TBB_ARCHITECTURE and TBB_COMPILER
+    #       variables and search them both.
+    set (_TBB_LIBRARY_DIR "${_TBB_INSTALL_DIR}/${_TBB_ARCHITECTURE}/${_TBB_COMPILER}/lib" ${_TBB_LIBRARY_DIR})
+endif ((NOT ${TBB_ARCHITECTURE} STREQUAL "") AND (NOT ${TBB_COMPILER} STREQUAL ""))
+
+# GvdB: Mac OS X distribution places libraries directly in lib directory.
+list(APPEND _TBB_LIBRARY_DIR ${_TBB_INSTALL_DIR}/lib)
+
+# Jiri: No reason not to check the default paths. From recent versions,
+#       tbbvars has started exporting the LIBRARY_PATH and LD_LIBRARY_PATH
+#       variables, which now point to the directories of the lib files.
+#       It all makes more sense to use the ${_TBB_LIBRARY_DIR} as a HINTS
+#       argument instead of the implicit PATHS as it isn't hard-coded
+#       but computed by system introspection. Searching the LIBRARY_PATH
+#       and LD_LIBRARY_PATH environment variables is now even more important
+#       that tbbvars doesn't export TBB_ARCH_PLATFORM and it facilitates
+#       the use of TBB built from sources.
+find_library(TBB_LIBRARY ${_TBB_LIB_NAME} HINTS ${_TBB_LIBRARY_DIR}
+        PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH)
+find_library(TBB_MALLOC_LIBRARY ${_TBB_LIB_MALLOC_NAME} HINTS ${_TBB_LIBRARY_DIR}
+        PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH)
+
+#Extract path from TBB_LIBRARY name
+get_filename_component(TBB_LIBRARY_DIR ${TBB_LIBRARY} PATH)
+
+#TBB_CORRECT_LIB_DIR(TBB_LIBRARY)
+#TBB_CORRECT_LIB_DIR(TBB_MALLOC_LIBRARY)
+mark_as_advanced(TBB_LIBRARY TBB_MALLOC_LIBRARY)
+
+#-- Look for debug libraries
+# Jiri: Changed the same way as for the release libraries.
+find_library(TBB_LIBRARY_DEBUG ${_TBB_LIB_DEBUG_NAME} HINTS ${_TBB_LIBRARY_DIR}
+        PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH)
+find_library(TBB_MALLOC_LIBRARY_DEBUG ${_TBB_LIB_MALLOC_DEBUG_NAME} HINTS ${_TBB_LIBRARY_DIR}
+        PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH)
+
+# Jiri: Self-built TBB stores the debug libraries in a separate directory.
+#       Extract path from TBB_LIBRARY_DEBUG name
+get_filename_component(TBB_LIBRARY_DEBUG_DIR ${TBB_LIBRARY_DEBUG} PATH)
+
+#TBB_CORRECT_LIB_DIR(TBB_LIBRARY_DEBUG)
+#TBB_CORRECT_LIB_DIR(TBB_MALLOC_LIBRARY_DEBUG)
+mark_as_advanced(TBB_LIBRARY_DEBUG TBB_MALLOC_LIBRARY_DEBUG)
+
+
+if (TBB_INCLUDE_DIR)
+    if (TBB_LIBRARY)
+        set (TBB_FOUND "YES")
+        set (TBB_LIBRARIES ${TBB_LIBRARY} ${TBB_MALLOC_LIBRARY} ${TBB_LIBRARIES})
+        set (TBB_DEBUG_LIBRARIES ${TBB_LIBRARY_DEBUG} ${TBB_MALLOC_LIBRARY_DEBUG} ${TBB_DEBUG_LIBRARIES})
+        set (TBB_INCLUDE_DIRS ${TBB_INCLUDE_DIR} CACHE PATH "TBB include directory" FORCE)
+        set (TBB_LIBRARY_DIRS ${TBB_LIBRARY_DIR} CACHE PATH "TBB library directory" FORCE)
+        # Jiri: Self-built TBB stores the debug libraries in a separate directory.
+        set (TBB_DEBUG_LIBRARY_DIRS ${TBB_LIBRARY_DEBUG_DIR} CACHE PATH "TBB debug library directory" FORCE)
+        mark_as_advanced(TBB_INCLUDE_DIRS TBB_LIBRARY_DIRS TBB_DEBUG_LIBRARY_DIRS TBB_LIBRARIES TBB_DEBUG_LIBRARIES)
+        message(STATUS "Found Intel TBB")
+    endif (TBB_LIBRARY)
+endif (TBB_INCLUDE_DIR)
+
+if (NOT TBB_FOUND)
+    message("ERROR: Intel TBB NOT found!")
+    message(STATUS "Looked for Threading Building Blocks in ${_TBB_INSTALL_DIR}")
+    # do only throw fatal, if this pkg is REQUIRED
+    if (TBB_FIND_REQUIRED)
+        message(FATAL_ERROR "Could NOT find TBB library.")
+    endif (TBB_FIND_REQUIRED)
+endif (NOT TBB_FOUND)
+
+endif (NOT _TBB_INSTALL_DIR)
+
+if (TBB_FOUND)
+	set(TBB_INTERFACE_VERSION 0)
+	FILE(READ "${TBB_INCLUDE_DIRS}/tbb/tbb_stddef.h" _TBB_VERSION_CONTENTS)
+	STRING(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1" TBB_INTERFACE_VERSION "${_TBB_VERSION_CONTENTS}")
+	set(TBB_INTERFACE_VERSION "${TBB_INTERFACE_VERSION}")
+endif (TBB_FOUND)
diff --git a/cmake/Modules/FindTcmalloc.cmake b/cmake/Modules/FindTcmalloc.cmake
new file mode 100644
index 0000000..96727ef
--- /dev/null
+++ b/cmake/Modules/FindTcmalloc.cmake
@@ -0,0 +1,47 @@
+# - Find Tcmalloc
+# Find the native Tcmalloc includes and library
+#
+#  Tcmalloc_INCLUDE_DIR - where to find Tcmalloc.h, etc.
+#  Tcmalloc_LIBRARIES   - List of libraries when using Tcmalloc.
+#  Tcmalloc_FOUND       - True if Tcmalloc found.
+
+find_path(Tcmalloc_INCLUDE_DIR google/tcmalloc.h NO_DEFAULT_PATH PATHS
+  ${HT_DEPENDENCY_INCLUDE_DIR}
+  /usr/include
+  /opt/local/include
+  /usr/local/include
+)
+
+if (USE_TCMALLOC)
+  set(Tcmalloc_NAMES tcmalloc)
+else ()
+  set(Tcmalloc_NAMES tcmalloc_minimal tcmalloc)
+endif ()
+
+find_library(Tcmalloc_LIBRARY NO_DEFAULT_PATH
+  NAMES ${Tcmalloc_NAMES}
+  PATHS ${HT_DEPENDENCY_LIB_DIR} /lib /usr/lib /usr/local/lib /opt/local/lib
+)
+
+if (Tcmalloc_INCLUDE_DIR AND Tcmalloc_LIBRARY)
+  set(Tcmalloc_FOUND TRUE)
+  set( Tcmalloc_LIBRARIES ${Tcmalloc_LIBRARY} )
+else ()
+  set(Tcmalloc_FOUND FALSE)
+  set( Tcmalloc_LIBRARIES )
+endif ()
+
+if (Tcmalloc_FOUND)
+  message(STATUS "Found Tcmalloc: ${Tcmalloc_LIBRARY}")
+else ()
+  message(STATUS "Not Found Tcmalloc: ${Tcmalloc_LIBRARY}")
+  if (Tcmalloc_FIND_REQUIRED)
+    message(STATUS "Looked for Tcmalloc libraries named ${Tcmalloc_NAMES}.")
+    message(FATAL_ERROR "Could NOT find Tcmalloc library")
+  endif ()
+endif ()
+
+mark_as_advanced(
+  Tcmalloc_LIBRARY
+  Tcmalloc_INCLUDE_DIR
+  )
diff --git a/cmake/PostInstall.cmake b/cmake/PostInstall.cmake
new file mode 100644
index 0000000..62a572d
--- /dev/null
+++ b/cmake/PostInstall.cmake
@@ -0,0 +1,19 @@
+##
+# Print some post install messages for the user
+##
+message("\n\n")
+message("Installation complete. Please ensure the following paths are set properly.")	
+message("==========================================================================")
+if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+	message("Fixing library names with install_name_tool")
+	execute_process(COMMAND install_name_tool -add_rpath ${CMAKE_INSTALL_PREFIX}/bin ${CMAKE_INSTALL_PREFIX}/bin/salmon)
+	execute_process(COMMAND install_name_tool -add_rpath ${CMAKE_INSTALL_PREFIX}/lib ${CMAKE_INSTALL_PREFIX}/bin/salmon)
+	execute_process(COMMAND install_name_tool -add_rpath @executable_path ${CMAKE_INSTALL_PREFIX}/bin/salmon) 
+endif()
+message("Please add ${CMAKE_INSTALL_PREFIX}/bin to your PATH")
+if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin")
+	message("Please add ${CMAKE_INSTALL_PREFIX}/lib to your DYLD_FALLBACK_LIBRARY_PATH")
+else()
+	message("Please add ${CMAKE_INSTALL_PREFIX}/lib to your LD_LIBRARY_PATH")
+endif()
+message("==========================================================================")
diff --git a/cmake/SimpleTest.cmake b/cmake/SimpleTest.cmake
new file mode 100644
index 0000000..c24f8c6
--- /dev/null
+++ b/cmake/SimpleTest.cmake
@@ -0,0 +1,35 @@
+execute_process(COMMAND tar xzvf sample_data.tgz
+                WORKING_DIRECTORY ${TOPLEVEL_DIR}
+                RESULT_VARIABLE TAR_RESULT
+               )
+
+if (TAR_RESULT)
+    message(FATAL_ERROR "Error untarring sample_data.tgz")
+endif()
+
+set(INDEX_CMD ${TOPLEVEL_DIR}/build/src/sailfish index -t transcripts.fasta -k 20 -o sample_index --force)
+execute_process(COMMAND ${INDEX_CMD}
+                WORKING_DIRECTORY ${TOPLEVEL_DIR}/sample_data
+                RESULT_VARIABLE INDEX_RESULT
+                )
+
+if (INDEX_RESULT)
+    message(FATAL_ERROR "Error running ${INDEX_COMMAND}")
+endif()
+
+set(QUANT_COMMAND ${TOPLEVEL_DIR}/build/src/sailfish quant -i sample_index --noBiasCorrect -l IU -1 reads_1.fastq -2 reads_2.fastq -o sample_quant)
+execute_process(COMMAND ${QUANT_COMMAND}
+	            WORKING_DIRECTORY ${TOPLEVEL_DIR}/sample_data
+	            RESULT_VARIABLE QUANT_RESULT
+                )
+if (QUANT_RESULT)
+    message(FATAL_ERROR "Error running ${QUANT_RESULT}")
+endif()
+
+if (EXISTS ${TOPLEVEL_DIR}/sample_data/sample_quant/quant.sf)
+	message("Sailfish ran successfully")
+else()
+	message(FATAL_ERROR "Sailfish failed to produce output")
+endif()
+
+
diff --git a/cmake/TestSalmon.cmake b/cmake/TestSalmon.cmake
new file mode 100644
index 0000000..6384868
--- /dev/null
+++ b/cmake/TestSalmon.cmake
@@ -0,0 +1,35 @@
+execute_process(COMMAND tar xzvf sample_data.tgz
+                WORKING_DIRECTORY ${TOPLEVEL_DIR}
+                RESULT_VARIABLE TAR_RESULT
+               )
+
+if (TAR_RESULT)
+    message(FATAL_ERROR "Error untarring sample_data.tgz")
+endif()
+
+set(SALMON_INDEX_CMD ${TOPLEVEL_DIR}/build/src/salmon index -t transcripts.fasta -i sample_salmon_index)
+execute_process(COMMAND ${SALMON_INDEX_CMD}
+                WORKING_DIRECTORY ${TOPLEVEL_DIR}/sample_data
+                RESULT_VARIABLE SALMON_INDEX_RESULT
+                )
+
+if (SALMON_INDEX_RESULT)
+    message(FATAL_ERROR "Error running ${SALMON_INDEX_COMMAND}")
+endif()
+
+set(SALMON_QUANT_COMMAND ${TOPLEVEL_DIR}/build/src/salmon quant -i sample_salmon_index -l IU -1 reads_1.fastq -2 reads_2.fastq -o sample_salmon_quant -n 1000000)
+execute_process(COMMAND ${SALMON_QUANT_COMMAND}
+	            WORKING_DIRECTORY ${TOPLEVEL_DIR}/sample_data
+                RESULT_VARIABLE SALMON_QUANT_RESULT
+                )
+if (SALMON_QUANT_RESULT)
+    message(FATAL_ERROR "Error running ${QUANT_RESULT}")
+endif()
+
+if (EXISTS ${TOPLEVEL_DIR}/sample_data/sample_salmon_quant/quant.sf)
+    message("Salmon (read) ran successfully")
+else()
+    message(FATAL_ERROR "Salmon (read) failed to produce output")
+endif()
+
+
diff --git a/doc/FileFormats.md b/doc/FileFormats.md
new file mode 100755
index 0000000..d970310
--- /dev/null
+++ b/doc/FileFormats.md
@@ -0,0 +1,14 @@
+Transcript Index Format
+=======================
+
+The sorted list (array) is the structure used by our program
+to count the k-mers from the reads and it relies on a transcript
+index.  The index is (currently) simply a sorted array containing
+all of the kmers, in encoded (uint64_t) numeric order, that were
+seen in the trancsripts.  The file format is as follows
+
+````
+kmer_len[uint32_t]
+num_kmers[uint32_t]
+k_1[uint_64t] . . . k_{num_kmers}[uint64_t]
+````
\ No newline at end of file
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000..89a4147
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,177 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  xml        to make Docutils-native XML files"
+	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Sailfish.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Sailfish.qhc"
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/Sailfish"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Sailfish"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through platex and dvipdfmx..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	make -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
+
+xml:
+	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+	@echo
+	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+	@echo
+	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/doc/build/doctrees/building.doctree b/doc/build/doctrees/building.doctree
new file mode 100644
index 0000000..9080407
Binary files /dev/null and b/doc/build/doctrees/building.doctree differ
diff --git a/doc/build/doctrees/environment.pickle b/doc/build/doctrees/environment.pickle
new file mode 100644
index 0000000..28aa087
Binary files /dev/null and b/doc/build/doctrees/environment.pickle differ
diff --git a/doc/build/doctrees/index.doctree b/doc/build/doctrees/index.doctree
new file mode 100644
index 0000000..7838248
Binary files /dev/null and b/doc/build/doctrees/index.doctree differ
diff --git a/doc/build/html/.buildinfo b/doc/build/html/.buildinfo
new file mode 100644
index 0000000..fa61659
--- /dev/null
+++ b/doc/build/html/.buildinfo
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: 0e2b30cad323b017c247ad4d9d521088
+tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/doc/build/html/_sources/building.txt b/doc/build/html/_sources/building.txt
new file mode 100644
index 0000000..58ac7d2
--- /dev/null
+++ b/doc/build/html/_sources/building.txt
@@ -0,0 +1,203 @@
+Requirements
+============
+
+* A C++11 conformant compiler (currently tested with GCC>=4.7 and Clang>=3.4)
+* CMake_. Sailfish uses the CMake build system to check, fetch and install
+  dependencies, and to compile and install Sailfish. CMake is available for all
+  major platforms (though Sailfish is currently unsupported on Windows.)
+  
+Installation
+============
+
+After downloading the Sailfish source distribution and unpacking it, change into the top-level directory:
+
+::
+
+    > cd Sailfish
+
+Then, create and out-of-source build directory and change into it:
+
+::
+
+    > mkdir build
+    > cd build
+
+
+Sailfish makes extensive use of Boost_.  We recommend installing the most
+recent version (1.55) systemwide if possible. If Boost is not installed on your
+system, the build process will fetch, compile and install it locally.  However,
+if you already have a recent version of Boost available on your system, it make
+sense to tell the build system to use that.
+
+If you have Boost installed you can tell CMake where to look for it. Likewise,
+if you already have `Intel's Threading Building Blocks
+<http://threadingbuildingblocks.org/>`_ library installed, you can tell CMake
+where it is as well. The flags for CMake are as follows:
+
+* -DFETCH_BOOST=TRUE --  If you don't have Boost installed (or have an older
+  version of it), you can provide the FETCH_BOOST flag instead of the
+  BOOST_ROOT variable, which will cause CMake to fetch and build Boost locally.
+
+* -DBOOST_ROOT=<boostdir> -- Tells CMake where an existing installtion of Boost
+  resides, and looks for the appropritate version in <boostdir>.  This is the
+  top-level directory where Boost is installed (e.g. /opt/local).
+
+* -DTBB_INSTALL_DIR=<tbbroot> -- Tells CMake where an existing installation of
+  Intel's TBB is installed (<tbbroot>), and looks for the apropriate headers
+  and libraries there. This is the top-level directory where TBB is installed
+  (e.g. /opt/local).
+
+* -DCMAKE_INSTALL_PREFIX=<install_dir> -- <install_dir> is the directory to
+  which you wish Sailfish to be installed.  If you don't specify this option,
+  it will be installed locally in the top-level directory (i.e. the directory
+  directly above "build").
+                                  
+Setting the appropriate flags, you can then run the CMake configure step as
+follows:
+
+::
+                                  
+    > cmake [FLAGS] ..
+
+The above command is the cmake configuration step, which *should* complain if
+anything goes wrong.  Next, you have to run the build step. Depending on what
+libraries need to be fetched and installed, this could take a while
+(specifically if the installation needs to install Boost).  To start the build,
+just run make.
+
+::
+
+    > make
+
+If the build is successful, the appropriate executables and libraries should be
+created. There are two points to note about the build process.  First, if the
+build system is downloading and compiling boost, you may see a large number of
+warnings during compilation; these are normal.  Second, note that CMake has
+colored output by default, and the steps which create or link libraries are
+printed in red.  This is the color chosen by CMake for linking messages, and
+does not denote an error in the build process. 
+                                  
+Finally, after everything is built, the libraries and executable can be
+installed with:
+
+::
+                                  
+    > make install
+
+To ensure that Sailfish has access to the appropriate libraries you should
+ensure that the PATH variabile contains <install_dir>/bin, and that
+LD_LIBRARY_PATH (or DYLD_FALLBACK_LIBRARY_PATH on OSX) contains
+<install_dir>/lib.
+
+After the paths are set, you can test the installation by running
+
+::
+
+    > make test
+
+This should run a simple test and tell you if it succeeded or not.
+
+Running Sailfish
+================
+
+Sailfish is a tool for transcript quantification from RNA-seq data.  It
+requires a set of target transcripts (either from a reference or _de-novo_
+assembly) to quantify.  All you need to run Sailfish is a fasta file containing
+your reference transcripts and a (set of) fasta/fastq file(s) containing your
+reads.  Sailfish runs in two phases; indexing and quantification.  The indexing
+step is independent of the reads, and only need to be run one for a particular
+set of reference transcripts and choice of k (the k-mer size). The
+quantification step, obviously, is specific to the set of RNA-seq reads and is
+thus run more frequently. For a more complete description of all available
+options in Sailfish, see the manual.
+
+
+Indexing
+--------
+
+To generate the Sailfish index for your reference set of transcripts, you
+should run the following command:
+
+::                                  
+    > sailfish index -t <ref_transcripts> -o <out_dir> -k <kmer_len>
+
+
+This will build a Sailfish index for k-mers of length \<kmer_len\> for the
+reference transcripts  provided in the file \<ref_transcripts\> and place the
+index under the directory \<out_dir\>.  There  are additional options that can
+be passed to the Sailfish indexer (e.g. the number of threads to use).  These
+can be seen by executing the command "Sailfish index -h".
+
+Quantification
+--------------
+
+Now that you have generated the Sailfish index (say that it's the directory
+<index_dir> --- this corresponds to the <out_dir> argument provided in the
+previous step), you can quantify the transcript expression for a given set of
+reads.  To perform the quantification, you run a command like the following:
+
+::                                 
+    
+    > sailfish quant -i <index_dir> -l "<libtype>" {-r <unmated> | -1 <mates1> -2 <mates2>} -o <quant_dir>
+
+Where <index_dir> is, as described above, the location of the sailfish index,
+<libtype> is a string describing the format of the read library (see the
+[library string](#library-string) section below) <unmated> is a list of files
+containing unmated reads, <mates{1,2}> are lists of files containg,
+respectively, the first and second mates of paired-end reads.  Finally,
+<quant_dir> is the directory where the output should be written. Just like the
+indexing step, additional options are available, and can be viewed by running
+"sailfish quant -h".
+
+When the quantification step is finished, the directory \<quant_dir\> will
+contain a file named "quant.sf" (and, if bias correction is enabled, an
+additional file names "quant_bias_corrected.sf").  This file contains the
+result of the Sailfish quantification step.  This file contains a number of
+columns (which are listed in the last of the header lines beginning with '#').
+Specifically, the columns are (1) Transcript ID, (2) Transcript Length, (3)
+Transcripts per Million (TPM), (4) Reads Per Kilobase per Million mapped reads
+(RPKM), (5) K-mers Per Kilobase per Million mapped k-mers (KPKM), (6) Estimated
+number of k-mers (an estimate of the number of k-mers drawn from this
+transcript given the transcript's relative abundance and length) and (7)
+Estimated number of reads (an estimate of the number of reads drawn from this
+transcript given the transcript's relative abnundance and length).  The first
+two columns are self-explanatory, the next four are measures of transcript
+abundance and the final is a commonly used input for differential expression
+tools.  The Transcripts per Million quantification number is computed as
+described in [1]_, and is meant as an estimate of the number of transcripts, per
+million observed transcripts, originating from each isoform.  Its benefit over
+the K/RPKM measure is that it is independent of the mean expressed transcript
+length (i.e. if the mean expressed transcript length varies between samples,
+for example, this alone can affect differential analysis based on the K/RPKM.)
+The RPKM is a classic measure of relative transcript abundance, and is an
+estimate of the number of reads per kilobase of transcript (per million mapped
+reads) originating from each transcript. The KPKM should closely track the
+RPKM, but is defined for very short features which are larger than the chosen
+k-mer length but may be shorter than the read length. Typically, you should
+prefer the KPKM measure to the RPKM measure, since the k-mer is the most
+natural unit of coverage for Sailfish.
+
+
+License
+=======
+                                  
+This program is free software: you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation, either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+
+References
+==========
+
+.. [1] Li, Bo, et al. "RNA-Seq gene expression estimation with read mapping uncertainty." 
+    Bioinformatics 26.4 (2010): 493-500.i
+
+.. _CMake : http://www.cmake.org 
+.. _Boost: http://www.boost.org
diff --git a/doc/build/html/_sources/index.txt b/doc/build/html/_sources/index.txt
new file mode 100644
index 0000000..f3a20e0
--- /dev/null
+++ b/doc/build/html/_sources/index.txt
@@ -0,0 +1,22 @@
+.. Sailfish documentation master file, created by
+   sphinx-quickstart on Tue Jul 15 17:48:43 2014.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to Sailfish's documentation!
+====================================
+
+Contents:
+
+.. toctree::
+   :maxdepth: 2
+
+   building.rst
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/doc/build/html/_static/ajax-loader.gif b/doc/build/html/_static/ajax-loader.gif
new file mode 100644
index 0000000..61faf8c
Binary files /dev/null and b/doc/build/html/_static/ajax-loader.gif differ
diff --git a/doc/build/html/_static/basic.css b/doc/build/html/_static/basic.css
new file mode 100644
index 0000000..967e36c
--- /dev/null
+++ b/doc/build/html/_static/basic.css
@@ -0,0 +1,537 @@
+/*
+ * basic.css
+ * ~~~~~~~~~
+ *
+ * Sphinx stylesheet -- basic theme.
+ *
+ * :copyright: Copyright 2007-2014 by the Sphinx team, see AUTHORS.
+ * :license: BSD, see LICENSE for details.
+ *
+ */
+
+/* -- main layout ----------------------------------------------------------- */
+
+div.clearer {
+    clear: both;
+}
+
+/* -- relbar ---------------------------------------------------------------- */
+
+div.related {
+    width: 100%;
+    font-size: 90%;
+}
+
+div.related h3 {
+    display: none;
+}
+
+div.related ul {
+    margin: 0;
+    padding: 0 0 0 10px;
+    list-style: none;
+}
+
+div.related li {
+    display: inline;
+}
+
+div.related li.right {
+    float: right;
+    margin-right: 5px;
+}
+
+/* -- sidebar --------------------------------------------------------------- */
+
+div.sphinxsidebarwrapper {
+    padding: 10px 5px 0 10px;
+}
+
+div.sphinxsidebar {
+    float: left;
+    width: 230px;
+    margin-left: -100%;
+    font-size: 90%;
+}
+
+div.sphinxsidebar ul {
+    list-style: none;
+}
+
+div.sphinxsidebar ul ul,
+div.sphinxsidebar ul.want-points {
+    margin-left: 20px;
+    list-style: square;
+}
+
+div.sphinxsidebar ul ul {
+    margin-top: 0;
+    margin-bottom: 0;
+}
+
+div.sphinxsidebar form {
+    margin-top: 10px;
+}
+
+div.sphinxsidebar input {
+    border: 1px solid #98dbcc;
+    font-family: sans-serif;
+    font-size: 1em;
+}
+
+div.sphinxsidebar #searchbox input[type="text"] {
+    width: 170px;
+}
+
+div.sphinxsidebar #searchbox input[type="submit"] {
+    width: 30px;
+}
+
+img {
+    border: 0;
+    max-width: 100%;
+}
+
+/* -- search page ----------------------------------------------------------- */
+
+ul.search {
+    margin: 10px 0 0 20px;
+    padding: 0;
+}
+
+ul.search li {
+    padding: 5px 0 5px 20px;
+    background-image: url(file.png);
+    background-repeat: no-repeat;
+    background-position: 0 7px;
+}
+
+ul.search li a {
+    font-weight: bold;
+}
+
+ul.search li div.context {
+    color: #888;
+    margin: 2px 0 0 30px;
+    text-align: left;
+}
+
+ul.keywordmatches li.goodmatch a {
+    font-weight: bold;
+}
+
+/* -- index page ------------------------------------------------------------ */
+
+table.contentstable {
+    width: 90%;
+}
+
+table.contentstable p.biglink {
+    line-height: 150%;
+}
+
+a.biglink {
+    font-size: 1.3em;
+}
+
+span.linkdescr {
+    font-style: italic;
+    padding-top: 5px;
+    font-size: 90%;
+}
+
+/* -- general index --------------------------------------------------------- */
+
+table.indextable {
+    width: 100%;
+}
+
+table.indextable td {
+    text-align: left;
+    vertical-align: top;
+}
+
+table.indextable dl, table.indextable dd {
+    margin-top: 0;
+    margin-bottom: 0;
+}
+
+table.indextable tr.pcap {
+    height: 10px;
+}
+
+table.indextable tr.cap {
+    margin-top: 10px;
+    background-color: #f2f2f2;
+}
+
+img.toggler {
+    margin-right: 3px;
+    margin-top: 3px;
+    cursor: pointer;
+}
+
+div.modindex-jumpbox {
+    border-top: 1px solid #ddd;
+    border-bottom: 1px solid #ddd;
+    margin: 1em 0 1em 0;
+    padding: 0.4em;
+}
+
+div.genindex-jumpbox {
+    border-top: 1px solid #ddd;
+    border-bottom: 1px solid #ddd;
+    margin: 1em 0 1em 0;
+    padding: 0.4em;
+}
+
+/* -- general body styles --------------------------------------------------- */
+
+a.headerlink {
+    visibility: hidden;
+}
+
+h1:hover > a.headerlink,
+h2:hover > a.headerlink,
+h3:hover > a.headerlink,
+h4:hover > a.headerlink,
+h5:hover > a.headerlink,
+h6:hover > a.headerlink,
+dt:hover > a.headerlink {
+    visibility: visible;
+}
+
+div.body p.caption {
+    text-align: inherit;
+}
+
+div.body td {
+    text-align: left;
+}
+
+.field-list ul {
+    padding-left: 1em;
+}
+
+.first {
+    margin-top: 0 !important;
+}
+
+p.rubric {
+    margin-top: 30px;
+    font-weight: bold;
+}
+
+img.align-left, .figure.align-left, object.align-left {
+    clear: left;
+    float: left;
+    margin-right: 1em;
+}
+
+img.align-right, .figure.align-right, object.align-right {
+    clear: right;
+    float: right;
+    margin-left: 1em;
+}
+
+img.align-center, .figure.align-center, object.align-center {
+  display: block;
+  margin-left: auto;
+  margin-right: auto;
+}
+
+.align-left {
+    text-align: left;
+}
+
+.align-center {
+    text-align: center;
+}
+
+.align-right {
+    text-align: right;
+}
+
+/* -- sidebars -------------------------------------------------------------- */
+
+div.sidebar {
+    margin: 0 0 0.5em 1em;
+    border: 1px solid #ddb;
+    padding: 7px 7px 0 7px;
+    background-color: #ffe;
+    width: 40%;
+    float: right;
+}
+
+p.sidebar-title {
+    font-weight: bold;
+}
+
+/* -- topics ---------------------------------------------------------------- */
+
+div.topic {
+    border: 1px solid #ccc;
+    padding: 7px 7px 0 7px;
+    margin: 10px 0 10px 0;
+}
+
+p.topic-title {
+    font-size: 1.1em;
+    font-weight: bold;
+    margin-top: 10px;
+}
+
+/* -- admonitions ----------------------------------------------------------- */
+
+div.admonition {
+    margin-top: 10px;
+    margin-bottom: 10px;
+    padding: 7px;
+}
+
+div.admonition dt {
+    font-weight: bold;
+}
+
+div.admonition dl {
+    margin-bottom: 0;
+}
+
+p.admonition-title {
+    margin: 0px 10px 5px 0px;
+    font-weight: bold;
+}
+
+div.body p.centered {
+    text-align: center;
+    margin-top: 25px;
+}
+
+/* -- tables ---------------------------------------------------------------- */
+
+table.docutils {
+    border: 0;
+    border-collapse: collapse;
+}
+
+table.docutils td, table.docutils th {
+    padding: 1px 8px 1px 5px;
+    border-top: 0;
+    border-left: 0;
+    border-right: 0;
+    border-bottom: 1px solid #aaa;
+}
+
+table.field-list td, table.field-list th {
+    border: 0 !important;
+}
+
+table.footnote td, table.footnote th {
+    border: 0 !important;
+}
+
+th {
+    text-align: left;
+    padding-right: 5px;
+}
+
+table.citation {
+    border-left: solid 1px gray;
+    margin-left: 1px;
+}
+
+table.citation td {
+    border-bottom: none;
+}
+
+/* -- other body styles ----------------------------------------------------- */
+
+ol.arabic {
+    list-style: decimal;
+}
+
+ol.loweralpha {
+    list-style: lower-alpha;
+}
+
+ol.upperalpha {
+    list-style: upper-alpha;
+}
+
+ol.lowerroman {
+    list-style: lower-roman;
+}
+
+ol.upperroman {
+    list-style: upper-roman;
+}
+
+dl {
+    margin-bottom: 15px;
+}
+
+dd p {
+    margin-top: 0px;
+}
+
+dd ul, dd table {
+    margin-bottom: 10px;
+}
+
+dd {
+    margin-top: 3px;
+    margin-bottom: 10px;
+    margin-left: 30px;
+}
+
+dt:target, .highlighted {
+    background-color: #fbe54e;
+}
+
+dl.glossary dt {
+    font-weight: bold;
+    font-size: 1.1em;
+}
+
+.field-list ul {
+    margin: 0;
+    padding-left: 1em;
+}
+
+.field-list p {
+    margin: 0;
+}
+
+.optional {
+    font-size: 1.3em;
+}
+
+.versionmodified {
+    font-style: italic;
+}
+
+.system-message {
+    background-color: #fda;
+    padding: 5px;
+    border: 3px solid red;
+}
+
+.footnote:target  {
+    background-color: #ffa;
+}
+
+.line-block {
+    display: block;
+    margin-top: 1em;
+    margin-bottom: 1em;
+}
+
+.line-block .line-block {
+    margin-top: 0;
+    margin-bottom: 0;
+    margin-left: 1.5em;
+}
+
+.guilabel, .menuselection {
+    font-family: sans-serif;
+}
+
+.accelerator {
+    text-decoration: underline;
+}
+
+.classifier {
+    font-style: oblique;
+}
+
+abbr, acronym {
+    border-bottom: dotted 1px;
+    cursor: help;
+}
+
+/* -- code displays --------------------------------------------------------- */
+
+pre {
+    overflow: auto;
+    overflow-y: hidden;  /* fixes display issues on Chrome browsers */
+}
+
+td.linenos pre {
+    padding: 5px 0px;
+    border: 0;
+    background-color: transparent;
+    color: #aaa;
+}
+
+table.highlighttable {
+    margin-left: 0.5em;
+}
+
+table.highlighttable td {
+    padding: 0 0.5em 0 0.5em;
+}
+
+tt.descname {
+    background-color: transparent;
+    font-weight: bold;
+    font-size: 1.2em;
+}
+
+tt.descclassname {
+    background-color: transparent;
+}
+
+tt.xref, a tt {
+    background-color: transparent;
+    font-weight: bold;
+}
+
+h1 tt, h2 tt, h3 tt, h4 tt, h5 tt, h6 tt {
+    background-color: transparent;
+}
+
+.viewcode-link {
+    float: right;
+}
+
+.viewcode-back {
+    float: right;
+    font-family: sans-serif;
+}
+
+div.viewcode-block:target {
+    margin: -1px -10px;
+    padding: 0 10px;
+}
+
+/* -- math display ---------------------------------------------------------- */
+
+img.math {
+    vertical-align: middle;
+}
+
+div.body div.math p {
+    text-align: center;
+}
+
+span.eqno {
+    float: right;
+}
+
+/* -- printout stylesheet --------------------------------------------------- */
+
+ at media print {
+    div.document,
+    div.documentwrapper,
+    div.bodywrapper {
+        margin: 0 !important;
+        width: 100%;
+    }
+
+    div.sphinxsidebar,
+    div.related,
+    div.footer,
+    #top-link {
+        display: none;
+    }
+}
\ No newline at end of file
diff --git a/doc/build/html/_static/comment-bright.png b/doc/build/html/_static/comment-bright.png
new file mode 100644
index 0000000..551517b
Binary files /dev/null and b/doc/build/html/_static/comment-bright.png differ
diff --git a/doc/build/html/_static/comment-close.png b/doc/build/html/_static/comment-close.png
new file mode 100644
index 0000000..09b54be
Binary files /dev/null and b/doc/build/html/_static/comment-close.png differ
diff --git a/doc/build/html/_static/comment.png b/doc/build/html/_static/comment.png
new file mode 100644
index 0000000..92feb52
Binary files /dev/null and b/doc/build/html/_static/comment.png differ
diff --git a/doc/build/html/_static/default.css b/doc/build/html/_static/default.css
new file mode 100644
index 0000000..5f1399a
--- /dev/null
+++ b/doc/build/html/_static/default.css
@@ -0,0 +1,256 @@
+/*
+ * default.css_t
+ * ~~~~~~~~~~~~~
+ *
+ * Sphinx stylesheet -- default theme.
+ *
+ * :copyright: Copyright 2007-2014 by the Sphinx team, see AUTHORS.
+ * :license: BSD, see LICENSE for details.
+ *
+ */
+
+ at import url("basic.css");
+
+/* -- page layout ----------------------------------------------------------- */
+
+body {
+    font-family: sans-serif;
+    font-size: 100%;
+    background-color: #11303d;
+    color: #000;
+    margin: 0;
+    padding: 0;
+}
+
+div.document {
+    background-color: #1c4e63;
+}
+
+div.documentwrapper {
+    float: left;
+    width: 100%;
+}
+
+div.bodywrapper {
+    margin: 0 0 0 230px;
+}
+
+div.body {
+    background-color: #ffffff;
+    color: #000000;
+    padding: 0 20px 30px 20px;
+}
+
+div.footer {
+    color: #ffffff;
+    width: 100%;
+    padding: 9px 0 9px 0;
+    text-align: center;
+    font-size: 75%;
+}
+
+div.footer a {
+    color: #ffffff;
+    text-decoration: underline;
+}
+
+div.related {
+    background-color: #133f52;
+    line-height: 30px;
+    color: #ffffff;
+}
+
+div.related a {
+    color: #ffffff;
+}
+
+div.sphinxsidebar {
+}
+
+div.sphinxsidebar h3 {
+    font-family: 'Trebuchet MS', sans-serif;
+    color: #ffffff;
+    font-size: 1.4em;
+    font-weight: normal;
+    margin: 0;
+    padding: 0;
+}
+
+div.sphinxsidebar h3 a {
+    color: #ffffff;
+}
+
+div.sphinxsidebar h4 {
+    font-family: 'Trebuchet MS', sans-serif;
+    color: #ffffff;
+    font-size: 1.3em;
+    font-weight: normal;
+    margin: 5px 0 0 0;
+    padding: 0;
+}
+
+div.sphinxsidebar p {
+    color: #ffffff;
+}
+
+div.sphinxsidebar p.topless {
+    margin: 5px 10px 10px 10px;
+}
+
+div.sphinxsidebar ul {
+    margin: 10px;
+    padding: 0;
+    color: #ffffff;
+}
+
+div.sphinxsidebar a {
+    color: #98dbcc;
+}
+
+div.sphinxsidebar input {
+    border: 1px solid #98dbcc;
+    font-family: sans-serif;
+    font-size: 1em;
+}
+
+
+
+/* -- hyperlink styles ------------------------------------------------------ */
+
+a {
+    color: #355f7c;
+    text-decoration: none;
+}
+
+a:visited {
+    color: #355f7c;
+    text-decoration: none;
+}
+
+a:hover {
+    text-decoration: underline;
+}
+
+
+
+/* -- body styles ----------------------------------------------------------- */
+
+div.body h1,
+div.body h2,
+div.body h3,
+div.body h4,
+div.body h5,
+div.body h6 {
+    font-family: 'Trebuchet MS', sans-serif;
+    background-color: #f2f2f2;
+    font-weight: normal;
+    color: #20435c;
+    border-bottom: 1px solid #ccc;
+    margin: 20px -20px 10px -20px;
+    padding: 3px 0 3px 10px;
+}
+
+div.body h1 { margin-top: 0; font-size: 200%; }
+div.body h2 { font-size: 160%; }
+div.body h3 { font-size: 140%; }
+div.body h4 { font-size: 120%; }
+div.body h5 { font-size: 110%; }
+div.body h6 { font-size: 100%; }
+
+a.headerlink {
+    color: #c60f0f;
+    font-size: 0.8em;
+    padding: 0 4px 0 4px;
+    text-decoration: none;
+}
+
+a.headerlink:hover {
+    background-color: #c60f0f;
+    color: white;
+}
+
+div.body p, div.body dd, div.body li {
+    text-align: justify;
+    line-height: 130%;
+}
+
+div.admonition p.admonition-title + p {
+    display: inline;
+}
+
+div.admonition p {
+    margin-bottom: 5px;
+}
+
+div.admonition pre {
+    margin-bottom: 5px;
+}
+
+div.admonition ul, div.admonition ol {
+    margin-bottom: 5px;
+}
+
+div.note {
+    background-color: #eee;
+    border: 1px solid #ccc;
+}
+
+div.seealso {
+    background-color: #ffc;
+    border: 1px solid #ff6;
+}
+
+div.topic {
+    background-color: #eee;
+}
+
+div.warning {
+    background-color: #ffe4e4;
+    border: 1px solid #f66;
+}
+
+p.admonition-title {
+    display: inline;
+}
+
+p.admonition-title:after {
+    content: ":";
+}
+
+pre {
+    padding: 5px;
+    background-color: #eeffcc;
+    color: #333333;
+    line-height: 120%;
+    border: 1px solid #ac9;
+    border-left: none;
+    border-right: none;
+}
+
+tt {
+    background-color: #ecf0f3;
+    padding: 0 1px 0 1px;
+    font-size: 0.95em;
+}
+
+th {
+    background-color: #ede;
+}
+
+.warning tt {
+    background: #efc2c2;
+}
+
+.note tt {
+    background: #d6d6d6;
+}
+
+.viewcode-back {
+    font-family: sans-serif;
+}
+
+div.viewcode-block:target {
+    background-color: #f4debf;
+    border-top: 1px solid #ac9;
+    border-bottom: 1px solid #ac9;
+}
\ No newline at end of file
diff --git a/doc/build/html/_static/doctools.js b/doc/build/html/_static/doctools.js
new file mode 100644
index 0000000..c5455c9
--- /dev/null
+++ b/doc/build/html/_static/doctools.js
@@ -0,0 +1,238 @@
+/*
+ * doctools.js
+ * ~~~~~~~~~~~
+ *
+ * Sphinx JavaScript utilities for all documentation.
+ *
+ * :copyright: Copyright 2007-2014 by the Sphinx team, see AUTHORS.
+ * :license: BSD, see LICENSE for details.
+ *
+ */
+
+/**
+ * select a different prefix for underscore
+ */
+$u = _.noConflict();
+
+/**
+ * make the code below compatible with browsers without
+ * an installed firebug like debugger
+if (!window.console || !console.firebug) {
+  var names = ["log", "debug", "info", "warn", "error", "assert", "dir",
+    "dirxml", "group", "groupEnd", "time", "timeEnd", "count", "trace",
+    "profile", "profileEnd"];
+  window.console = {};
+  for (var i = 0; i < names.length; ++i)
+    window.console[names[i]] = function() {};
+}
+ */
+
+/**
+ * small helper function to urldecode strings
+ */
+jQuery.urldecode = function(x) {
+  return decodeURIComponent(x).replace(/\+/g, ' ');
+};
+
+/**
+ * small helper function to urlencode strings
+ */
+jQuery.urlencode = encodeURIComponent;
+
+/**
+ * This function returns the parsed url parameters of the
+ * current request. Multiple values per key are supported,
+ * it will always return arrays of strings for the value parts.
+ */
+jQuery.getQueryParameters = function(s) {
+  if (typeof s == 'undefined')
+    s = document.location.search;
+  var parts = s.substr(s.indexOf('?') + 1).split('&');
+  var result = {};
+  for (var i = 0; i < parts.length; i++) {
+    var tmp = parts[i].split('=', 2);
+    var key = jQuery.urldecode(tmp[0]);
+    var value = jQuery.urldecode(tmp[1]);
+    if (key in result)
+      result[key].push(value);
+    else
+      result[key] = [value];
+  }
+  return result;
+};
+
+/**
+ * highlight a given string on a jquery object by wrapping it in
+ * span elements with the given class name.
+ */
+jQuery.fn.highlightText = function(text, className) {
+  function highlight(node) {
+    if (node.nodeType == 3) {
+      var val = node.nodeValue;
+      var pos = val.toLowerCase().indexOf(text);
+      if (pos >= 0 && !jQuery(node.parentNode).hasClass(className)) {
+        var span = document.createElement("span");
+        span.className = className;
+        span.appendChild(document.createTextNode(val.substr(pos, text.length)));
+        node.parentNode.insertBefore(span, node.parentNode.insertBefore(
+          document.createTextNode(val.substr(pos + text.length)),
+          node.nextSibling));
+        node.nodeValue = val.substr(0, pos);
+      }
+    }
+    else if (!jQuery(node).is("button, select, textarea")) {
+      jQuery.each(node.childNodes, function() {
+        highlight(this);
+      });
+    }
+  }
+  return this.each(function() {
+    highlight(this);
+  });
+};
+
+/**
+ * Small JavaScript module for the documentation.
+ */
+var Documentation = {
+
+  init : function() {
+    this.fixFirefoxAnchorBug();
+    this.highlightSearchWords();
+    this.initIndexTable();
+  },
+
+  /**
+   * i18n support
+   */
+  TRANSLATIONS : {},
+  PLURAL_EXPR : function(n) { return n == 1 ? 0 : 1; },
+  LOCALE : 'unknown',
+
+  // gettext and ngettext don't access this so that the functions
+  // can safely bound to a different name (_ = Documentation.gettext)
+  gettext : function(string) {
+    var translated = Documentation.TRANSLATIONS[string];
+    if (typeof translated == 'undefined')
+      return string;
+    return (typeof translated == 'string') ? translated : translated[0];
+  },
+
+  ngettext : function(singular, plural, n) {
+    var translated = Documentation.TRANSLATIONS[singular];
+    if (typeof translated == 'undefined')
+      return (n == 1) ? singular : plural;
+    return translated[Documentation.PLURALEXPR(n)];
+  },
+
+  addTranslations : function(catalog) {
+    for (var key in catalog.messages)
+      this.TRANSLATIONS[key] = catalog.messages[key];
+    this.PLURAL_EXPR = new Function('n', 'return +(' + catalog.plural_expr + ')');
+    this.LOCALE = catalog.locale;
+  },
+
+  /**
+   * add context elements like header anchor links
+   */
+  addContextElements : function() {
+    $('div[id] > :header:first').each(function() {
+      $('<a class="headerlink">\u00B6</a>').
+      attr('href', '#' + this.id).
+      attr('title', _('Permalink to this headline')).
+      appendTo(this);
+    });
+    $('dt[id]').each(function() {
+      $('<a class="headerlink">\u00B6</a>').
+      attr('href', '#' + this.id).
+      attr('title', _('Permalink to this definition')).
+      appendTo(this);
+    });
+  },
+
+  /**
+   * workaround a firefox stupidity
+   */
+  fixFirefoxAnchorBug : function() {
+    if (document.location.hash && $.browser.mozilla)
+      window.setTimeout(function() {
+        document.location.href += '';
+      }, 10);
+  },
+
+  /**
+   * highlight the search words provided in the url in the text
+   */
+  highlightSearchWords : function() {
+    var params = $.getQueryParameters();
+    var terms = (params.highlight) ? params.highlight[0].split(/\s+/) : [];
+    if (terms.length) {
+      var body = $('div.body');
+      if (!body.length) {
+        body = $('body');
+      }
+      window.setTimeout(function() {
+        $.each(terms, function() {
+          body.highlightText(this.toLowerCase(), 'highlighted');
+        });
+      }, 10);
+      $('<p class="highlight-link"><a href="javascript:Documentation.' +
+        'hideSearchWords()">' + _('Hide Search Matches') + '</a></p>')
+          .appendTo($('#searchbox'));
+    }
+  },
+
+  /**
+   * init the domain index toggle buttons
+   */
+  initIndexTable : function() {
+    var togglers = $('img.toggler').click(function() {
+      var src = $(this).attr('src');
+      var idnum = $(this).attr('id').substr(7);
+      $('tr.cg-' + idnum).toggle();
+      if (src.substr(-9) == 'minus.png')
+        $(this).attr('src', src.substr(0, src.length-9) + 'plus.png');
+      else
+        $(this).attr('src', src.substr(0, src.length-8) + 'minus.png');
+    }).css('display', '');
+    if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) {
+        togglers.click();
+    }
+  },
+
+  /**
+   * helper function to hide the search marks again
+   */
+  hideSearchWords : function() {
+    $('#searchbox .highlight-link').fadeOut(300);
+    $('span.highlighted').removeClass('highlighted');
+  },
+
+  /**
+   * make the url absolute
+   */
+  makeURL : function(relativeURL) {
+    return DOCUMENTATION_OPTIONS.URL_ROOT + '/' + relativeURL;
+  },
+
+  /**
+   * get the current relative url
+   */
+  getCurrentURL : function() {
+    var path = document.location.pathname;
+    var parts = path.split(/\//);
+    $.each(DOCUMENTATION_OPTIONS.URL_ROOT.split(/\//), function() {
+      if (this == '..')
+        parts.pop();
+    });
+    var url = parts.join('/');
+    return path.substring(url.lastIndexOf('/') + 1, path.length - 1);
+  }
+};
+
+// quick alias for translations
+_ = Documentation.gettext;
+
+$(document).ready(function() {
+  Documentation.init();
+});
diff --git a/doc/build/html/_static/down-pressed.png b/doc/build/html/_static/down-pressed.png
new file mode 100644
index 0000000..6f7ad78
Binary files /dev/null and b/doc/build/html/_static/down-pressed.png differ
diff --git a/doc/build/html/_static/down.png b/doc/build/html/_static/down.png
new file mode 100644
index 0000000..3003a88
Binary files /dev/null and b/doc/build/html/_static/down.png differ
diff --git a/doc/build/html/_static/file.png b/doc/build/html/_static/file.png
new file mode 100644
index 0000000..d18082e
Binary files /dev/null and b/doc/build/html/_static/file.png differ
diff --git a/doc/build/html/_static/jquery.js b/doc/build/html/_static/jquery.js
new file mode 100644
index 0000000..83589da
--- /dev/null
+++ b/doc/build/html/_static/jquery.js
@@ -0,0 +1,2 @@
+/*! jQuery v1.8.3 jquery.com | jquery.org/license */
+(function(e,t){function _(e){var t=M[e]={};return v.each(e.split(y),function(e,n){t[n]=!0}),t}function H(e,n,r){if(r===t&&e.nodeType===1){var i="data-"+n.replace(P,"-$1").toLowerCase();r=e.getAttribute(i);if(typeof r=="string"){try{r=r==="true"?!0:r==="false"?!1:r==="null"?null:+r+""===r?+r:D.test(r)?v.parseJSON(r):r}catch(s){}v.data(e,n,r)}else r=t}return r}function B(e){var t;for(t in e){if(t==="data"&&v.isEmptyObject(e[t]))continue;if(t!=="toJSON")return!1}return!0}function et(){retur [...]
\ No newline at end of file
diff --git a/doc/build/html/_static/minus.png b/doc/build/html/_static/minus.png
new file mode 100644
index 0000000..da1c562
Binary files /dev/null and b/doc/build/html/_static/minus.png differ
diff --git a/doc/build/html/_static/plus.png b/doc/build/html/_static/plus.png
new file mode 100644
index 0000000..b3cb374
Binary files /dev/null and b/doc/build/html/_static/plus.png differ
diff --git a/doc/build/html/_static/pygments.css b/doc/build/html/_static/pygments.css
new file mode 100644
index 0000000..d79caa1
--- /dev/null
+++ b/doc/build/html/_static/pygments.css
@@ -0,0 +1,62 @@
+.highlight .hll { background-color: #ffffcc }
+.highlight  { background: #eeffcc; }
+.highlight .c { color: #408090; font-style: italic } /* Comment */
+.highlight .err { border: 1px solid #FF0000 } /* Error */
+.highlight .k { color: #007020; font-weight: bold } /* Keyword */
+.highlight .o { color: #666666 } /* Operator */
+.highlight .cm { color: #408090; font-style: italic } /* Comment.Multiline */
+.highlight .cp { color: #007020 } /* Comment.Preproc */
+.highlight .c1 { color: #408090; font-style: italic } /* Comment.Single */
+.highlight .cs { color: #408090; background-color: #fff0f0 } /* Comment.Special */
+.highlight .gd { color: #A00000 } /* Generic.Deleted */
+.highlight .ge { font-style: italic } /* Generic.Emph */
+.highlight .gr { color: #FF0000 } /* Generic.Error */
+.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+.highlight .gi { color: #00A000 } /* Generic.Inserted */
+.highlight .go { color: #333333 } /* Generic.Output */
+.highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */
+.highlight .gs { font-weight: bold } /* Generic.Strong */
+.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+.highlight .gt { color: #0044DD } /* Generic.Traceback */
+.highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */
+.highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */
+.highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */
+.highlight .kp { color: #007020 } /* Keyword.Pseudo */
+.highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */
+.highlight .kt { color: #902000 } /* Keyword.Type */
+.highlight .m { color: #208050 } /* Literal.Number */
+.highlight .s { color: #4070a0 } /* Literal.String */
+.highlight .na { color: #4070a0 } /* Name.Attribute */
+.highlight .nb { color: #007020 } /* Name.Builtin */
+.highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */
+.highlight .no { color: #60add5 } /* Name.Constant */
+.highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */
+.highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */
+.highlight .ne { color: #007020 } /* Name.Exception */
+.highlight .nf { color: #06287e } /* Name.Function */
+.highlight .nl { color: #002070; font-weight: bold } /* Name.Label */
+.highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */
+.highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */
+.highlight .nv { color: #bb60d5 } /* Name.Variable */
+.highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */
+.highlight .w { color: #bbbbbb } /* Text.Whitespace */
+.highlight .mf { color: #208050 } /* Literal.Number.Float */
+.highlight .mh { color: #208050 } /* Literal.Number.Hex */
+.highlight .mi { color: #208050 } /* Literal.Number.Integer */
+.highlight .mo { color: #208050 } /* Literal.Number.Oct */
+.highlight .sb { color: #4070a0 } /* Literal.String.Backtick */
+.highlight .sc { color: #4070a0 } /* Literal.String.Char */
+.highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */
+.highlight .s2 { color: #4070a0 } /* Literal.String.Double */
+.highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */
+.highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */
+.highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */
+.highlight .sx { color: #c65d09 } /* Literal.String.Other */
+.highlight .sr { color: #235388 } /* Literal.String.Regex */
+.highlight .s1 { color: #4070a0 } /* Literal.String.Single */
+.highlight .ss { color: #517918 } /* Literal.String.Symbol */
+.highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */
+.highlight .vc { color: #bb60d5 } /* Name.Variable.Class */
+.highlight .vg { color: #bb60d5 } /* Name.Variable.Global */
+.highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */
+.highlight .il { color: #208050 } /* Literal.Number.Integer.Long */
\ No newline at end of file
diff --git a/doc/build/html/_static/searchtools.js b/doc/build/html/_static/searchtools.js
new file mode 100644
index 0000000..6e1f06b
--- /dev/null
+++ b/doc/build/html/_static/searchtools.js
@@ -0,0 +1,622 @@
+/*
+ * searchtools.js_t
+ * ~~~~~~~~~~~~~~~~
+ *
+ * Sphinx JavaScript utilties for the full-text search.
+ *
+ * :copyright: Copyright 2007-2014 by the Sphinx team, see AUTHORS.
+ * :license: BSD, see LICENSE for details.
+ *
+ */
+
+
+/**
+ * Porter Stemmer
+ */
+var Stemmer = function() {
+
+  var step2list = {
+    ational: 'ate',
+    tional: 'tion',
+    enci: 'ence',
+    anci: 'ance',
+    izer: 'ize',
+    bli: 'ble',
+    alli: 'al',
+    entli: 'ent',
+    eli: 'e',
+    ousli: 'ous',
+    ization: 'ize',
+    ation: 'ate',
+    ator: 'ate',
+    alism: 'al',
+    iveness: 'ive',
+    fulness: 'ful',
+    ousness: 'ous',
+    aliti: 'al',
+    iviti: 'ive',
+    biliti: 'ble',
+    logi: 'log'
+  };
+
+  var step3list = {
+    icate: 'ic',
+    ative: '',
+    alize: 'al',
+    iciti: 'ic',
+    ical: 'ic',
+    ful: '',
+    ness: ''
+  };
+
+  var c = "[^aeiou]";          // consonant
+  var v = "[aeiouy]";          // vowel
+  var C = c + "[^aeiouy]*";    // consonant sequence
+  var V = v + "[aeiou]*";      // vowel sequence
+
+  var mgr0 = "^(" + C + ")?" + V + C;                      // [C]VC... is m>0
+  var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$";    // [C]VC[V] is m=1
+  var mgr1 = "^(" + C + ")?" + V + C + V + C;              // [C]VCVC... is m>1
+  var s_v   = "^(" + C + ")?" + v;                         // vowel in stem
+
+  this.stemWord = function (w) {
+    var stem;
+    var suffix;
+    var firstch;
+    var origword = w;
+
+    if (w.length < 3)
+      return w;
+
+    var re;
+    var re2;
+    var re3;
+    var re4;
+
+    firstch = w.substr(0,1);
+    if (firstch == "y")
+      w = firstch.toUpperCase() + w.substr(1);
+
+    // Step 1a
+    re = /^(.+?)(ss|i)es$/;
+    re2 = /^(.+?)([^s])s$/;
+
+    if (re.test(w))
+      w = w.replace(re,"$1$2");
+    else if (re2.test(w))
+      w = w.replace(re2,"$1$2");
+
+    // Step 1b
+    re = /^(.+?)eed$/;
+    re2 = /^(.+?)(ed|ing)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      re = new RegExp(mgr0);
+      if (re.test(fp[1])) {
+        re = /.$/;
+        w = w.replace(re,"");
+      }
+    }
+    else if (re2.test(w)) {
+      var fp = re2.exec(w);
+      stem = fp[1];
+      re2 = new RegExp(s_v);
+      if (re2.test(stem)) {
+        w = stem;
+        re2 = /(at|bl|iz)$/;
+        re3 = new RegExp("([^aeiouylsz])\\1$");
+        re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+        if (re2.test(w))
+          w = w + "e";
+        else if (re3.test(w)) {
+          re = /.$/;
+          w = w.replace(re,"");
+        }
+        else if (re4.test(w))
+          w = w + "e";
+      }
+    }
+
+    // Step 1c
+    re = /^(.+?)y$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(s_v);
+      if (re.test(stem))
+        w = stem + "i";
+    }
+
+    // Step 2
+    re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      suffix = fp[2];
+      re = new RegExp(mgr0);
+      if (re.test(stem))
+        w = stem + step2list[suffix];
+    }
+
+    // Step 3
+    re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      suffix = fp[2];
+      re = new RegExp(mgr0);
+      if (re.test(stem))
+        w = stem + step3list[suffix];
+    }
+
+    // Step 4
+    re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
+    re2 = /^(.+?)(s|t)(ion)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(mgr1);
+      if (re.test(stem))
+        w = stem;
+    }
+    else if (re2.test(w)) {
+      var fp = re2.exec(w);
+      stem = fp[1] + fp[2];
+      re2 = new RegExp(mgr1);
+      if (re2.test(stem))
+        w = stem;
+    }
+
+    // Step 5
+    re = /^(.+?)e$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(mgr1);
+      re2 = new RegExp(meq1);
+      re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+      if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
+        w = stem;
+    }
+    re = /ll$/;
+    re2 = new RegExp(mgr1);
+    if (re.test(w) && re2.test(w)) {
+      re = /.$/;
+      w = w.replace(re,"");
+    }
+
+    // and turn initial Y back to y
+    if (firstch == "y")
+      w = firstch.toLowerCase() + w.substr(1);
+    return w;
+  }
+}
+
+
+
+/**
+ * Simple result scoring code.
+ */
+var Scorer = {
+  // Implement the following function to further tweak the score for each result
+  // The function takes a result array [filename, title, anchor, descr, score]
+  // and returns the new score.
+  /*
+  score: function(result) {
+    return result[4];
+  },
+  */
+
+  // query matches the full name of an object
+  objNameMatch: 11,
+  // or matches in the last dotted part of the object name
+  objPartialMatch: 6,
+  // Additive scores depending on the priority of the object
+  objPrio: {0:  15,   // used to be importantResults
+            1:  5,   // used to be objectResults
+            2: -5},  // used to be unimportantResults
+  //  Used when the priority is not in the mapping.
+  objPrioDefault: 0,
+
+  // query found in title
+  title: 15,
+  // query found in terms
+  term: 5
+};
+
+
+/**
+ * Search Module
+ */
+var Search = {
+
+  _index : null,
+  _queued_query : null,
+  _pulse_status : -1,
+
+  init : function() {
+      var params = $.getQueryParameters();
+      if (params.q) {
+          var query = params.q[0];
+          $('input[name="q"]')[0].value = query;
+          this.performSearch(query);
+      }
+  },
+
+  loadIndex : function(url) {
+    $.ajax({type: "GET", url: url, data: null,
+            dataType: "script", cache: true,
+            complete: function(jqxhr, textstatus) {
+              if (textstatus != "success") {
+                document.getElementById("searchindexloader").src = url;
+              }
+            }});
+  },
+
+  setIndex : function(index) {
+    var q;
+    this._index = index;
+    if ((q = this._queued_query) !== null) {
+      this._queued_query = null;
+      Search.query(q);
+    }
+  },
+
+  hasIndex : function() {
+      return this._index !== null;
+  },
+
+  deferQuery : function(query) {
+      this._queued_query = query;
+  },
+
+  stopPulse : function() {
+      this._pulse_status = 0;
+  },
+
+  startPulse : function() {
+    if (this._pulse_status >= 0)
+        return;
+    function pulse() {
+      var i;
+      Search._pulse_status = (Search._pulse_status + 1) % 4;
+      var dotString = '';
+      for (i = 0; i < Search._pulse_status; i++)
+        dotString += '.';
+      Search.dots.text(dotString);
+      if (Search._pulse_status > -1)
+        window.setTimeout(pulse, 500);
+    }
+    pulse();
+  },
+
+  /**
+   * perform a search for something (or wait until index is loaded)
+   */
+  performSearch : function(query) {
+    // create the required interface elements
+    this.out = $('#search-results');
+    this.title = $('<h2>' + _('Searching') + '</h2>').appendTo(this.out);
+    this.dots = $('<span></span>').appendTo(this.title);
+    this.status = $('<p style="display: none"></p>').appendTo(this.out);
+    this.output = $('<ul class="search"/>').appendTo(this.out);
+
+    $('#search-progress').text(_('Preparing search...'));
+    this.startPulse();
+
+    // index already loaded, the browser was quick!
+    if (this.hasIndex())
+      this.query(query);
+    else
+      this.deferQuery(query);
+  },
+
+  /**
+   * execute search (requires search index to be loaded)
+   */
+  query : function(query) {
+    var i;
+    var stopwords = ["a","and","are","as","at","be","but","by","for","if","in","into","is","it","near","no","not","of","on","or","such","that","the","their","then","there","these","they","this","to","was","will","with"];
+
+    // stem the searchterms and add them to the correct list
+    var stemmer = new Stemmer();
+    var searchterms = [];
+    var excluded = [];
+    var hlterms = [];
+    var tmp = query.split(/\s+/);
+    var objectterms = [];
+    for (i = 0; i < tmp.length; i++) {
+      if (tmp[i] !== "") {
+          objectterms.push(tmp[i].toLowerCase());
+      }
+
+      if ($u.indexOf(stopwords, tmp[i].toLowerCase()) != -1 || tmp[i].match(/^\d+$/) ||
+          tmp[i] === "") {
+        // skip this "word"
+        continue;
+      }
+      // stem the word
+      var word = stemmer.stemWord(tmp[i].toLowerCase());
+      var toAppend;
+      // select the correct list
+      if (word[0] == '-') {
+        toAppend = excluded;
+        word = word.substr(1);
+      }
+      else {
+        toAppend = searchterms;
+        hlterms.push(tmp[i].toLowerCase());
+      }
+      // only add if not already in the list
+      if (!$u.contains(toAppend, word))
+        toAppend.push(word);
+    }
+    var highlightstring = '?highlight=' + $.urlencode(hlterms.join(" "));
+
+    // console.debug('SEARCH: searching for:');
+    // console.info('required: ', searchterms);
+    // console.info('excluded: ', excluded);
+
+    // prepare search
+    var terms = this._index.terms;
+    var titleterms = this._index.titleterms;
+
+    // array of [filename, title, anchor, descr, score]
+    var results = [];
+    $('#search-progress').empty();
+
+    // lookup as object
+    for (i = 0; i < objectterms.length; i++) {
+      var others = [].concat(objectterms.slice(0, i),
+                             objectterms.slice(i+1, objectterms.length));
+      results = results.concat(this.performObjectSearch(objectterms[i], others));
+    }
+
+    // lookup as search terms in fulltext
+    results = results.concat(this.performTermsSearch(searchterms, excluded, terms, Scorer.term))
+                     .concat(this.performTermsSearch(searchterms, excluded, titleterms, Scorer.title));
+
+    // let the scorer override scores with a custom scoring function
+    if (Scorer.score) {
+      for (i = 0; i < results.length; i++)
+        results[i][4] = Scorer.score(results[i]);
+    }
+
+    // now sort the results by score (in opposite order of appearance, since the
+    // display function below uses pop() to retrieve items) and then
+    // alphabetically
+    results.sort(function(a, b) {
+      var left = a[4];
+      var right = b[4];
+      if (left > right) {
+        return 1;
+      } else if (left < right) {
+        return -1;
+      } else {
+        // same score: sort alphabetically
+        left = a[1].toLowerCase();
+        right = b[1].toLowerCase();
+        return (left > right) ? -1 : ((left < right) ? 1 : 0);
+      }
+    });
+
+    // for debugging
+    //Search.lastresults = results.slice();  // a copy
+    //console.info('search results:', Search.lastresults);
+
+    // print the results
+    var resultCount = results.length;
+    function displayNextItem() {
+      // results left, load the summary and display it
+      if (results.length) {
+        var item = results.pop();
+        var listItem = $('<li style="display:none"></li>');
+        if (DOCUMENTATION_OPTIONS.FILE_SUFFIX === '') {
+          // dirhtml builder
+          var dirname = item[0] + '/';
+          if (dirname.match(/\/index\/$/)) {
+            dirname = dirname.substring(0, dirname.length-6);
+          } else if (dirname == 'index/') {
+            dirname = '';
+          }
+          listItem.append($('<a/>').attr('href',
+            DOCUMENTATION_OPTIONS.URL_ROOT + dirname +
+            highlightstring + item[2]).html(item[1]));
+        } else {
+          // normal html builders
+          listItem.append($('<a/>').attr('href',
+            item[0] + DOCUMENTATION_OPTIONS.FILE_SUFFIX +
+            highlightstring + item[2]).html(item[1]));
+        }
+        if (item[3]) {
+          listItem.append($('<span> (' + item[3] + ')</span>'));
+          Search.output.append(listItem);
+          listItem.slideDown(5, function() {
+            displayNextItem();
+          });
+        } else if (DOCUMENTATION_OPTIONS.HAS_SOURCE) {
+          $.ajax({url: DOCUMENTATION_OPTIONS.URL_ROOT + '_sources/' + item[0] + '.txt',
+                  dataType: "text",
+                  complete: function(jqxhr, textstatus) {
+                    var data = jqxhr.responseText;
+                    if (data !== '') {
+                      listItem.append(Search.makeSearchSummary(data, searchterms, hlterms));
+                    }
+                    Search.output.append(listItem);
+                    listItem.slideDown(5, function() {
+                      displayNextItem();
+                    });
+                  }});
+        } else {
+          // no source available, just display title
+          Search.output.append(listItem);
+          listItem.slideDown(5, function() {
+            displayNextItem();
+          });
+        }
+      }
+      // search finished, update title and status message
+      else {
+        Search.stopPulse();
+        Search.title.text(_('Search Results'));
+        if (!resultCount)
+          Search.status.text(_('Your search did not match any documents. Please make sure that all words are spelled correctly and that you\'ve selected enough categories.'));
+        else
+            Search.status.text(_('Search finished, found %s page(s) matching the search query.').replace('%s', resultCount));
+        Search.status.fadeIn(500);
+      }
+    }
+    displayNextItem();
+  },
+
+  /**
+   * search for object names
+   */
+  performObjectSearch : function(object, otherterms) {
+    var filenames = this._index.filenames;
+    var objects = this._index.objects;
+    var objnames = this._index.objnames;
+    var titles = this._index.titles;
+
+    var i;
+    var results = [];
+
+    for (var prefix in objects) {
+      for (var name in objects[prefix]) {
+        var fullname = (prefix ? prefix + '.' : '') + name;
+        if (fullname.toLowerCase().indexOf(object) > -1) {
+          var score = 0;
+          var parts = fullname.split('.');
+          // check for different match types: exact matches of full name or
+          // "last name" (i.e. last dotted part)
+          if (fullname == object || parts[parts.length - 1] == object) {
+            score += Scorer.objNameMatch;
+          // matches in last name
+          } else if (parts[parts.length - 1].indexOf(object) > -1) {
+            score += Scorer.objPartialMatch;
+          }
+          var match = objects[prefix][name];
+          var objname = objnames[match[1]][2];
+          var title = titles[match[0]];
+          // If more than one term searched for, we require other words to be
+          // found in the name/title/description
+          if (otherterms.length > 0) {
+            var haystack = (prefix + ' ' + name + ' ' +
+                            objname + ' ' + title).toLowerCase();
+            var allfound = true;
+            for (i = 0; i < otherterms.length; i++) {
+              if (haystack.indexOf(otherterms[i]) == -1) {
+                allfound = false;
+                break;
+              }
+            }
+            if (!allfound) {
+              continue;
+            }
+          }
+          var descr = objname + _(', in ') + title;
+
+          var anchor = match[3];
+          if (anchor === '')
+            anchor = fullname;
+          else if (anchor == '-')
+            anchor = objnames[match[1]][1] + '-' + fullname;
+          // add custom score for some objects according to scorer
+          if (Scorer.objPrio.hasOwnProperty(match[2])) {
+            score += Scorer.objPrio[match[2]];
+          } else {
+            score += Scorer.objPrioDefault;
+          }
+          results.push([filenames[match[0]], fullname, '#'+anchor, descr, score]);
+        }
+      }
+    }
+
+    return results;
+  },
+
+  /**
+   * search for full-text terms in the index
+   */
+  performTermsSearch : function(searchterms, excluded, terms, score) {
+    var filenames = this._index.filenames;
+    var titles = this._index.titles;
+
+    var i, j, file, files;
+    var fileMap = {};
+    var results = [];
+
+    // perform the search on the required terms
+    for (i = 0; i < searchterms.length; i++) {
+      var word = searchterms[i];
+      // no match but word was a required one
+      if ((files = terms[word]) === undefined)
+        break;
+      if (files.length === undefined) {
+        files = [files];
+      }
+      // create the mapping
+      for (j = 0; j < files.length; j++) {
+        file = files[j];
+        if (file in fileMap)
+          fileMap[file].push(word);
+        else
+          fileMap[file] = [word];
+      }
+    }
+
+    // now check if the files don't contain excluded terms
+    for (file in fileMap) {
+      var valid = true;
+
+      // check if all requirements are matched
+      if (fileMap[file].length != searchterms.length)
+          continue;
+
+      // ensure that none of the excluded terms is in the search result
+      for (i = 0; i < excluded.length; i++) {
+        if (terms[excluded[i]] == file ||
+          $u.contains(terms[excluded[i]] || [], file)) {
+          valid = false;
+          break;
+        }
+      }
+
+      // if we have still a valid result we can add it to the result list
+      if (valid) {
+        results.push([filenames[file], titles[file], '', null, score]);
+      }
+    }
+    return results;
+  },
+
+  /**
+   * helper function to return a node containing the
+   * search summary for a given text. keywords is a list
+   * of stemmed words, hlwords is the list of normal, unstemmed
+   * words. the first one is used to find the occurance, the
+   * latter for highlighting it.
+   */
+  makeSearchSummary : function(text, keywords, hlwords) {
+    var textLower = text.toLowerCase();
+    var start = 0;
+    $.each(keywords, function() {
+      var i = textLower.indexOf(this.toLowerCase());
+      if (i > -1)
+        start = i;
+    });
+    start = Math.max(start - 120, 0);
+    var excerpt = ((start > 0) ? '...' : '') +
+      $.trim(text.substr(start, 240)) +
+      ((start + 240 - text.length) ? '...' : '');
+    var rv = $('<div class="context"></div>').text(excerpt);
+    $.each(hlwords, function() {
+      rv = rv.highlightText(this, 'highlighted');
+    });
+    return rv;
+  }
+};
+
+$(document).ready(function() {
+  Search.init();
+});
\ No newline at end of file
diff --git a/doc/build/html/_static/sidebar.js b/doc/build/html/_static/sidebar.js
new file mode 100644
index 0000000..4f09a0d
--- /dev/null
+++ b/doc/build/html/_static/sidebar.js
@@ -0,0 +1,159 @@
+/*
+ * sidebar.js
+ * ~~~~~~~~~~
+ *
+ * This script makes the Sphinx sidebar collapsible.
+ *
+ * .sphinxsidebar contains .sphinxsidebarwrapper.  This script adds
+ * in .sphixsidebar, after .sphinxsidebarwrapper, the #sidebarbutton
+ * used to collapse and expand the sidebar.
+ *
+ * When the sidebar is collapsed the .sphinxsidebarwrapper is hidden
+ * and the width of the sidebar and the margin-left of the document
+ * are decreased. When the sidebar is expanded the opposite happens.
+ * This script saves a per-browser/per-session cookie used to
+ * remember the position of the sidebar among the pages.
+ * Once the browser is closed the cookie is deleted and the position
+ * reset to the default (expanded).
+ *
+ * :copyright: Copyright 2007-2014 by the Sphinx team, see AUTHORS.
+ * :license: BSD, see LICENSE for details.
+ *
+ */
+
+$(function() {
+  
+  
+  
+  
+  
+  
+  
+
+  // global elements used by the functions.
+  // the 'sidebarbutton' element is defined as global after its
+  // creation, in the add_sidebar_button function
+  var bodywrapper = $('.bodywrapper');
+  var sidebar = $('.sphinxsidebar');
+  var sidebarwrapper = $('.sphinxsidebarwrapper');
+
+  // for some reason, the document has no sidebar; do not run into errors
+  if (!sidebar.length) return;
+
+  // original margin-left of the bodywrapper and width of the sidebar
+  // with the sidebar expanded
+  var bw_margin_expanded = bodywrapper.css('margin-left');
+  var ssb_width_expanded = sidebar.width();
+
+  // margin-left of the bodywrapper and width of the sidebar
+  // with the sidebar collapsed
+  var bw_margin_collapsed = '.8em';
+  var ssb_width_collapsed = '.8em';
+
+  // colors used by the current theme
+  var dark_color = $('.related').css('background-color');
+  var light_color = $('.document').css('background-color');
+
+  function sidebar_is_collapsed() {
+    return sidebarwrapper.is(':not(:visible)');
+  }
+
+  function toggle_sidebar() {
+    if (sidebar_is_collapsed())
+      expand_sidebar();
+    else
+      collapse_sidebar();
+  }
+
+  function collapse_sidebar() {
+    sidebarwrapper.hide();
+    sidebar.css('width', ssb_width_collapsed);
+    bodywrapper.css('margin-left', bw_margin_collapsed);
+    sidebarbutton.css({
+        'margin-left': '0',
+        'height': bodywrapper.height()
+    });
+    sidebarbutton.find('span').text('»');
+    sidebarbutton.attr('title', _('Expand sidebar'));
+    document.cookie = 'sidebar=collapsed';
+  }
+
+  function expand_sidebar() {
+    bodywrapper.css('margin-left', bw_margin_expanded);
+    sidebar.css('width', ssb_width_expanded);
+    sidebarwrapper.show();
+    sidebarbutton.css({
+        'margin-left': ssb_width_expanded-12,
+        'height': bodywrapper.height()
+    });
+    sidebarbutton.find('span').text('«');
+    sidebarbutton.attr('title', _('Collapse sidebar'));
+    document.cookie = 'sidebar=expanded';
+  }
+
+  function add_sidebar_button() {
+    sidebarwrapper.css({
+        'float': 'left',
+        'margin-right': '0',
+        'width': ssb_width_expanded - 28
+    });
+    // create the button
+    sidebar.append(
+        '<div id="sidebarbutton"><span>«</span></div>'
+    );
+    var sidebarbutton = $('#sidebarbutton');
+    light_color = sidebarbutton.css('background-color');
+    // find the height of the viewport to center the '<<' in the page
+    var viewport_height;
+    if (window.innerHeight)
+ 	  viewport_height = window.innerHeight;
+    else
+	  viewport_height = $(window).height();
+    sidebarbutton.find('span').css({
+        'display': 'block',
+        'margin-top': (viewport_height - sidebar.position().top - 20) / 2
+    });
+
+    sidebarbutton.click(toggle_sidebar);
+    sidebarbutton.attr('title', _('Collapse sidebar'));
+    sidebarbutton.css({
+        'color': '#FFFFFF',
+        'border-left': '1px solid ' + dark_color,
+        'font-size': '1.2em',
+        'cursor': 'pointer',
+        'height': bodywrapper.height(),
+        'padding-top': '1px',
+        'margin-left': ssb_width_expanded - 12
+    });
+
+    sidebarbutton.hover(
+      function () {
+          $(this).css('background-color', dark_color);
+      },
+      function () {
+          $(this).css('background-color', light_color);
+      }
+    );
+  }
+
+  function set_position_from_cookie() {
+    if (!document.cookie)
+      return;
+    var items = document.cookie.split(';');
+    for(var k=0; k<items.length; k++) {
+      var key_val = items[k].split('=');
+      var key = key_val[0].replace(/ /, "");  // strip leading spaces
+      if (key == 'sidebar') {
+        var value = key_val[1];
+        if ((value == 'collapsed') && (!sidebar_is_collapsed()))
+          collapse_sidebar();
+        else if ((value == 'expanded') && (sidebar_is_collapsed()))
+          expand_sidebar();
+      }
+    }
+  }
+
+  add_sidebar_button();
+  var sidebarbutton = $('#sidebarbutton');
+  set_position_from_cookie();
+});
\ No newline at end of file
diff --git a/doc/build/html/_static/underscore.js b/doc/build/html/_static/underscore.js
new file mode 100644
index 0000000..5b55f32
--- /dev/null
+++ b/doc/build/html/_static/underscore.js
@@ -0,0 +1,31 @@
+// Underscore.js 1.3.1
+// (c) 2009-2012 Jeremy Ashkenas, DocumentCloud Inc.
+// Underscore is freely distributable under the MIT license.
+// Portions of Underscore are inspired or borrowed from Prototype,
+// Oliver Steele's Functional, and John Resig's Micro-Templating.
+// For all details and documentation:
+// http://documentcloud.github.com/underscore
+(function(){function q(a,c,d){if(a===c)return a!==0||1/a==1/c;if(a==null||c==null)return a===c;if(a._chain)a=a._wrapped;if(c._chain)c=c._wrapped;if(a.isEqual&&b.isFunction(a.isEqual))return a.isEqual(c);if(c.isEqual&&b.isFunction(c.isEqual))return c.isEqual(a);var e=l.call(a);if(e!=l.call(c))return false;switch(e){case "[object String]":return a==String(c);case "[object Number]":return a!=+a?c!=+c:a==0?1/a==1/c:a==+c;case "[object Date]":case "[object Boolean]":return+a==+c;case "[object [...]
+c.source&&a.global==c.global&&a.multiline==c.multiline&&a.ignoreCase==c.ignoreCase}if(typeof a!="object"||typeof c!="object")return false;for(var f=d.length;f--;)if(d[f]==a)return true;d.push(a);var f=0,g=true;if(e=="[object Array]"){if(f=a.length,g=f==c.length)for(;f--;)if(!(g=f in a==f in c&&q(a[f],c[f],d)))break}else{if("constructor"in a!="constructor"in c||a.constructor!=c.constructor)return false;for(var h in a)if(b.has(a,h)&&(f++,!(g=b.has(c,h)&&q(a[h],c[h],d))))break;if(g){for(h i [...]
+h)&&!f--)break;g=!f}}d.pop();return g}var r=this,G=r._,n={},k=Array.prototype,o=Object.prototype,i=k.slice,H=k.unshift,l=o.toString,I=o.hasOwnProperty,w=k.forEach,x=k.map,y=k.reduce,z=k.reduceRight,A=k.filter,B=k.every,C=k.some,p=k.indexOf,D=k.lastIndexOf,o=Array.isArray,J=Object.keys,s=Function.prototype.bind,b=function(a){return new m(a)};if(typeof exports!=="undefined"){if(typeof module!=="undefined"&&module.exports)exports=module.exports=b;exports._=b}else r._=b;b.VERSION="1.3.1";var [...]
+b.forEach=function(a,c,d){if(a!=null)if(w&&a.forEach===w)a.forEach(c,d);else if(a.length===+a.length)for(var e=0,f=a.length;e<f;e++){if(e in a&&c.call(d,a[e],e,a)===n)break}else for(e in a)if(b.has(a,e)&&c.call(d,a[e],e,a)===n)break};b.map=b.collect=function(a,c,b){var e=[];if(a==null)return e;if(x&&a.map===x)return a.map(c,b);j(a,function(a,g,h){e[e.length]=c.call(b,a,g,h)});if(a.length===+a.length)e.length=a.length;return e};b.reduce=b.foldl=b.inject=function(a,c,d,e){var f=arguments.l [...]
+null&&(a=[]);if(y&&a.reduce===y)return e&&(c=b.bind(c,e)),f?a.reduce(c,d):a.reduce(c);j(a,function(a,b,i){f?d=c.call(e,d,a,b,i):(d=a,f=true)});if(!f)throw new TypeError("Reduce of empty array with no initial value");return d};b.reduceRight=b.foldr=function(a,c,d,e){var f=arguments.length>2;a==null&&(a=[]);if(z&&a.reduceRight===z)return e&&(c=b.bind(c,e)),f?a.reduceRight(c,d):a.reduceRight(c);var g=b.toArray(a).reverse();e&&!f&&(c=b.bind(c,e));return f?b.reduce(g,c,d,e):b.reduce(g,c)};b.f [...]
+function(a,c,b){var e;E(a,function(a,g,h){if(c.call(b,a,g,h))return e=a,true});return e};b.filter=b.select=function(a,c,b){var e=[];if(a==null)return e;if(A&&a.filter===A)return a.filter(c,b);j(a,function(a,g,h){c.call(b,a,g,h)&&(e[e.length]=a)});return e};b.reject=function(a,c,b){var e=[];if(a==null)return e;j(a,function(a,g,h){c.call(b,a,g,h)||(e[e.length]=a)});return e};b.every=b.all=function(a,c,b){var e=true;if(a==null)return e;if(B&&a.every===B)return a.every(c,b);j(a,function(a,g, [...]
+e&&c.call(b,a,g,h)))return n});return e};var E=b.some=b.any=function(a,c,d){c||(c=b.identity);var e=false;if(a==null)return e;if(C&&a.some===C)return a.some(c,d);j(a,function(a,b,h){if(e||(e=c.call(d,a,b,h)))return n});return!!e};b.include=b.contains=function(a,c){var b=false;if(a==null)return b;return p&&a.indexOf===p?a.indexOf(c)!=-1:b=E(a,function(a){return a===c})};b.invoke=function(a,c){var d=i.call(arguments,2);return b.map(a,function(a){return(b.isFunction(c)?c||a:a[c]).apply(a,d) [...]
+function(a,c){return b.map(a,function(a){return a[c]})};b.max=function(a,c,d){if(!c&&b.isArray(a))return Math.max.apply(Math,a);if(!c&&b.isEmpty(a))return-Infinity;var e={computed:-Infinity};j(a,function(a,b,h){b=c?c.call(d,a,b,h):a;b>=e.computed&&(e={value:a,computed:b})});return e.value};b.min=function(a,c,d){if(!c&&b.isArray(a))return Math.min.apply(Math,a);if(!c&&b.isEmpty(a))return Infinity;var e={computed:Infinity};j(a,function(a,b,h){b=c?c.call(d,a,b,h):a;b<e.computed&&(e={value:a [...]
+return e.value};b.shuffle=function(a){var b=[],d;j(a,function(a,f){f==0?b[0]=a:(d=Math.floor(Math.random()*(f+1)),b[f]=b[d],b[d]=a)});return b};b.sortBy=function(a,c,d){return b.pluck(b.map(a,function(a,b,g){return{value:a,criteria:c.call(d,a,b,g)}}).sort(function(a,b){var c=a.criteria,d=b.criteria;return c<d?-1:c>d?1:0}),"value")};b.groupBy=function(a,c){var d={},e=b.isFunction(c)?c:function(a){return a[c]};j(a,function(a,b){var c=e(a,b);(d[c]||(d[c]=[])).push(a)});return d};b.sortedInd [...]
+c,d){d||(d=b.identity);for(var e=0,f=a.length;e<f;){var g=e+f>>1;d(a[g])<d(c)?e=g+1:f=g}return e};b.toArray=function(a){return!a?[]:a.toArray?a.toArray():b.isArray(a)?i.call(a):b.isArguments(a)?i.call(a):b.values(a)};b.size=function(a){return b.toArray(a).length};b.first=b.head=function(a,b,d){return b!=null&&!d?i.call(a,0,b):a[0]};b.initial=function(a,b,d){return i.call(a,0,a.length-(b==null||d?1:b))};b.last=function(a,b,d){return b!=null&&!d?i.call(a,Math.max(a.length-b,0)):a[a.length- [...]
+b.tail=function(a,b,d){return i.call(a,b==null||d?1:b)};b.compact=function(a){return b.filter(a,function(a){return!!a})};b.flatten=function(a,c){return b.reduce(a,function(a,e){if(b.isArray(e))return a.concat(c?e:b.flatten(e));a[a.length]=e;return a},[])};b.without=function(a){return b.difference(a,i.call(arguments,1))};b.uniq=b.unique=function(a,c,d){var d=d?b.map(a,d):a,e=[];b.reduce(d,function(d,g,h){if(0==h||(c===true?b.last(d)!=g:!b.include(d,g)))d[d.length]=g,e[e.length]=a[h];retur [...]
+return e};b.union=function(){return b.uniq(b.flatten(arguments,true))};b.intersection=b.intersect=function(a){var c=i.call(arguments,1);return b.filter(b.uniq(a),function(a){return b.every(c,function(c){return b.indexOf(c,a)>=0})})};b.difference=function(a){var c=b.flatten(i.call(arguments,1));return b.filter(a,function(a){return!b.include(c,a)})};b.zip=function(){for(var a=i.call(arguments),c=b.max(b.pluck(a,"length")),d=Array(c),e=0;e<c;e++)d[e]=b.pluck(a,""+e);return d};b.indexOf=func [...]
+d){if(a==null)return-1;var e;if(d)return d=b.sortedIndex(a,c),a[d]===c?d:-1;if(p&&a.indexOf===p)return a.indexOf(c);for(d=0,e=a.length;d<e;d++)if(d in a&&a[d]===c)return d;return-1};b.lastIndexOf=function(a,b){if(a==null)return-1;if(D&&a.lastIndexOf===D)return a.lastIndexOf(b);for(var d=a.length;d--;)if(d in a&&a[d]===b)return d;return-1};b.range=function(a,b,d){arguments.length<=1&&(b=a||0,a=0);for(var d=arguments[2]||1,e=Math.max(Math.ceil((b-a)/d),0),f=0,g=Array(e);f<e;)g[f++]=a,a+=d; [...]
+var F=function(){};b.bind=function(a,c){var d,e;if(a.bind===s&&s)return s.apply(a,i.call(arguments,1));if(!b.isFunction(a))throw new TypeError;e=i.call(arguments,2);return d=function(){if(!(this instanceof d))return a.apply(c,e.concat(i.call(arguments)));F.prototype=a.prototype;var b=new F,g=a.apply(b,e.concat(i.call(arguments)));return Object(g)===g?g:b}};b.bindAll=function(a){var c=i.call(arguments,1);c.length==0&&(c=b.functions(a));j(c,function(c){a[c]=b.bind(a[c],a)});return a};b.mem [...]
+c){var d={};c||(c=b.identity);return function(){var e=c.apply(this,arguments);return b.has(d,e)?d[e]:d[e]=a.apply(this,arguments)}};b.delay=function(a,b){var d=i.call(arguments,2);return setTimeout(function(){return a.apply(a,d)},b)};b.defer=function(a){return b.delay.apply(b,[a,1].concat(i.call(arguments,1)))};b.throttle=function(a,c){var d,e,f,g,h,i=b.debounce(function(){h=g=false},c);return function(){d=this;e=arguments;var b;f||(f=setTimeout(function(){f=null;h&&a.apply(d,e);i()},c)) [...]
+a.apply(d,e);i();g=true}};b.debounce=function(a,b){var d;return function(){var e=this,f=arguments;clearTimeout(d);d=setTimeout(function(){d=null;a.apply(e,f)},b)}};b.once=function(a){var b=false,d;return function(){if(b)return d;b=true;return d=a.apply(this,arguments)}};b.wrap=function(a,b){return function(){var d=[a].concat(i.call(arguments,0));return b.apply(this,d)}};b.compose=function(){var a=arguments;return function(){for(var b=arguments,d=a.length-1;d>=0;d--)b=[a[d].apply(this,b)] [...]
+b.after=function(a,b){return a<=0?b():function(){if(--a<1)return b.apply(this,arguments)}};b.keys=J||function(a){if(a!==Object(a))throw new TypeError("Invalid object");var c=[],d;for(d in a)b.has(a,d)&&(c[c.length]=d);return c};b.values=function(a){return b.map(a,b.identity)};b.functions=b.methods=function(a){var c=[],d;for(d in a)b.isFunction(a[d])&&c.push(d);return c.sort()};b.extend=function(a){j(i.call(arguments,1),function(b){for(var d in b)a[d]=b[d]});return a};b.defaults=function( [...]
+1),function(b){for(var d in b)a[d]==null&&(a[d]=b[d])});return a};b.clone=function(a){return!b.isObject(a)?a:b.isArray(a)?a.slice():b.extend({},a)};b.tap=function(a,b){b(a);return a};b.isEqual=function(a,b){return q(a,b,[])};b.isEmpty=function(a){if(b.isArray(a)||b.isString(a))return a.length===0;for(var c in a)if(b.has(a,c))return false;return true};b.isElement=function(a){return!!(a&&a.nodeType==1)};b.isArray=o||function(a){return l.call(a)=="[object Array]"};b.isObject=function(a){ret [...]
+b.isArguments=function(a){return l.call(a)=="[object Arguments]"};if(!b.isArguments(arguments))b.isArguments=function(a){return!(!a||!b.has(a,"callee"))};b.isFunction=function(a){return l.call(a)=="[object Function]"};b.isString=function(a){return l.call(a)=="[object String]"};b.isNumber=function(a){return l.call(a)=="[object Number]"};b.isNaN=function(a){return a!==a};b.isBoolean=function(a){return a===true||a===false||l.call(a)=="[object Boolean]"};b.isDate=function(a){return l.call(a) [...]
+b.isRegExp=function(a){return l.call(a)=="[object RegExp]"};b.isNull=function(a){return a===null};b.isUndefined=function(a){return a===void 0};b.has=function(a,b){return I.call(a,b)};b.noConflict=function(){r._=G;return this};b.identity=function(a){return a};b.times=function(a,b,d){for(var e=0;e<a;e++)b.call(d,e)};b.escape=function(a){return(""+a).replace(/&/g,"&").replace(/</g,"<").replace(/>/g,">").replace(/"/g,""").replace(/'/g,"&#x27;").replace(/\//g,"&#x2F;")};b.mixin [...]
+function(c){K(c,b[c]=a[c])})};var L=0;b.uniqueId=function(a){var b=L++;return a?a+b:b};b.templateSettings={evaluate:/<%([\s\S]+?)%>/g,interpolate:/<%=([\s\S]+?)%>/g,escape:/<%-([\s\S]+?)%>/g};var t=/.^/,u=function(a){return a.replace(/\\\\/g,"\\").replace(/\\'/g,"'")};b.template=function(a,c){var d=b.templateSettings,d="var __p=[],print=function(){__p.push.apply(__p,arguments);};with(obj||{}){__p.push('"+a.replace(/\\/g,"\\\\").replace(/'/g,"\\'").replace(d.escape||t,function(a,b){return [...]
+u(b)+"),'"}).replace(d.interpolate||t,function(a,b){return"',"+u(b)+",'"}).replace(d.evaluate||t,function(a,b){return"');"+u(b).replace(/[\r\n\t]/g," ")+";__p.push('"}).replace(/\r/g,"\\r").replace(/\n/g,"\\n").replace(/\t/g,"\\t")+"');}return __p.join('');",e=new Function("obj","_",d);return c?e(c,b):function(a){return e.call(this,a,b)}};b.chain=function(a){return b(a).chain()};var m=function(a){this._wrapped=a};b.prototype=m.prototype;var v=function(a,c){return c?b(a).chain():a},K=func [...]
+function(){var a=i.call(arguments);H.call(a,this._wrapped);return v(c.apply(b,a),this._chain)}};b.mixin(b);j("pop,push,reverse,shift,sort,splice,unshift".split(","),function(a){var b=k[a];m.prototype[a]=function(){var d=this._wrapped;b.apply(d,arguments);var e=d.length;(a=="shift"||a=="splice")&&e===0&&delete d[0];return v(d,this._chain)}});j(["concat","join","slice"],function(a){var b=k[a];m.prototype[a]=function(){return v(b.apply(this._wrapped,arguments),this._chain)}});m.prototype.ch [...]
+true;return this};m.prototype.value=function(){return this._wrapped}}).call(this);
diff --git a/doc/build/html/_static/up-pressed.png b/doc/build/html/_static/up-pressed.png
new file mode 100644
index 0000000..8bd587a
Binary files /dev/null and b/doc/build/html/_static/up-pressed.png differ
diff --git a/doc/build/html/_static/up.png b/doc/build/html/_static/up.png
new file mode 100644
index 0000000..b946256
Binary files /dev/null and b/doc/build/html/_static/up.png differ
diff --git a/doc/build/html/_static/websupport.js b/doc/build/html/_static/websupport.js
new file mode 100644
index 0000000..71c0a13
--- /dev/null
+++ b/doc/build/html/_static/websupport.js
@@ -0,0 +1,808 @@
+/*
+ * websupport.js
+ * ~~~~~~~~~~~~~
+ *
+ * sphinx.websupport utilties for all documentation.
+ *
+ * :copyright: Copyright 2007-2014 by the Sphinx team, see AUTHORS.
+ * :license: BSD, see LICENSE for details.
+ *
+ */
+
+(function($) {
+  $.fn.autogrow = function() {
+    return this.each(function() {
+    var textarea = this;
+
+    $.fn.autogrow.resize(textarea);
+
+    $(textarea)
+      .focus(function() {
+        textarea.interval = setInterval(function() {
+          $.fn.autogrow.resize(textarea);
+        }, 500);
+      })
+      .blur(function() {
+        clearInterval(textarea.interval);
+      });
+    });
+  };
+
+  $.fn.autogrow.resize = function(textarea) {
+    var lineHeight = parseInt($(textarea).css('line-height'), 10);
+    var lines = textarea.value.split('\n');
+    var columns = textarea.cols;
+    var lineCount = 0;
+    $.each(lines, function() {
+      lineCount += Math.ceil(this.length / columns) || 1;
+    });
+    var height = lineHeight * (lineCount + 1);
+    $(textarea).css('height', height);
+  };
+})(jQuery);
+
+(function($) {
+  var comp, by;
+
+  function init() {
+    initEvents();
+    initComparator();
+  }
+
+  function initEvents() {
+    $('a.comment-close').live("click", function(event) {
+      event.preventDefault();
+      hide($(this).attr('id').substring(2));
+    });
+    $('a.vote').live("click", function(event) {
+      event.preventDefault();
+      handleVote($(this));
+    });
+    $('a.reply').live("click", function(event) {
+      event.preventDefault();
+      openReply($(this).attr('id').substring(2));
+    });
+    $('a.close-reply').live("click", function(event) {
+      event.preventDefault();
+      closeReply($(this).attr('id').substring(2));
+    });
+    $('a.sort-option').live("click", function(event) {
+      event.preventDefault();
+      handleReSort($(this));
+    });
+    $('a.show-proposal').live("click", function(event) {
+      event.preventDefault();
+      showProposal($(this).attr('id').substring(2));
+    });
+    $('a.hide-proposal').live("click", function(event) {
+      event.preventDefault();
+      hideProposal($(this).attr('id').substring(2));
+    });
+    $('a.show-propose-change').live("click", function(event) {
+      event.preventDefault();
+      showProposeChange($(this).attr('id').substring(2));
+    });
+    $('a.hide-propose-change').live("click", function(event) {
+      event.preventDefault();
+      hideProposeChange($(this).attr('id').substring(2));
+    });
+    $('a.accept-comment').live("click", function(event) {
+      event.preventDefault();
+      acceptComment($(this).attr('id').substring(2));
+    });
+    $('a.delete-comment').live("click", function(event) {
+      event.preventDefault();
+      deleteComment($(this).attr('id').substring(2));
+    });
+    $('a.comment-markup').live("click", function(event) {
+      event.preventDefault();
+      toggleCommentMarkupBox($(this).attr('id').substring(2));
+    });
+  }
+
+  /**
+   * Set comp, which is a comparator function used for sorting and
+   * inserting comments into the list.
+   */
+  function setComparator() {
+    // If the first three letters are "asc", sort in ascending order
+    // and remove the prefix.
+    if (by.substring(0,3) == 'asc') {
+      var i = by.substring(3);
+      comp = function(a, b) { return a[i] - b[i]; };
+    } else {
+      // Otherwise sort in descending order.
+      comp = function(a, b) { return b[by] - a[by]; };
+    }
+
+    // Reset link styles and format the selected sort option.
+    $('a.sel').attr('href', '#').removeClass('sel');
+    $('a.by' + by).removeAttr('href').addClass('sel');
+  }
+
+  /**
+   * Create a comp function. If the user has preferences stored in
+   * the sortBy cookie, use those, otherwise use the default.
+   */
+  function initComparator() {
+    by = 'rating'; // Default to sort by rating.
+    // If the sortBy cookie is set, use that instead.
+    if (document.cookie.length > 0) {
+      var start = document.cookie.indexOf('sortBy=');
+      if (start != -1) {
+        start = start + 7;
+        var end = document.cookie.indexOf(";", start);
+        if (end == -1) {
+          end = document.cookie.length;
+          by = unescape(document.cookie.substring(start, end));
+        }
+      }
+    }
+    setComparator();
+  }
+
+  /**
+   * Show a comment div.
+   */
+  function show(id) {
+    $('#ao' + id).hide();
+    $('#ah' + id).show();
+    var context = $.extend({id: id}, opts);
+    var popup = $(renderTemplate(popupTemplate, context)).hide();
+    popup.find('textarea[name="proposal"]').hide();
+    popup.find('a.by' + by).addClass('sel');
+    var form = popup.find('#cf' + id);
+    form.submit(function(event) {
+      event.preventDefault();
+      addComment(form);
+    });
+    $('#s' + id).after(popup);
+    popup.slideDown('fast', function() {
+      getComments(id);
+    });
+  }
+
+  /**
+   * Hide a comment div.
+   */
+  function hide(id) {
+    $('#ah' + id).hide();
+    $('#ao' + id).show();
+    var div = $('#sc' + id);
+    div.slideUp('fast', function() {
+      div.remove();
+    });
+  }
+
+  /**
+   * Perform an ajax request to get comments for a node
+   * and insert the comments into the comments tree.
+   */
+  function getComments(id) {
+    $.ajax({
+     type: 'GET',
+     url: opts.getCommentsURL,
+     data: {node: id},
+     success: function(data, textStatus, request) {
+       var ul = $('#cl' + id);
+       var speed = 100;
+       $('#cf' + id)
+         .find('textarea[name="proposal"]')
+         .data('source', data.source);
+
+       if (data.comments.length === 0) {
+         ul.html('<li>No comments yet.</li>');
+         ul.data('empty', true);
+       } else {
+         // If there are comments, sort them and put them in the list.
+         var comments = sortComments(data.comments);
+         speed = data.comments.length * 100;
+         appendComments(comments, ul);
+         ul.data('empty', false);
+       }
+       $('#cn' + id).slideUp(speed + 200);
+       ul.slideDown(speed);
+     },
+     error: function(request, textStatus, error) {
+       showError('Oops, there was a problem retrieving the comments.');
+     },
+     dataType: 'json'
+    });
+  }
+
+  /**
+   * Add a comment via ajax and insert the comment into the comment tree.
+   */
+  function addComment(form) {
+    var node_id = form.find('input[name="node"]').val();
+    var parent_id = form.find('input[name="parent"]').val();
+    var text = form.find('textarea[name="comment"]').val();
+    var proposal = form.find('textarea[name="proposal"]').val();
+
+    if (text == '') {
+      showError('Please enter a comment.');
+      return;
+    }
+
+    // Disable the form that is being submitted.
+    form.find('textarea,input').attr('disabled', 'disabled');
+
+    // Send the comment to the server.
+    $.ajax({
+      type: "POST",
+      url: opts.addCommentURL,
+      dataType: 'json',
+      data: {
+        node: node_id,
+        parent: parent_id,
+        text: text,
+        proposal: proposal
+      },
+      success: function(data, textStatus, error) {
+        // Reset the form.
+        if (node_id) {
+          hideProposeChange(node_id);
+        }
+        form.find('textarea')
+          .val('')
+          .add(form.find('input'))
+          .removeAttr('disabled');
+	var ul = $('#cl' + (node_id || parent_id));
+        if (ul.data('empty')) {
+          $(ul).empty();
+          ul.data('empty', false);
+        }
+        insertComment(data.comment);
+        var ao = $('#ao' + node_id);
+        ao.find('img').attr({'src': opts.commentBrightImage});
+        if (node_id) {
+          // if this was a "root" comment, remove the commenting box
+          // (the user can get it back by reopening the comment popup)
+          $('#ca' + node_id).slideUp();
+        }
+      },
+      error: function(request, textStatus, error) {
+        form.find('textarea,input').removeAttr('disabled');
+        showError('Oops, there was a problem adding the comment.');
+      }
+    });
+  }
+
+  /**
+   * Recursively append comments to the main comment list and children
+   * lists, creating the comment tree.
+   */
+  function appendComments(comments, ul) {
+    $.each(comments, function() {
+      var div = createCommentDiv(this);
+      ul.append($(document.createElement('li')).html(div));
+      appendComments(this.children, div.find('ul.comment-children'));
+      // To avoid stagnating data, don't store the comments children in data.
+      this.children = null;
+      div.data('comment', this);
+    });
+  }
+
+  /**
+   * After adding a new comment, it must be inserted in the correct
+   * location in the comment tree.
+   */
+  function insertComment(comment) {
+    var div = createCommentDiv(comment);
+
+    // To avoid stagnating data, don't store the comments children in data.
+    comment.children = null;
+    div.data('comment', comment);
+
+    var ul = $('#cl' + (comment.node || comment.parent));
+    var siblings = getChildren(ul);
+
+    var li = $(document.createElement('li'));
+    li.hide();
+
+    // Determine where in the parents children list to insert this comment.
+    for(i=0; i < siblings.length; i++) {
+      if (comp(comment, siblings[i]) <= 0) {
+        $('#cd' + siblings[i].id)
+          .parent()
+          .before(li.html(div));
+        li.slideDown('fast');
+        return;
+      }
+    }
+
+    // If we get here, this comment rates lower than all the others,
+    // or it is the only comment in the list.
+    ul.append(li.html(div));
+    li.slideDown('fast');
+  }
+
+  function acceptComment(id) {
+    $.ajax({
+      type: 'POST',
+      url: opts.acceptCommentURL,
+      data: {id: id},
+      success: function(data, textStatus, request) {
+        $('#cm' + id).fadeOut('fast');
+        $('#cd' + id).removeClass('moderate');
+      },
+      error: function(request, textStatus, error) {
+        showError('Oops, there was a problem accepting the comment.');
+      }
+    });
+  }
+
+  function deleteComment(id) {
+    $.ajax({
+      type: 'POST',
+      url: opts.deleteCommentURL,
+      data: {id: id},
+      success: function(data, textStatus, request) {
+        var div = $('#cd' + id);
+        if (data == 'delete') {
+          // Moderator mode: remove the comment and all children immediately
+          div.slideUp('fast', function() {
+            div.remove();
+          });
+          return;
+        }
+        // User mode: only mark the comment as deleted
+        div
+          .find('span.user-id:first')
+          .text('[deleted]').end()
+          .find('div.comment-text:first')
+          .text('[deleted]').end()
+          .find('#cm' + id + ', #dc' + id + ', #ac' + id + ', #rc' + id +
+                ', #sp' + id + ', #hp' + id + ', #cr' + id + ', #rl' + id)
+          .remove();
+        var comment = div.data('comment');
+        comment.username = '[deleted]';
+        comment.text = '[deleted]';
+        div.data('comment', comment);
+      },
+      error: function(request, textStatus, error) {
+        showError('Oops, there was a problem deleting the comment.');
+      }
+    });
+  }
+
+  function showProposal(id) {
+    $('#sp' + id).hide();
+    $('#hp' + id).show();
+    $('#pr' + id).slideDown('fast');
+  }
+
+  function hideProposal(id) {
+    $('#hp' + id).hide();
+    $('#sp' + id).show();
+    $('#pr' + id).slideUp('fast');
+  }
+
+  function showProposeChange(id) {
+    $('#pc' + id).hide();
+    $('#hc' + id).show();
+    var textarea = $('#pt' + id);
+    textarea.val(textarea.data('source'));
+    $.fn.autogrow.resize(textarea[0]);
+    textarea.slideDown('fast');
+  }
+
+  function hideProposeChange(id) {
+    $('#hc' + id).hide();
+    $('#pc' + id).show();
+    var textarea = $('#pt' + id);
+    textarea.val('').removeAttr('disabled');
+    textarea.slideUp('fast');
+  }
+
+  function toggleCommentMarkupBox(id) {
+    $('#mb' + id).toggle();
+  }
+
+  /** Handle when the user clicks on a sort by link. */
+  function handleReSort(link) {
+    var classes = link.attr('class').split(/\s+/);
+    for (var i=0; i<classes.length; i++) {
+      if (classes[i] != 'sort-option') {
+	by = classes[i].substring(2);
+      }
+    }
+    setComparator();
+    // Save/update the sortBy cookie.
+    var expiration = new Date();
+    expiration.setDate(expiration.getDate() + 365);
+    document.cookie= 'sortBy=' + escape(by) +
+                     ';expires=' + expiration.toUTCString();
+    $('ul.comment-ul').each(function(index, ul) {
+      var comments = getChildren($(ul), true);
+      comments = sortComments(comments);
+      appendComments(comments, $(ul).empty());
+    });
+  }
+
+  /**
+   * Function to process a vote when a user clicks an arrow.
+   */
+  function handleVote(link) {
+    if (!opts.voting) {
+      showError("You'll need to login to vote.");
+      return;
+    }
+
+    var id = link.attr('id');
+    if (!id) {
+      // Didn't click on one of the voting arrows.
+      return;
+    }
+    // If it is an unvote, the new vote value is 0,
+    // Otherwise it's 1 for an upvote, or -1 for a downvote.
+    var value = 0;
+    if (id.charAt(1) != 'u') {
+      value = id.charAt(0) == 'u' ? 1 : -1;
+    }
+    // The data to be sent to the server.
+    var d = {
+      comment_id: id.substring(2),
+      value: value
+    };
+
+    // Swap the vote and unvote links.
+    link.hide();
+    $('#' + id.charAt(0) + (id.charAt(1) == 'u' ? 'v' : 'u') + d.comment_id)
+      .show();
+
+    // The div the comment is displayed in.
+    var div = $('div#cd' + d.comment_id);
+    var data = div.data('comment');
+
+    // If this is not an unvote, and the other vote arrow has
+    // already been pressed, unpress it.
+    if ((d.value !== 0) && (data.vote === d.value * -1)) {
+      $('#' + (d.value == 1 ? 'd' : 'u') + 'u' + d.comment_id).hide();
+      $('#' + (d.value == 1 ? 'd' : 'u') + 'v' + d.comment_id).show();
+    }
+
+    // Update the comments rating in the local data.
+    data.rating += (data.vote === 0) ? d.value : (d.value - data.vote);
+    data.vote = d.value;
+    div.data('comment', data);
+
+    // Change the rating text.
+    div.find('.rating:first')
+      .text(data.rating + ' point' + (data.rating == 1 ? '' : 's'));
+
+    // Send the vote information to the server.
+    $.ajax({
+      type: "POST",
+      url: opts.processVoteURL,
+      data: d,
+      error: function(request, textStatus, error) {
+        showError('Oops, there was a problem casting that vote.');
+      }
+    });
+  }
+
+  /**
+   * Open a reply form used to reply to an existing comment.
+   */
+  function openReply(id) {
+    // Swap out the reply link for the hide link
+    $('#rl' + id).hide();
+    $('#cr' + id).show();
+
+    // Add the reply li to the children ul.
+    var div = $(renderTemplate(replyTemplate, {id: id})).hide();
+    $('#cl' + id)
+      .prepend(div)
+      // Setup the submit handler for the reply form.
+      .find('#rf' + id)
+      .submit(function(event) {
+        event.preventDefault();
+        addComment($('#rf' + id));
+        closeReply(id);
+      })
+      .find('input[type=button]')
+      .click(function() {
+        closeReply(id);
+      });
+    div.slideDown('fast', function() {
+      $('#rf' + id).find('textarea').focus();
+    });
+  }
+
+  /**
+   * Close the reply form opened with openReply.
+   */
+  function closeReply(id) {
+    // Remove the reply div from the DOM.
+    $('#rd' + id).slideUp('fast', function() {
+      $(this).remove();
+    });
+
+    // Swap out the hide link for the reply link
+    $('#cr' + id).hide();
+    $('#rl' + id).show();
+  }
+
+  /**
+   * Recursively sort a tree of comments using the comp comparator.
+   */
+  function sortComments(comments) {
+    comments.sort(comp);
+    $.each(comments, function() {
+      this.children = sortComments(this.children);
+    });
+    return comments;
+  }
+
+  /**
+   * Get the children comments from a ul. If recursive is true,
+   * recursively include childrens' children.
+   */
+  function getChildren(ul, recursive) {
+    var children = [];
+    ul.children().children("[id^='cd']")
+      .each(function() {
+        var comment = $(this).data('comment');
+        if (recursive)
+          comment.children = getChildren($(this).find('#cl' + comment.id), true);
+        children.push(comment);
+      });
+    return children;
+  }
+
+  /** Create a div to display a comment in. */
+  function createCommentDiv(comment) {
+    if (!comment.displayed && !opts.moderator) {
+      return $('<div class="moderate">Thank you!  Your comment will show up '
+               + 'once it is has been approved by a moderator.</div>');
+    }
+    // Prettify the comment rating.
+    comment.pretty_rating = comment.rating + ' point' +
+      (comment.rating == 1 ? '' : 's');
+    // Make a class (for displaying not yet moderated comments differently)
+    comment.css_class = comment.displayed ? '' : ' moderate';
+    // Create a div for this comment.
+    var context = $.extend({}, opts, comment);
+    var div = $(renderTemplate(commentTemplate, context));
+
+    // If the user has voted on this comment, highlight the correct arrow.
+    if (comment.vote) {
+      var direction = (comment.vote == 1) ? 'u' : 'd';
+      div.find('#' + direction + 'v' + comment.id).hide();
+      div.find('#' + direction + 'u' + comment.id).show();
+    }
+
+    if (opts.moderator || comment.text != '[deleted]') {
+      div.find('a.reply').show();
+      if (comment.proposal_diff)
+        div.find('#sp' + comment.id).show();
+      if (opts.moderator && !comment.displayed)
+        div.find('#cm' + comment.id).show();
+      if (opts.moderator || (opts.username == comment.username))
+        div.find('#dc' + comment.id).show();
+    }
+    return div;
+  }
+
+  /**
+   * A simple template renderer. Placeholders such as <%id%> are replaced
+   * by context['id'] with items being escaped. Placeholders such as <#id#>
+   * are not escaped.
+   */
+  function renderTemplate(template, context) {
+    var esc = $(document.createElement('div'));
+
+    function handle(ph, escape) {
+      var cur = context;
+      $.each(ph.split('.'), function() {
+        cur = cur[this];
+      });
+      return escape ? esc.text(cur || "").html() : cur;
+    }
+
+    return template.replace(/<([%#])([\w\.]*)\1>/g, function() {
+      return handle(arguments[2], arguments[1] == '%' ? true : false);
+    });
+  }
+
+  /** Flash an error message briefly. */
+  function showError(message) {
+    $(document.createElement('div')).attr({'class': 'popup-error'})
+      .append($(document.createElement('div'))
+               .attr({'class': 'error-message'}).text(message))
+      .appendTo('body')
+      .fadeIn("slow")
+      .delay(2000)
+      .fadeOut("slow");
+  }
+
+  /** Add a link the user uses to open the comments popup. */
+  $.fn.comment = function() {
+    return this.each(function() {
+      var id = $(this).attr('id').substring(1);
+      var count = COMMENT_METADATA[id];
+      var title = count + ' comment' + (count == 1 ? '' : 's');
+      var image = count > 0 ? opts.commentBrightImage : opts.commentImage;
+      var addcls = count == 0 ? ' nocomment' : '';
+      $(this)
+        .append(
+          $(document.createElement('a')).attr({
+            href: '#',
+            'class': 'sphinx-comment-open' + addcls,
+            id: 'ao' + id
+          })
+            .append($(document.createElement('img')).attr({
+              src: image,
+              alt: 'comment',
+              title: title
+            }))
+            .click(function(event) {
+              event.preventDefault();
+              show($(this).attr('id').substring(2));
+            })
+        )
+        .append(
+          $(document.createElement('a')).attr({
+            href: '#',
+            'class': 'sphinx-comment-close hidden',
+            id: 'ah' + id
+          })
+            .append($(document.createElement('img')).attr({
+              src: opts.closeCommentImage,
+              alt: 'close',
+              title: 'close'
+            }))
+            .click(function(event) {
+              event.preventDefault();
+              hide($(this).attr('id').substring(2));
+            })
+        );
+    });
+  };
+
+  var opts = {
+    processVoteURL: '/_process_vote',
+    addCommentURL: '/_add_comment',
+    getCommentsURL: '/_get_comments',
+    acceptCommentURL: '/_accept_comment',
+    deleteCommentURL: '/_delete_comment',
+    commentImage: '/static/_static/comment.png',
+    closeCommentImage: '/static/_static/comment-close.png',
+    loadingImage: '/static/_static/ajax-loader.gif',
+    commentBrightImage: '/static/_static/comment-bright.png',
+    upArrow: '/static/_static/up.png',
+    downArrow: '/static/_static/down.png',
+    upArrowPressed: '/static/_static/up-pressed.png',
+    downArrowPressed: '/static/_static/down-pressed.png',
+    voting: false,
+    moderator: false
+  };
+
+  if (typeof COMMENT_OPTIONS != "undefined") {
+    opts = jQuery.extend(opts, COMMENT_OPTIONS);
+  }
+
+  var popupTemplate = '\
+    <div class="sphinx-comments" id="sc<%id%>">\
+      <p class="sort-options">\
+        Sort by:\
+        <a href="#" class="sort-option byrating">best rated</a>\
+        <a href="#" class="sort-option byascage">newest</a>\
+        <a href="#" class="sort-option byage">oldest</a>\
+      </p>\
+      <div class="comment-header">Comments</div>\
+      <div class="comment-loading" id="cn<%id%>">\
+        loading comments... <img src="<%loadingImage%>" alt="" /></div>\
+      <ul id="cl<%id%>" class="comment-ul"></ul>\
+      <div id="ca<%id%>">\
+      <p class="add-a-comment">Add a comment\
+        (<a href="#" class="comment-markup" id="ab<%id%>">markup</a>):</p>\
+      <div class="comment-markup-box" id="mb<%id%>">\
+        reStructured text markup: <i>*emph*</i>, <b>**strong**</b>, \
+        <tt>``code``</tt>, \
+        code blocks: <tt>::</tt> and an indented block after blank line</div>\
+      <form method="post" id="cf<%id%>" class="comment-form" action="">\
+        <textarea name="comment" cols="80"></textarea>\
+        <p class="propose-button">\
+          <a href="#" id="pc<%id%>" class="show-propose-change">\
+            Propose a change ▹\
+          </a>\
+          <a href="#" id="hc<%id%>" class="hide-propose-change">\
+            Propose a change ▿\
+          </a>\
+        </p>\
+        <textarea name="proposal" id="pt<%id%>" cols="80"\
+                  spellcheck="false"></textarea>\
+        <input type="submit" value="Add comment" />\
+        <input type="hidden" name="node" value="<%id%>" />\
+        <input type="hidden" name="parent" value="" />\
+      </form>\
+      </div>\
+    </div>';
+
+  var commentTemplate = '\
+    <div id="cd<%id%>" class="sphinx-comment<%css_class%>">\
+      <div class="vote">\
+        <div class="arrow">\
+          <a href="#" id="uv<%id%>" class="vote" title="vote up">\
+            <img src="<%upArrow%>" />\
+          </a>\
+          <a href="#" id="uu<%id%>" class="un vote" title="vote up">\
+            <img src="<%upArrowPressed%>" />\
+          </a>\
+        </div>\
+        <div class="arrow">\
+          <a href="#" id="dv<%id%>" class="vote" title="vote down">\
+            <img src="<%downArrow%>" id="da<%id%>" />\
+          </a>\
+          <a href="#" id="du<%id%>" class="un vote" title="vote down">\
+            <img src="<%downArrowPressed%>" />\
+          </a>\
+        </div>\
+      </div>\
+      <div class="comment-content">\
+        <p class="tagline comment">\
+          <span class="user-id"><%username%></span>\
+          <span class="rating"><%pretty_rating%></span>\
+          <span class="delta"><%time.delta%></span>\
+        </p>\
+        <div class="comment-text comment"><#text#></div>\
+        <p class="comment-opts comment">\
+          <a href="#" class="reply hidden" id="rl<%id%>">reply ▹</a>\
+          <a href="#" class="close-reply" id="cr<%id%>">reply ▿</a>\
+          <a href="#" id="sp<%id%>" class="show-proposal">proposal ▹</a>\
+          <a href="#" id="hp<%id%>" class="hide-proposal">proposal ▿</a>\
+          <a href="#" id="dc<%id%>" class="delete-comment hidden">delete</a>\
+          <span id="cm<%id%>" class="moderation hidden">\
+            <a href="#" id="ac<%id%>" class="accept-comment">accept</a>\
+          </span>\
+        </p>\
+        <pre class="proposal" id="pr<%id%>">\
+<#proposal_diff#>\
+        </pre>\
+          <ul class="comment-children" id="cl<%id%>"></ul>\
+        </div>\
+        <div class="clearleft"></div>\
+      </div>\
+    </div>';
+
+  var replyTemplate = '\
+    <li>\
+      <div class="reply-div" id="rd<%id%>">\
+        <form id="rf<%id%>">\
+          <textarea name="comment" cols="80"></textarea>\
+          <input type="submit" value="Add reply" />\
+          <input type="button" value="Cancel" />\
+          <input type="hidden" name="parent" value="<%id%>" />\
+          <input type="hidden" name="node" value="" />\
+        </form>\
+      </div>\
+    </li>';
+
+  $(document).ready(function() {
+    init();
+  });
+})(jQuery);
+
+$(document).ready(function() {
+  // add comment anchors for all paragraphs that are commentable
+  $('.sphinx-has-comment').comment();
+
+  // highlight search words in search results
+  $("div.context").each(function() {
+    var params = $.getQueryParameters();
+    var terms = (params.q) ? params.q[0].split(/\s+/) : [];
+    var result = $(this);
+    $.each(terms, function() {
+      result.highlightText(this.toLowerCase(), 'highlighted');
+    });
+  });
+
+  // directly open comment window if requested
+  var anchor = document.location.hash;
+  if (anchor.substring(0, 9) == '#comment-') {
+    $('#ao' + anchor.substring(9)).click();
+    document.location.hash = '#s' + anchor.substring(9);
+  }
+});
diff --git a/doc/build/html/building.html b/doc/build/html/building.html
new file mode 100644
index 0000000..c611c9e
--- /dev/null
+++ b/doc/build/html/building.html
@@ -0,0 +1,284 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+    
+    <title>Requirements — Sailfish 0.6.4 documentation</title>
+    
+    <link rel="stylesheet" href="_static/default.css" type="text/css" />
+    <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
+    
+    <script type="text/javascript">
+      var DOCUMENTATION_OPTIONS = {
+        URL_ROOT:    './',
+        VERSION:     '0.6.4',
+        COLLAPSE_INDEX: false,
+        FILE_SUFFIX: '.html',
+        HAS_SOURCE:  true
+      };
+    </script>
+    <script type="text/javascript" src="_static/jquery.js"></script>
+    <script type="text/javascript" src="_static/underscore.js"></script>
+    <script type="text/javascript" src="_static/doctools.js"></script>
+    <script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    <link rel="top" title="Sailfish 0.6.4 documentation" href="index.html" />
+    <link rel="prev" title="Welcome to Sailfish’s documentation!" href="index.html" /> 
+  </head>
+  <body>
+    <div class="related">
+      <h3>Navigation</h3>
+      <ul>
+        <li class="right" style="margin-right: 10px">
+          <a href="genindex.html" title="General Index"
+             accesskey="I">index</a></li>
+        <li class="right" >
+          <a href="index.html" title="Welcome to Sailfish’s documentation!"
+             accesskey="P">previous</a> |</li>
+        <li><a href="index.html">Sailfish 0.6.4 documentation</a> »</li> 
+      </ul>
+    </div>  
+
+    <div class="document">
+      <div class="documentwrapper">
+        <div class="bodywrapper">
+          <div class="body">
+            
+  <div class="section" id="requirements">
+<h1>Requirements<a class="headerlink" href="#requirements" title="Permalink to this headline">¶</a></h1>
+<ul class="simple">
+<li>A C++11 conformant compiler (currently tested with GCC>=4.7 and Clang>=3.4)</li>
+<li><a class="reference external" href="http://www.cmake.org">CMake</a>. Sailfish uses the CMake build system to check, fetch and install
+dependencies, and to compile and install Sailfish. CMake is available for all
+major platforms (though Sailfish is currently unsupported on Windows.)</li>
+</ul>
+</div>
+<div class="section" id="installation">
+<h1>Installation<a class="headerlink" href="#installation" title="Permalink to this headline">¶</a></h1>
+<p>After downloading the Sailfish source distribution and unpacking it, change into the top-level directory:</p>
+<div class="highlight-python"><div class="highlight"><pre>> cd Sailfish
+</pre></div>
+</div>
+<p>Then, create and out-of-source build directory and change into it:</p>
+<div class="highlight-python"><div class="highlight"><pre>> mkdir build
+> cd build
+</pre></div>
+</div>
+<p>Sailfish makes extensive use of <a class="reference external" href="http://www.boost.org">Boost</a>.  We recommend installing the most
+recent version (1.55) systemwide if possible. If Boost is not installed on your
+system, the build process will fetch, compile and install it locally.  However,
+if you already have a recent version of Boost available on your system, it make
+sense to tell the build system to use that.</p>
+<p>If you have Boost installed you can tell CMake where to look for it. Likewise,
+if you already have <a class="reference external" href="http://threadingbuildingblocks.org/">Intel’s Threading Building Blocks</a> library installed, you can tell CMake
+where it is as well. The flags for CMake are as follows:</p>
+<ul class="simple">
+<li>-DFETCH_BOOST=TRUE –  If you don’t have Boost installed (or have an older
+version of it), you can provide the FETCH_BOOST flag instead of the
+BOOST_ROOT variable, which will cause CMake to fetch and build Boost locally.</li>
+<li>-DBOOST_ROOT=<boostdir> – Tells CMake where an existing installtion of Boost
+resides, and looks for the appropritate version in <boostdir>.  This is the
+top-level directory where Boost is installed (e.g. /opt/local).</li>
+<li>-DTBB_INSTALL_DIR=<tbbroot> – Tells CMake where an existing installation of
+Intel’s TBB is installed (<tbbroot>), and looks for the apropriate headers
+and libraries there. This is the top-level directory where TBB is installed
+(e.g. /opt/local).</li>
+<li>-DCMAKE_INSTALL_PREFIX=<install_dir> – <install_dir> is the directory to
+which you wish Sailfish to be installed.  If you don’t specify this option,
+it will be installed locally in the top-level directory (i.e. the directory
+directly above “build”).</li>
+</ul>
+<p>Setting the appropriate flags, you can then run the CMake configure step as
+follows:</p>
+<div class="highlight-python"><div class="highlight"><pre>> cmake [FLAGS] ..
+</pre></div>
+</div>
+<p>The above command is the cmake configuration step, which <em>should</em> complain if
+anything goes wrong.  Next, you have to run the build step. Depending on what
+libraries need to be fetched and installed, this could take a while
+(specifically if the installation needs to install Boost).  To start the build,
+just run make.</p>
+<div class="highlight-python"><div class="highlight"><pre>> make
+</pre></div>
+</div>
+<p>If the build is successful, the appropriate executables and libraries should be
+created. There are two points to note about the build process.  First, if the
+build system is downloading and compiling boost, you may see a large number of
+warnings during compilation; these are normal.  Second, note that CMake has
+colored output by default, and the steps which create or link libraries are
+printed in red.  This is the color chosen by CMake for linking messages, and
+does not denote an error in the build process.</p>
+<p>Finally, after everything is built, the libraries and executable can be
+installed with:</p>
+<div class="highlight-python"><div class="highlight"><pre>> make install
+</pre></div>
+</div>
+<p>To ensure that Sailfish has access to the appropriate libraries you should
+ensure that the PATH variabile contains <install_dir>/bin, and that
+LD_LIBRARY_PATH (or DYLD_FALLBACK_LIBRARY_PATH on OSX) contains
+<install_dir>/lib.</p>
+<p>After the paths are set, you can test the installation by running</p>
+<div class="highlight-python"><div class="highlight"><pre>> make test
+</pre></div>
+</div>
+<p>This should run a simple test and tell you if it succeeded or not.</p>
+</div>
+<div class="section" id="running-sailfish">
+<h1>Running Sailfish<a class="headerlink" href="#running-sailfish" title="Permalink to this headline">¶</a></h1>
+<p>Sailfish is a tool for transcript quantification from RNA-seq data.  It
+requires a set of target transcripts (either from a reference or _de-<a href="#id3"><span class="problematic" id="id4">novo_</span></a>
+assembly) to quantify.  All you need to run Sailfish is a fasta file containing
+your reference transcripts and a (set of) fasta/fastq file(s) containing your
+reads.  Sailfish runs in two phases; indexing and quantification.  The indexing
+step is independent of the reads, and only need to be run one for a particular
+set of reference transcripts and choice of k (the k-mer size). The
+quantification step, obviously, is specific to the set of RNA-seq reads and is
+thus run more frequently. For a more complete description of all available
+options in Sailfish, see the manual.</p>
+<div class="section" id="indexing">
+<h2>Indexing<a class="headerlink" href="#indexing" title="Permalink to this headline">¶</a></h2>
+<p>To generate the Sailfish index for your reference set of transcripts, you
+should run the following command:</p>
+<dl class="docutils">
+<dt>::</dt>
+<dd>> sailfish index -t <ref_transcripts> -o <out_dir> -k <kmer_len></dd>
+</dl>
+<p>This will build a Sailfish index for k-mers of length <kmer_len> for the
+reference transcripts  provided in the file <ref_transcripts> and place the
+index under the directory <out_dir>.  There  are additional options that can
+be passed to the Sailfish indexer (e.g. the number of threads to use).  These
+can be seen by executing the command “Sailfish index -h”.</p>
+</div>
+<div class="section" id="quantification">
+<h2>Quantification<a class="headerlink" href="#quantification" title="Permalink to this headline">¶</a></h2>
+<p>Now that you have generated the Sailfish index (say that it’s the directory
+<index_dir> — this corresponds to the <out_dir> argument provided in the
+previous step), you can quantify the transcript expression for a given set of
+reads.  To perform the quantification, you run a command like the following:</p>
+<div class="highlight-python"><div class="highlight"><pre>> sailfish quant -i <index_dir> -l "<libtype>" {-r <unmated> | -1 <mates1> -2 <mates2>} -o <quant_dir>
+</pre></div>
+</div>
+<p>Where <index_dir> is, as described above, the location of the sailfish index,
+<libtype> is a string describing the format of the read library (see the
+[library string](#library-string) section below) <unmated> is a list of files
+containing unmated reads, <mates{1,2}> are lists of files containg,
+respectively, the first and second mates of paired-end reads.  Finally,
+<quant_dir> is the directory where the output should be written. Just like the
+indexing step, additional options are available, and can be viewed by running
+“sailfish quant -h”.</p>
+<p>When the quantification step is finished, the directory <quant_dir> will
+contain a file named “quant.sf” (and, if bias correction is enabled, an
+additional file names “quant_bias_corrected.sf”).  This file contains the
+result of the Sailfish quantification step.  This file contains a number of
+columns (which are listed in the last of the header lines beginning with ‘#’).
+Specifically, the columns are (1) Transcript ID, (2) Transcript Length, (3)
+Transcripts per Million (TPM), (4) Reads Per Kilobase per Million mapped reads
+(RPKM), (5) K-mers Per Kilobase per Million mapped k-mers (KPKM), (6) Estimated
+number of k-mers (an estimate of the number of k-mers drawn from this
+transcript given the transcript’s relative abundance and length) and (7)
+Estimated number of reads (an estimate of the number of reads drawn from this
+transcript given the transcript’s relative abnundance and length).  The first
+two columns are self-explanatory, the next four are measures of transcript
+abundance and the final is a commonly used input for differential expression
+tools.  The Transcripts per Million quantification number is computed as
+described in <a class="footnote-reference" href="#id2" id="id1">[1]</a>, and is meant as an estimate of the number of transcripts, per
+million observed transcripts, originating from each isoform.  Its benefit over
+the K/RPKM measure is that it is independent of the mean expressed transcript
+length (i.e. if the mean expressed transcript length varies between samples,
+for example, this alone can affect differential analysis based on the K/RPKM.)
+The RPKM is a classic measure of relative transcript abundance, and is an
+estimate of the number of reads per kilobase of transcript (per million mapped
+reads) originating from each transcript. The KPKM should closely track the
+RPKM, but is defined for very short features which are larger than the chosen
+k-mer length but may be shorter than the read length. Typically, you should
+prefer the KPKM measure to the RPKM measure, since the k-mer is the most
+natural unit of coverage for Sailfish.</p>
+</div>
+</div>
+<div class="section" id="license">
+<h1>License<a class="headerlink" href="#license" title="Permalink to this headline">¶</a></h1>
+<p>This program is free software: you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation, either version 3 of the License, or (at your option) any later
+version.</p>
+<p>This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.</p>
+<p>You should have received a copy of the GNU General Public License along with
+this program.  If not, see <<a class="reference external" href="http://www.gnu.org/licenses/">http://www.gnu.org/licenses/</a>>.</p>
+</div>
+<div class="section" id="references">
+<h1>References<a class="headerlink" href="#references" title="Permalink to this headline">¶</a></h1>
+<table class="docutils footnote" frame="void" id="id2" rules="none">
+<colgroup><col class="label" /><col /></colgroup>
+<tbody valign="top">
+<tr><td class="label"><a class="fn-backref" href="#id1">[1]</a></td><td>Li, Bo, et al. “RNA-Seq gene expression estimation with read mapping uncertainty.”
+Bioinformatics 26.4 (2010): 493-500.i</td></tr>
+</tbody>
+</table>
+</div>
+
+
+          </div>
+        </div>
+      </div>
+      <div class="sphinxsidebar">
+        <div class="sphinxsidebarwrapper">
+  <h3><a href="index.html">Table Of Contents</a></h3>
+  <ul>
+<li><a class="reference internal" href="#">Requirements</a></li>
+<li><a class="reference internal" href="#installation">Installation</a></li>
+<li><a class="reference internal" href="#running-sailfish">Running Sailfish</a><ul>
+<li><a class="reference internal" href="#indexing">Indexing</a></li>
+<li><a class="reference internal" href="#quantification">Quantification</a></li>
+</ul>
+</li>
+<li><a class="reference internal" href="#license">License</a></li>
+<li><a class="reference internal" href="#references">References</a></li>
+</ul>
+
+  <h4>Previous topic</h4>
+  <p class="topless"><a href="index.html"
+                        title="previous chapter">Welcome to Sailfish’s documentation!</a></p>
+  <h3>This Page</h3>
+  <ul class="this-page-menu">
+    <li><a href="_sources/building.txt"
+           rel="nofollow">Show Source</a></li>
+  </ul>
+<div id="searchbox" style="display: none">
+  <h3>Quick search</h3>
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" />
+      <input type="submit" value="Go" />
+      <input type="hidden" name="check_keywords" value="yes" />
+      <input type="hidden" name="area" value="default" />
+    </form>
+    <p class="searchtip" style="font-size: 90%">
+    Enter search terms or a module, class or function name.
+    </p>
+</div>
+<script type="text/javascript">$('#searchbox').show(0);</script>
+        </div>
+      </div>
+      <div class="clearer"></div>
+    </div>
+    <div class="related">
+      <h3>Navigation</h3>
+      <ul>
+        <li class="right" style="margin-right: 10px">
+          <a href="genindex.html" title="General Index"
+             >index</a></li>
+        <li class="right" >
+          <a href="index.html" title="Welcome to Sailfish’s documentation!"
+             >previous</a> |</li>
+        <li><a href="index.html">Sailfish 0.6.4 documentation</a> »</li> 
+      </ul>
+    </div>
+    <div class="footer">
+        © Copyright 2014, Rob Patro, Carl Kingsford and Steve Mount.
+      Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.2.2.
+    </div>
+  </body>
+</html>
\ No newline at end of file
diff --git a/doc/build/html/genindex.html b/doc/build/html/genindex.html
new file mode 100644
index 0000000..63c8340
--- /dev/null
+++ b/doc/build/html/genindex.html
@@ -0,0 +1,93 @@
+
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+    
+    <title>Index — Sailfish 0.6.4 documentation</title>
+    
+    <link rel="stylesheet" href="_static/default.css" type="text/css" />
+    <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
+    
+    <script type="text/javascript">
+      var DOCUMENTATION_OPTIONS = {
+        URL_ROOT:    './',
+        VERSION:     '0.6.4',
+        COLLAPSE_INDEX: false,
+        FILE_SUFFIX: '.html',
+        HAS_SOURCE:  true
+      };
+    </script>
+    <script type="text/javascript" src="_static/jquery.js"></script>
+    <script type="text/javascript" src="_static/underscore.js"></script>
+    <script type="text/javascript" src="_static/doctools.js"></script>
+    <script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    <link rel="top" title="Sailfish 0.6.4 documentation" href="index.html" /> 
+  </head>
+  <body>
+    <div class="related">
+      <h3>Navigation</h3>
+      <ul>
+        <li class="right" style="margin-right: 10px">
+          <a href="#" title="General Index"
+             accesskey="I">index</a></li>
+        <li><a href="index.html">Sailfish 0.6.4 documentation</a> »</li> 
+      </ul>
+    </div>  
+
+    <div class="document">
+      <div class="documentwrapper">
+        <div class="bodywrapper">
+          <div class="body">
+            
+
+<h1 id="index">Index</h1>
+
+<div class="genindex-jumpbox">
+ 
+</div>
+
+
+          </div>
+        </div>
+      </div>
+      <div class="sphinxsidebar">
+        <div class="sphinxsidebarwrapper">
+
+   
+
+<div id="searchbox" style="display: none">
+  <h3>Quick search</h3>
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" />
+      <input type="submit" value="Go" />
+      <input type="hidden" name="check_keywords" value="yes" />
+      <input type="hidden" name="area" value="default" />
+    </form>
+    <p class="searchtip" style="font-size: 90%">
+    Enter search terms or a module, class or function name.
+    </p>
+</div>
+<script type="text/javascript">$('#searchbox').show(0);</script>
+        </div>
+      </div>
+      <div class="clearer"></div>
+    </div>
+    <div class="related">
+      <h3>Navigation</h3>
+      <ul>
+        <li class="right" style="margin-right: 10px">
+          <a href="#" title="General Index"
+             >index</a></li>
+        <li><a href="index.html">Sailfish 0.6.4 documentation</a> »</li> 
+      </ul>
+    </div>
+    <div class="footer">
+        © Copyright 2014, Rob Patro, Carl Kingsford and Steve Mount.
+      Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.2.2.
+    </div>
+  </body>
+</html>
\ No newline at end of file
diff --git a/doc/build/html/index.html b/doc/build/html/index.html
new file mode 100644
index 0000000..d76ff2e
--- /dev/null
+++ b/doc/build/html/index.html
@@ -0,0 +1,129 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+    
+    <title>Welcome to Sailfish’s documentation! — Sailfish 0.6.4 documentation</title>
+    
+    <link rel="stylesheet" href="_static/default.css" type="text/css" />
+    <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
+    
+    <script type="text/javascript">
+      var DOCUMENTATION_OPTIONS = {
+        URL_ROOT:    './',
+        VERSION:     '0.6.4',
+        COLLAPSE_INDEX: false,
+        FILE_SUFFIX: '.html',
+        HAS_SOURCE:  true
+      };
+    </script>
+    <script type="text/javascript" src="_static/jquery.js"></script>
+    <script type="text/javascript" src="_static/underscore.js"></script>
+    <script type="text/javascript" src="_static/doctools.js"></script>
+    <script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    <link rel="top" title="Sailfish 0.6.4 documentation" href="#" />
+    <link rel="next" title="Requirements" href="building.html" /> 
+  </head>
+  <body>
+    <div class="related">
+      <h3>Navigation</h3>
+      <ul>
+        <li class="right" style="margin-right: 10px">
+          <a href="genindex.html" title="General Index"
+             accesskey="I">index</a></li>
+        <li class="right" >
+          <a href="building.html" title="Requirements"
+             accesskey="N">next</a> |</li>
+        <li><a href="#">Sailfish 0.6.4 documentation</a> »</li> 
+      </ul>
+    </div>  
+
+    <div class="document">
+      <div class="documentwrapper">
+        <div class="bodywrapper">
+          <div class="body">
+            
+  <div class="section" id="welcome-to-sailfish-s-documentation">
+<h1>Welcome to Sailfish’s documentation!<a class="headerlink" href="#welcome-to-sailfish-s-documentation" title="Permalink to this headline">¶</a></h1>
+<p>Contents:</p>
+<div class="toctree-wrapper compound">
+<ul>
+<li class="toctree-l1"><a class="reference internal" href="building.html">Requirements</a></li>
+<li class="toctree-l1"><a class="reference internal" href="building.html#installation">Installation</a></li>
+<li class="toctree-l1"><a class="reference internal" href="building.html#running-sailfish">Running Sailfish</a><ul>
+<li class="toctree-l2"><a class="reference internal" href="building.html#indexing">Indexing</a></li>
+<li class="toctree-l2"><a class="reference internal" href="building.html#quantification">Quantification</a></li>
+</ul>
+</li>
+<li class="toctree-l1"><a class="reference internal" href="building.html#license">License</a></li>
+<li class="toctree-l1"><a class="reference internal" href="building.html#references">References</a></li>
+</ul>
+</div>
+</div>
+<div class="section" id="indices-and-tables">
+<h1>Indices and tables<a class="headerlink" href="#indices-and-tables" title="Permalink to this headline">¶</a></h1>
+<ul class="simple">
+<li><a class="reference internal" href="genindex.html"><em>Index</em></a></li>
+<li><a class="reference internal" href="py-modindex.html"><em>Module Index</em></a></li>
+<li><a class="reference internal" href="search.html"><em>Search Page</em></a></li>
+</ul>
+</div>
+
+
+          </div>
+        </div>
+      </div>
+      <div class="sphinxsidebar">
+        <div class="sphinxsidebarwrapper">
+  <h3><a href="#">Table Of Contents</a></h3>
+  <ul>
+<li><a class="reference internal" href="#">Welcome to Sailfish’s documentation!</a></li>
+<li><a class="reference internal" href="#indices-and-tables">Indices and tables</a></li>
+</ul>
+
+  <h4>Next topic</h4>
+  <p class="topless"><a href="building.html"
+                        title="next chapter">Requirements</a></p>
+  <h3>This Page</h3>
+  <ul class="this-page-menu">
+    <li><a href="_sources/index.txt"
+           rel="nofollow">Show Source</a></li>
+  </ul>
+<div id="searchbox" style="display: none">
+  <h3>Quick search</h3>
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" />
+      <input type="submit" value="Go" />
+      <input type="hidden" name="check_keywords" value="yes" />
+      <input type="hidden" name="area" value="default" />
+    </form>
+    <p class="searchtip" style="font-size: 90%">
+    Enter search terms or a module, class or function name.
+    </p>
+</div>
+<script type="text/javascript">$('#searchbox').show(0);</script>
+        </div>
+      </div>
+      <div class="clearer"></div>
+    </div>
+    <div class="related">
+      <h3>Navigation</h3>
+      <ul>
+        <li class="right" style="margin-right: 10px">
+          <a href="genindex.html" title="General Index"
+             >index</a></li>
+        <li class="right" >
+          <a href="building.html" title="Requirements"
+             >next</a> |</li>
+        <li><a href="#">Sailfish 0.6.4 documentation</a> »</li> 
+      </ul>
+    </div>
+    <div class="footer">
+        © Copyright 2014, Rob Patro, Carl Kingsford and Steve Mount.
+      Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.2.2.
+    </div>
+  </body>
+</html>
\ No newline at end of file
diff --git a/doc/build/html/objects.inv b/doc/build/html/objects.inv
new file mode 100644
index 0000000..d9b3845
Binary files /dev/null and b/doc/build/html/objects.inv differ
diff --git a/doc/build/html/search.html b/doc/build/html/search.html
new file mode 100644
index 0000000..c76ddf6
--- /dev/null
+++ b/doc/build/html/search.html
@@ -0,0 +1,100 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+    
+    <title>Search — Sailfish 0.6.4 documentation</title>
+    
+    <link rel="stylesheet" href="_static/default.css" type="text/css" />
+    <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
+    
+    <script type="text/javascript">
+      var DOCUMENTATION_OPTIONS = {
+        URL_ROOT:    './',
+        VERSION:     '0.6.4',
+        COLLAPSE_INDEX: false,
+        FILE_SUFFIX: '.html',
+        HAS_SOURCE:  true
+      };
+    </script>
+    <script type="text/javascript" src="_static/jquery.js"></script>
+    <script type="text/javascript" src="_static/underscore.js"></script>
+    <script type="text/javascript" src="_static/doctools.js"></script>
+    <script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    <script type="text/javascript" src="_static/searchtools.js"></script>
+    <link rel="top" title="Sailfish 0.6.4 documentation" href="index.html" />
+  <script type="text/javascript">
+    jQuery(function() { Search.loadIndex("searchindex.js"); });
+  </script>
+  
+  <script type="text/javascript" id="searchindexloader"></script>
+   
+
+  </head>
+  <body>
+    <div class="related">
+      <h3>Navigation</h3>
+      <ul>
+        <li class="right" style="margin-right: 10px">
+          <a href="genindex.html" title="General Index"
+             accesskey="I">index</a></li>
+        <li><a href="index.html">Sailfish 0.6.4 documentation</a> »</li> 
+      </ul>
+    </div>  
+
+    <div class="document">
+      <div class="documentwrapper">
+        <div class="bodywrapper">
+          <div class="body">
+            
+  <h1 id="search-documentation">Search</h1>
+  <div id="fallback" class="admonition warning">
+  <script type="text/javascript">$('#fallback').hide();</script>
+  <p>
+    Please activate JavaScript to enable the search
+    functionality.
+  </p>
+  </div>
+  <p>
+    From here you can search these documents. Enter your search
+    words into the box below and click "search". Note that the search
+    function will automatically search for all of the words. Pages
+    containing fewer words won't appear in the result list.
+  </p>
+  <form action="" method="get">
+    <input type="text" name="q" value="" />
+    <input type="submit" value="search" />
+    <span id="search-progress" style="padding-left: 10px"></span>
+  </form>
+  
+  <div id="search-results">
+  
+  </div>
+
+          </div>
+        </div>
+      </div>
+      <div class="sphinxsidebar">
+        <div class="sphinxsidebarwrapper">
+        </div>
+      </div>
+      <div class="clearer"></div>
+    </div>
+    <div class="related">
+      <h3>Navigation</h3>
+      <ul>
+        <li class="right" style="margin-right: 10px">
+          <a href="genindex.html" title="General Index"
+             >index</a></li>
+        <li><a href="index.html">Sailfish 0.6.4 documentation</a> »</li> 
+      </ul>
+    </div>
+    <div class="footer">
+        © Copyright 2014, Rob Patro, Carl Kingsford and Steve Mount.
+      Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.2.2.
+    </div>
+  </body>
+</html>
\ No newline at end of file
diff --git a/doc/build/html/searchindex.js b/doc/build/html/searchindex.js
new file mode 100644
index 0000000..dd3de87
--- /dev/null
+++ b/doc/build/html/searchindex.js
@@ -0,0 +1 @@
+Search.setIndex({envversion:42,terms:{osx:0,all:0,abund:0,just:0,kpkm:0,when:0,over:0,install_dir:0,four:0,map:0,per:0,follow:0,vari:0,current:0,variabil:0,now:0,locat:0,execut:0,fit:0,chosen:0,"true":0,copi:0,configur:0,should:0,platform:0,window:0,version:0,novo_:0,under:0,bin:0,sens:0,local:0,modul:1,boost_root:0,build:0,merchant:0,sourc:0,dfetch_boost:0,string:0,format:0,read:0,foundat:0,rna:0,likewis:0,clang:0,bia:0,test:0,quantifi:0,veri:0,affect:0,tbb:0,complain:0,dure:0,like:0,ch [...]
\ No newline at end of file
diff --git a/doc/license-header.txt b/doc/license-header.txt
new file mode 100644
index 0000000..9ef6649
--- /dev/null
+++ b/doc/license-header.txt
@@ -0,0 +1,22 @@
+/**
+>HEADER
+    Copyright (c) 2013/2014 Rob Patro robp at cs.cmu.edu
+
+    This file is part of the Sailfish suite.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
diff --git a/doc/source/ReadLibraryIllustration.png b/doc/source/ReadLibraryIllustration.png
new file mode 100644
index 0000000..e4b1c02
Binary files /dev/null and b/doc/source/ReadLibraryIllustration.png differ
diff --git a/doc/source/building.rst b/doc/source/building.rst
new file mode 100644
index 0000000..86b948e
--- /dev/null
+++ b/doc/source/building.rst
@@ -0,0 +1,113 @@
+Requirements
+============
+
+Binary Releases
+---------------
+
+Pre-compiled binaries of the latest release of Salmon for a number different
+platforms are available available under the `Releases tab
+<https://github.com/COMBINE-lab/salmon/releases>`_ of Salmon's `GitHub
+repository <https://github.com/COMBINE-lab/salmon>`_.  You should be able to
+get started quickly byfinding a binary from the list that is compatible with
+your platform.  If you're running an old version of Linux and get errors
+related to `GLIBC`, try the pre-compiled "Debian Squeeze" binary.
+
+Requirements for Building from Source
+-------------------------------------
+
+* A C++11 conformant compiler (currently tested with GCC>=4.7 and Clang>=3.4)
+* CMake_. Salmon uses the CMake build system to check, fetch and install
+  dependencies, and to compile and install Salmon. CMake is available for all
+  major platforms (though Salmon is currently unsupported on Windows.)
+  
+Installation
+============
+
+After downloading the Salmon source distribution and unpacking it, change into the top-level directory:
+
+::
+
+    > cd salmon
+
+Then, create and out-of-source build directory and change into it:
+
+::
+
+    > mkdir build
+    > cd build
+
+
+Salmon makes extensive use of Boost_.  We recommend installing the most
+recent version (1.55) systemwide if possible. If Boost is not installed on your
+system, the build process will fetch, compile and install it locally.  However,
+if you already have a recent version of Boost available on your system, it make
+sense to tell the build system to use that.
+
+If you have Boost installed you can tell CMake where to look for it. Likewise,
+if you already have `Intel's Threading Building Blocks
+<http://threadingbuildingblocks.org/>`_ library installed, you can tell CMake
+where it is as well. The flags for CMake are as follows:
+
+* -DFETCH_BOOST=TRUE --  If you don't have Boost installed (or have an older
+  version of it), you can provide the FETCH_BOOST flag instead of the
+  BOOST_ROOT variable, which will cause CMake to fetch and build Boost locally.
+
+* -DBOOST_ROOT=<boostdir> -- Tells CMake where an existing installtion of Boost
+  resides, and looks for the appropritate version in <boostdir>.  This is the
+  top-level directory where Boost is installed (e.g. /opt/local).
+
+* -DTBB_INSTALL_DIR=<tbbroot> -- Tells CMake where an existing installation of
+  Intel's TBB is installed (<tbbroot>), and looks for the apropriate headers
+  and libraries there. This is the top-level directory where TBB is installed
+  (e.g. /opt/local).
+
+* -DCMAKE_INSTALL_PREFIX=<install_dir> -- <install_dir> is the directory to
+  which you wish Salmon to be installed.  If you don't specify this option,
+  it will be installed locally in the top-level directory (i.e. the directory
+  directly above "build").
+
+There are a number of other libraries upon which Salmon depends, but CMake 
+should fetch these for you automatically.
+
+Setting the appropriate flags, you can then run the CMake configure step as
+follows:
+
+::
+                                  
+    > cmake [FLAGS] ..
+
+The above command is the cmake configuration step, which *should* complain if
+anything goes wrong.  Next, you have to run the build step. Depending on what
+libraries need to be fetched and installed, this could take a while
+(specifically if the installation needs to install Boost).  To start the build,
+just run make.
+
+::
+
+    > make
+
+If the build is successful, the appropriate executables and libraries should be
+created. There are two points to note about the build process.  First, if the
+build system is downloading and compiling boost, you may see a large number of
+warnings during compilation; these are normal.  Second, note that CMake has
+colored output by default, and the steps which create or link libraries are
+printed in red.  This is the color chosen by CMake for linking messages, and
+does not denote an error in the build process. 
+                                  
+Finally, after everything is built, the libraries and executable can be
+installed with:
+
+::
+                                  
+    > make install
+
+You can test the installation by running
+
+::
+
+    > make test
+
+This should run a simple test and tell you if it succeeded or not.
+
+.. _CMake : http://www.cmake.org 
+.. _Boost: http://www.boost.org
diff --git a/doc/source/conf.py b/doc/source/conf.py
new file mode 100644
index 0000000..2df870c
--- /dev/null
+++ b/doc/source/conf.py
@@ -0,0 +1,263 @@
+# -*- coding: utf-8 -*-
+#
+# Salmon documentation build configuration file, created by
+# sphinx-quickstart on Tue Jul 15 17:48:43 2014.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.todo',
+    'sphinx.ext.mathjax',
+    'sphinx.ext.ifconfig',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'Salmon'
+copyright = u'2015, Rob Patro, Carl Kingsford and Steve Mount'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.3.2'
+# The full version, including alpha/beta/rc tags.
+release = '0.3.2'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = []
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Salmondoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  ('index', 'Salmon.tex', u'Salmon Documentation',
+   u'Rob Patro, Carl Kingsford and Steve Mount', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('index', 'salmon', u'Salmon Documentation',
+     [u'Rob Patro, Carl Kingsford and Steve Mount'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  ('index', 'Salmon', u'Salmon Documentation',
+   u'Rob Patro, Carl Kingsford and Steve Mount', 'Salmon', 'One line description of project.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
diff --git a/doc/source/index.rst b/doc/source/index.rst
new file mode 100644
index 0000000..ec4abb5
--- /dev/null
+++ b/doc/source/index.rst
@@ -0,0 +1,24 @@
+.. Sailfish documentation master file, created by
+   sphinx-quickstart on Tue Jul 15 17:48:43 2014.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to Salmon's documentation!
+=============================================
+
+Contents:
+
+.. toctree::
+   :maxdepth: 2
+   
+   building.rst
+   salmon.rst
+   library_type.rst
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/doc/source/library_type.rst b/doc/source/library_type.rst
new file mode 100644
index 0000000..6cfaefe
--- /dev/null
+++ b/doc/source/library_type.rst
@@ -0,0 +1,105 @@
+.. _FragLibType:
+
+Fragment Library Types
+======================
+
+There are numerous library preparation protocols for RNA-seq that result in
+sequencing reads with different characteristics.  For example, reads can be
+single end (only one side of a fragment is recorded as a read) or paired-end
+(reads are generated from both ends of a fragment).  Further, the sequencing
+reads themselves may be unstraned or strand-specific.  Finally, paired-end
+protocols will have a specified relative orientation.  To characterize the
+various different typs of sequencing libraries, we've created a miniature
+"language" that allows for the succinct description of the many different types
+of possible fragment libraries.  For paired-end reads, the possible
+orientations, along with a graphical description of what they mean, are
+illustrated below:
+
+.. image:: ReadLibraryIllustration.png
+
+The library type string consists of three parts: the relative orientation of
+the reads, the strandedness of the library, and the directionality of the
+reads.
+
+The first part of the library string (relative orientation) is only provided if
+the library is paired-end. The possible options are:
+
+::
+
+    I = inward
+    O = outward
+    M = matching
+
+The second part of the read library string specifies whether the protocol is
+stranded or unstranded; the options are:
+
+::
+
+    S = stranded
+    U = unstranded
+
+If the protocol is unstranded, then we're done.  The final part of the library
+string specifies the strand from which the read originates in a strand-specific
+protocol — it is only provided if the library is stranded (i.e. if the
+library format string is of the form S).  The possible values are:
+
+::
+
+    F = read 1 (or single-end read) comes from the forward strand
+    R = read 1 (or single-end read) comes from the reverse strand
+
+So, for example, if you wanted to specify a fragment library of strand-specific
+paired-end reads, oriented toward each other, where read 1 comes from the
+forward strand and read 2 comes from the reverse strand, you would specify ``-l
+ISF`` on the command line.  This designates that the library being processed has
+the type "ISF" meaning, **I**\ nward (the relative orientation), **S**\ tranted
+(the protocol is strand-specific), **F**\ orward (read 1 comes from the forward
+strand).
+
+The single end library strings are a bit simpler than their pair-end counter
+parts, since there is no relative orientation of which to speak.  Thus, the
+only possible library format types for single-end reads are ``U`` (for
+unstranded), ``SF`` (for strand-specific reads coming from the forward strand)
+and ``SR`` (for strand-specific reads coming from the reverse strand).
+
+A few more examples of some library format strings and their interpretations are:
+
+::
+
+    IU (an unstranded paired-end library where the reads face each other)
+
+::
+
+    SF (a stranded single-end protocol where the reads come from the forward strand)
+
+::
+
+    OSR (a stranded paired-end protocol where the reads face away from each other,
+         read1 comes from reverse strand and read2 comes from the forward strand)
+
+.. note:: Correspondence to TopHat library types 
+
+   The popular `TopHat <http://ccb.jhu.edu/software/tophat/index.shtml>`_ RNA-seq 
+   read aligner has a different convention for specifying the format of the library.
+   Below is a table that provides the corresponding sailfish/salmon library format
+   string for each of the potential TopHat library types:
+
+
+   +---------------------+-------------------------+  
+   | TopHat              | Salmon (and Sailfish)   |
+   +=====================+============+============+
+   |                     | Paired-end | Single-end | 
+   +---------------------+------------+------------+
+   |``-fr-unstranded``   |``-l IU``   |``-l U``    |          
+   +---------------------+------------+------------+
+   |``-fr-firststrand``  |``-l ISR``  |``-l SR``   |          
+   +---------------------+------------+------------+
+   |``-fr-secondstrand`` |``-l ISF``  |``-l SF``   |          
+   +---------------------+------------+------------+
+
+   The remaining salmon library format strings are not directly expressible in terms
+   of the TopHat library types, and so there is no direct mapping for them.
+
+
+
+
diff --git a/doc/source/license.rst b/doc/source/license.rst
new file mode 100644
index 0000000..23f53d8
--- /dev/null
+++ b/doc/source/license.rst
@@ -0,0 +1,16 @@
+License
+=======
+
+This program is free software: you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation, either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
diff --git a/doc/source/sailfish.rst b/doc/source/sailfish.rst
new file mode 100644
index 0000000..d8cc967
--- /dev/null
+++ b/doc/source/sailfish.rst
@@ -0,0 +1,89 @@
+Sailfish
+================
+
+Sailfish is a tool for transcript quantification from RNA-seq data.  It
+requires a set of target transcripts (either from a reference or *de-novo*
+assembly) to quantify.  All you need to run sailfish is a fasta file containing
+your reference transcripts and a (set of) fasta/fastq file(s) containing your
+reads.  Sailfish runs in two phases; indexing and quantification.  The indexing
+step is independent of the reads, and only needs to be run once for a particular
+set of reference transcripts and choice of k (the k-mer size). The
+quantification step, obviously, is specific to the set of RNA-seq reads and is
+thus run more frequently. For a more complete description of all available
+options in sailfish, see the manual.
+
+
+Indexing
+--------
+
+To generate the sailfish index for your reference set of transcripts, you
+should run the following command:
+
+::
+
+    > sailfish index -t <ref_transcripts> -o <out_dir> -k <kmer_len>
+
+
+This will build a sailfish index for k-mers of length ``<kmer_len>`` for the
+reference transcripts  provided in the file ``<ref_transcripts>`` and place the
+index under the directory ``<out_dir>``.  There  are additional options that can
+be passed to the sailfish indexer (e.g. the number of threads to use).  These
+can be seen by executing the command ``sailfish index -h``.
+
+Quantification
+--------------
+
+Now that you have generated the sailfish index (say that it's the directory
+``<index_dir>`` --- this corresponds to the <out_dir> argument provided in the
+previous step), you can quantify the transcript expression for a given set of
+reads.  To perform the quantification, you run a command like the following:
+
+::
+
+    > sailfish quant -i <index_dir> -l "<libtype>" {-r <unmated> | -1 <mates1> -2 <mates2>} -o <quant_dir>
+
+Where ``<index_dir>`` is, as described above, the location of the sailfish
+index, ``<libtype>`` is a string describing the format of the fragment (read)
+library (see :ref:`FragLibType`), ``<unmated>`` is a list of files
+containing unmated reads, ``<mates{1,2}>`` are lists of files containg,
+respectively, the first and second mates of paired-end reads. Finally,
+``<quant_dir>`` is the directory where the output should be written. Just like the
+indexing step, additional options are available, and can be viewed by running
+``sailfish quant -h``.
+
+When the quantification step is finished, the directory ``<quant_dir>`` will
+contain a file named "quant.sf" (and, if bias correction is enabled, an
+additional file names "quant_bias_corrected.sf").  This file contains the
+result of the Sailfish quantification step.  This file contains a number of
+columns (which are listed in the last of the header lines beginning with '#').
+Specifically, the columns are (1) Transcript ID, (2) Transcript Length, (3)
+Transcripts per Million (TPM), (4) Reads Per Kilobase per Million mapped reads
+(RPKM), (5) K-mers Per Kilobase per Million mapped k-mers (KPKM), (6) Estimated
+number of k-mers (an estimate of the number of k-mers drawn from this
+transcript given the transcript's relative abundance and length) and (7)
+Estimated number of reads (an estimate of the number of reads drawn from this
+transcript given the transcript's relative abnundance and length).  The first
+two columns are self-explanatory, the next four are measures of transcript
+abundance and the final is a commonly used input for differential expression
+tools.  The Transcripts per Million quantification number is computed as
+described in [1]_, and is meant as an estimate of the number of transcripts, per
+million observed transcripts, originating from each isoform.  Its benefit over
+the K/RPKM measure is that it is independent of the mean expressed transcript
+length (i.e. if the mean expressed transcript length varies between samples,
+for example, this alone can affect differential analysis based on the K/RPKM.)
+The RPKM is a classic measure of relative transcript abundance, and is an
+estimate of the number of reads per kilobase of transcript (per million mapped
+reads) originating from each transcript. The KPKM should closely track the
+RPKM, but is defined for very short features which are larger than the chosen
+k-mer length but may be shorter than the read length. Typically, you should
+prefer the KPKM measure to the RPKM measure, since the k-mer is the most
+natural unit of coverage for Sailfish.
+
+References
+----------
+
+.. [1] Li, Bo, et al. "RNA-Seq gene expression estimation with read mapping uncertainty."
+    Bioinformatics 26.4 (2010): 493-500.
+
+.. _CMake : http://www.cmake.org
+.. _Boost: http://www.boost.org
diff --git a/doc/source/salmon.rst b/doc/source/salmon.rst
new file mode 100644
index 0000000..2c6e64d
--- /dev/null
+++ b/doc/source/salmon.rst
@@ -0,0 +1,322 @@
+Salmon
+================
+
+Salmon is a tool for **wicked-fast** transcript quantification from RNA-seq
+data.  It requires a set of target transcripts (either from a reference or
+*de-novo* assembly) to quantify.  All you need to run Salmon is a FASTA file
+containing your reference transcripts and a (set of) FASTA/FASTQ file(s)
+containing your reads.  Optionally, Salmon can make use of pre-computed
+alignments (in the form of a SAM/BAM file) to the transcripts rather than the
+raw reads.
+
+The **lightweight-alignment**-based mode of Salmon runs in two phases; indexing and
+quantification. The indexing step is independent of the reads, and only need to
+be run one for a particular set of reference transcripts. The quantification
+step, obviously, is specific to the set of RNA-seq reads and is thus run more
+frequently. For a more complete description of all available options in Salmon,
+see below.
+
+The **alignment**-based mode of Salmon does not require indexing.  Rather, you can 
+simply provide Salmon with a FASTA file of the transcripts and a SAM/BAM file
+containing the alignments you wish to use for quantification.
+
+Using Salmon
+------------
+
+As mentioned above, there are two "modes" of operation for Salmon.  The first,
+requires you to build an index for the transcriptome, but then subsequently
+processes reads directly.  The second mode simply requires you to provide a
+FASTA file of the transcriptome and a ``.sam`` or ``.bam`` file containing a
+set of alignments.
+
+.. note:: Read / alignment order
+
+    Salmon, like eXpress, uses a streaming inference method to perform 
+    transcript-level quantification.  One of the fundamental assumptions 
+    of such inference methods is that observations (i.e. reads or alignments)
+    are made "at random".  This means, for example, that alignments should 
+    **not** be sorted by target or position.  If your reads or alignments 
+    do not appear in a random order with respect to the target transcripts,
+    please randomize / shuffle them before performing quantification with 
+    Salmon.
+
+.. note:: Number of Threads
+
+    The number of threads that Salmon can effectively make use of depends 
+    upon the mode in which it is being run.  In alignment-based mode, the
+    main bottleneck is in parsing and decompressing the input BAM file.
+    We make use of the `Staden IO <http://sourceforge.net/projects/staden/files/io_lib/>`_ 
+    library for SAM/BAM/CRAM I/O (CRAM is, in theory, supported, but has not been
+    thorougly tested).  This means that multiple threads can be effectively used
+    to aid in BAM decompression.  However, we find that throwing more than a 
+    few threads at file decompression does not result in increased processing
+    speed.  Thus, alignment-based Salmon will only ever allocate up to 4 threads
+    to file decompression, with the rest being allocated to quantification.
+    If these threads are starved, they will sleep (the quantification threads 
+    do not busy wait), but there is a point beyond which allocating more threads
+    will not speed up alignment-based quantification.  We find that allocating 
+    8 --- 12 threads results in the maximum speed, threads allocated above this
+    limit will likely spend most of their time idle / sleeping.
+
+    For lightweight-alignment-based Salmon, the story is somewhat different.
+    Generally, performance continues to improve as more threads are made
+    available.  This is because the determiniation of the potential mapping
+    locations of each read is, generally, the slowest step in
+    lightweight-alignment-based quantification.  Since this process is
+    trivially parallelizable (and well-parallelized within Salmon), more
+    threads generally equates to faster quantification. However, there may
+    still be a limit to the return on invested threads. Specifically, writing
+    to the mapping cache (see `Misc`_ below) is done via a single thread.  With
+    a huge number of quantification threads or in environments with a very slow
+    disk, this may become the limiting step. If you're certain that you have
+    more than the required number of observations, or if you have reason to
+    suspect that your disk is particularly slow on writes, then you can disable
+    the mapping cache (``--disableMappingCache``), and potentially increase the
+    parallelizability of lightweight-alignment-based Salmon.
+
+Lightweight-alignment-based mode
+--------------------------------
+
+One of the novel and innovative features of Salmon is its ability to accurately
+quantify transcripts using *lightweight* alignments.  Lightweight alignments
+are mappings of reads to transcript positions that are computed without
+performing a base-to-base alignment of the read to the transcript.  Lightweight 
+alignments are typically much faster to compute than traditional (or full)
+alignments, and can sometimes provide superior accuracy by being more robust 
+to errors in the read or genomic variation from the reference sequence.
+If you want to use Salmon in lightweight alignment-based mode, then you first
+have to build an Salmon index for your transcriptome.  Assume that
+``transcripts.fa`` contains the set of transcripts you wish to quantify. First,
+you run the Salmon indexer:
+
+::
+    
+    > ./bin/salmon index -t transcripts.fa -i transcripts_index
+
+Then, you can quantify any set of reads (say, paired-end reads in files
+`reads1.fa` and `reads2.fa`) directly against this index using the Salmon
+``quant`` command as follows:
+
+::
+
+    > ./bin/salmon quant -i transcripts_index -l <LIBTYPE> -1 reads1.fa -2 reads2.fa -o transcripts_quant
+
+You can, of course, pass a number of options to control things such as the
+number of threads used or the different cutoffs used for counting reads.
+Just as with the alignment-based mode, after Salmon has finished running, there
+will be a directory called ``salmon_quant``, that contains a file called
+``quant.sf`` containing the quantification results.
+
+Alignment-based mode
+--------------------
+
+Say that you've prepared your alignments using your favorite aligner and the
+results are in the file ``aln.bam``, and assume that the sequence of the
+transcriptome you want to quantify is in the file ``transcripts.fa``.  You
+would run Salmon as follows:
+
+::
+
+    > ./bin/salmon quant -t transcripts.fa -l <LIBTYPE> -a aln.bam -o salmon_quant
+
+The ``<LIBTYPE>`` parameter is described below and is shared between both modes
+of Salmon.  After Salmon has finished running, there will be a directory called
+``salmon_quant``, that contains a file called ``quant.sf``.  This contains the
+quantification results for the run, and the columns it contains are similar to
+those of Sailfish (and self-explanatory where they differ).
+
+For the full set of options that can be passed to Salmon in its alignment-based
+mode, and a description of each, run ``salmon quant --help-alignment``.
+
+.. note:: Genomic vs. Transcriptomic alignments
+
+    Salmon expects that the alignment files provided are with respect to the
+    transcripts given in the corresponding fasta file.  That is, Salmon expects
+    that the reads have been aligned directly to the transcriptome (like RSEM,
+    eXpress, etc.) rather than to the genome (as does, e.g. Cufflinks).  If you
+    have reads that have already been aligned to the genome, there are
+    currently 3 options for converting them for use with Salmon.  First, you
+    could convert the SAM/BAM file to a FAST{A/Q} file and then use the
+    lightweight-alignment-based mode of Salmon described below.  Second, given the converted
+    FASTA{A/Q} file, you could re-align these converted reads directly to the
+    transcripts with your favorite aligner and run Salmon in alignment-based
+    mode as described above.  Third, you could use a tool like `sam-xlate <https://github.com/mozack/ubu/wiki>`_
+    to try and convert the genome-coordinate BAM files directly into transcript 
+    coordinates.  This avoids the necessity of having to re-map the reads. However,
+    we have very limited experience with this tool so far.
+
+.. topic:: Multiple alignment files
+    
+    If your alignments for the sample you want to quantify appear in multiple 
+    .bam/.sam files, then you can simply provide the Salmon ``-a`` parameter 
+    with a (space-separated) list of these files.  Salmon will automatically 
+    read through these one after the other quantifying transcripts using the 
+    alignments contained therein.  However, it is currently the case that these
+    separate files must (1) all be of the same library type and (2) all be
+    aligned with respect to the same reference (i.e. the @SQ records in the 
+    header sections must be identical).
+
+
+What's this ``LIBTYPE``?
+------------------------
+
+Salmon, like sailfish, has the user provide a description of the type of
+sequencing library from which the reads come, and this contains information
+about e.g. the relative orientation of paired end reads.  However, we've
+replaced the somewhat esoteric description of the library type with a simple
+set of strings; each of which represents a different type of read library. This
+new method of specifying the type of read library is being back-ported into
+Sailfish and will be available in the next release.
+
+The library type string consists of three parts: the relative orientation of
+the reads, the strandedness of the library, and the directionality of the
+reads.
+
+The first part of the library string (relative orientation) is only provided if
+the library is paired-end. The possible options are:
+
+::
+
+    I = inward
+    O = outward
+    M = matching
+
+The second part of the read library string specifies whether the protocol is
+stranded or unstranded; the options are:
+
+::
+
+    S = stranded
+    U = unstranded
+
+If the protocol is unstranded, then we're done.  The final part of the library
+string specifies the strand from which the read originates in a strand-specific
+protocol — it is only provided if the library is stranded (i.e. if the
+library format string is of the form S).  The possible values are:
+
+::
+
+    F = read 1 (or single-end read) comes from the forward strand
+    R = read 1 (or single-end read) comes from the reverse strand
+
+An example of some library format strings and their interpretations are:
+
+::
+
+    IU (an unstranded paired-end library where the reads face each other)
+
+::
+
+    SF (a stranded single-end protocol where the reads come from the forward strand)
+
+::
+
+    OSR (a stranded paired-end protocol where the reads face away from each other,
+         read1 comes from reverse strand and read2 comes from the forward strand)
+
+
+.. note:: Strand Matching
+
+    Above, when it is said that the read "comes from" a strand, we mean that
+    the read should align with / map to that strand.  For example, for
+    libraries having the ``OSR`` protocol as described above, we expect that
+    read1 maps to the reverse strand, and read2 maps to the forward strand. 
+
+
+For more details on the library type, see :ref:`FragLibType`. 
+
+Output
+------
+
+Salmon writes its output in a simple tab-delimited file format.  Any line that begins 
+with a ``#`` is a comment line, and can be safely ignored.  Salmon records the files
+and options passed to it in comments at the top of its output file.  The last comment 
+line gives the names of each of the data columns. The columns appear in the following order: 
+
++------+--------+-----+------+----------+
+| Name | Length | TPM | FPKM | NumReads |
++------+--------+-----+------+----------+
+
+Each subsequent row described a single quantification record.  The columns have
+the following interpretation.
+
+* **Name** --- 
+  This is the name of the target transcript provided in the input transcript database (FASTA file). 
+
+* **Length** ---
+  This is the length of the target transcript in nucleotides.
+
+* **TPM** ---
+  This is salmon's estimate of the relative abundance of this transcript in units of Transcripts Per Million (TPM).
+  TPM is the recommended relative abundance measure to use for downstream analysis. 
+
+* **FPKM** ---
+  This is salmon's estimate of the relative abundance of this transcript in units of Fragments Per Kilobase per Million
+  mapped reads (FPKM).  This relative abundance measure is proportional, within-sample, to the TPM measure.  However, 
+  the TPM should generally be preferred to FPKM.  This column is provided mostly for compatibility with tools that expect
+  FPKM as input.
+
+* **NumReads** --- 
+  This is salmon's estimate of the number of reads mapping to each transcript that was quantified.  It is an "estimate" 
+  insofar as it is the expected number of reads that have originated from each transcript given the structure of the uniquely 
+  mapping and multi-mapping reads and the relative abundance estimates for each transcript.  You can round these values 
+  to the nearest integer and use them directly as input to count-based methods like 
+  `Deseq2 <http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html>`_ and 
+  `EdgeR <http://master.bioconductor.org/packages/release/bioc/html/edgeR.html>`_, among others.
+
+Misc
+----
+
+Salmon deals with reading from compressed read files in the same way as
+sailfish --- by using process substitution.  Say in the
+lightweigh-alignment-based salmon example above, the reads were actually in the
+files ``reads1.fa.gz`` and ``reads2.fa.gz``, then you'd run the following
+command to decompress the reads "on-the-fly":
+
+::
+
+    > ./bin/salmon quant -i transcripts_index -l <LIBTYPE> -1 <(gzcat reads1.fa.gz) -2 <(gzcat reads2.fa.gz) -o transcripts_quant
+
+and the gzipped files will be decompressed via separate processes and the raw
+reads will be fed into salmon.
+
+.. note:: The Mapping Cache 
+
+    Salmon requires a specific number of observations (fragments) to
+    be observed before it will report its quantification results.  If it 
+    doesn't see enough fragments when reading through the read files the 
+    first time, it will process the information again (don't worry; it's not 
+    double counting. The results from the first pass essentially become 
+    a "prior" for assigning the proper read counts in subsequent passes).
+
+    The first time the file is processed, the set of potential mappings for
+    each fragment is written to a temporary file in an efficient binary format
+    --- this file is called the mapping cache.  As soon as the required number
+    of obvservations have been seen, salmon stops writing to the mapping cache
+    (ensuring that the file size will not grow too large).  However, for
+    experiments with fewer than the required number of observations, the
+    mapping cache is a significant optimization over reading through the raw
+    set of reads multiple times.  First, the work of determining the potential
+    mapping locations for a read is only performed once, during the inital pass
+    through the file.  Second, since the mapping cache is implemented as a
+    regular file on disk, the information contained within a file can be
+    processed multiple times, even if the file itself is being produced via
+    e.g. process substitution as in the example above.
+    
+    You can control the required number of observations and thus, indirectly,
+    the maximum size of the mapping cache file, via the ``-n`` argument.
+    Note that the cache itself is considered a "temporary" file, and it is
+    removed from disk by salmon before the program terminates.  If you are
+    certain that your read library is large enough that you will observe the
+    required number of fragments in the first pass, or if you have some other 
+    reason to avoid creating the temporary mapping cache, it can disabled with
+    the ``--disableMappingCache`` flag.
+
+**Finally**, the purpose of making this beta executable (as well as the Salmon
+code) available is for people to use it and provide feedback.  A pre-print and
+manuscript are in the works, but the earlier we get feedback, thoughts,
+suggestions and ideas, the better!  So, if you have something useful to report
+or just some interesting ideas or suggestions, please contact us
+(`rob.patro at cs.stonybrook.edu` and/or `carlk at cs.cmu.edu`).  Also, please use
+the same e-mail addresses to contact us with any *detailed* bug-reports (though
+bug-support for these early beta versions may be slow).
diff --git a/external/.gitignore b/external/.gitignore
new file mode 100644
index 0000000..b8bd026
--- /dev/null
+++ b/external/.gitignore
@@ -0,0 +1,28 @@
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
diff --git a/include/AlignmentGroup.hpp b/include/AlignmentGroup.hpp
new file mode 100644
index 0000000..e9b1e94
--- /dev/null
+++ b/include/AlignmentGroup.hpp
@@ -0,0 +1,65 @@
+#ifndef ALIGNMENT_GROUP
+#define ALIGNMENT_GROUP
+
+extern "C" {
+#include "io_lib/scram.h"
+#include "io_lib/os.h"
+#undef max
+#undef min
+}
+
+
+// Cereal includes
+#include "cereal/types/vector.hpp"
+#include "cereal/archives/binary.hpp"
+
+#include <vector>
+#include "SalmonMath.hpp"
+#include "ReadPair.hpp"
+
+template <typename FragT>
+class AlignmentGroup {
+    public:
+        AlignmentGroup() : read_(nullptr), isUniquelyMapped_(true) { alignments_.reserve(10); }
+        AlignmentGroup(AlignmentGroup& other) = delete;
+        AlignmentGroup(AlignmentGroup&& other) = delete;
+        AlignmentGroup& operator=(AlignmentGroup& other) = delete;
+        AlignmentGroup& operator=(AlignmentGroup&& other) = delete;
+
+        void setRead(std::string* r) { read_ = r; }
+        std::string* read() { return read_; }
+
+        inline std::vector<FragT>& alignments() { return alignments_; }
+        void emplaceAlignment(FragT&& p) { alignments_.emplace_back(p); }
+        void addAlignment(FragT& p) { alignments_.push_back(p); }
+
+        void clearAlignments() {
+            alignments_.clear();
+            isUniquelyMapped_ = true;
+        }
+
+        inline bool& isUniquelyMapped() { return isUniquelyMapped_; }
+        inline size_t numAlignments() { return alignments_.size(); }
+        inline size_t size() { return numAlignments(); }
+
+        template <typename Archive>
+        void serialize(Archive& archive) {
+            archive(alignments_);
+        }
+
+        /**
+         *  Sort the alignments by their transcript ids
+         */
+        inline void sortHits() {
+            std::sort(alignments_.begin(), alignments_.end(),
+                    [](const FragT& x, const FragT& y) -> bool {
+                        return x->transcriptID() < y->transcriptID();
+                     });
+        }
+
+    private:
+        std::vector<FragT> alignments_;
+        std::string* read_;
+        bool isUniquelyMapped_;
+};
+#endif // ALIGNMENT_GROUP
diff --git a/include/AlignmentLibrary.hpp b/include/AlignmentLibrary.hpp
new file mode 100644
index 0000000..9387906
--- /dev/null
+++ b/include/AlignmentLibrary.hpp
@@ -0,0 +1,290 @@
+#ifndef ALIGNMENT_LIBRARY_HPP
+#define ALIGNMENT_LIBRARY_HPP
+
+extern "C" {
+#include "io_lib/scram.h"
+#include "io_lib/os.h"
+#undef max
+#undef min
+}
+
+// Our includes
+#include "ClusterForest.hpp"
+#include "Transcript.hpp"
+#include "BAMQueue.hpp"
+#include "SalmonUtils.hpp"
+#include "LibraryFormat.hpp"
+#include "SalmonOpts.hpp"
+#include "FragmentLengthDistribution.hpp"
+#include "FragmentStartPositionDistribution.hpp"
+#include "AlignmentGroup.hpp"
+#include "ErrorModel.hpp"
+#include "AlignmentModel.hpp"
+#include "FASTAParser.hpp"
+#include "concurrentqueue.h"
+#include "EquivalenceClassBuilder.hpp"
+
+// Boost includes
+#include <boost/filesystem.hpp>
+
+// Standard includes
+#include <vector>
+#include <memory>
+#include <functional>
+
+template <typename T>
+class NullFragmentFilter;
+
+/**
+  *  This class represents a library of alignments used to quantify
+  *  a set of target transcripts.  The AlignmentLibrary contains info
+  *  about both the alignment file and the target sequence (transcripts).
+  *  It is used to group them together and track information about them
+  *  during the quantification procedure.
+  */
+template <typename FragT>
+class AlignmentLibrary {
+
+    public:
+
+    AlignmentLibrary(std::vector<boost::filesystem::path>& alnFiles,
+                     const boost::filesystem::path& transcriptFile,
+                     LibraryFormat libFmt,
+                     SalmonOpts& salmonOpts) :
+        alignmentFiles_(alnFiles),
+        transcriptFile_(transcriptFile),
+        libFmt_(libFmt),
+        transcripts_(std::vector<Transcript>()),
+    	fragStartDists_(5),
+        seqBiasModel_(1.0),
+    	eqBuilder_(salmonOpts.jointLog),
+        quantificationPasses_(0) {
+            namespace bfs = boost::filesystem;
+
+            // Make sure the alignment file exists.
+            for (auto& alignmentFile : alignmentFiles_) {
+                if (!bfs::exists(alignmentFile)) {
+                    std::stringstream ss;
+                    ss << "The provided alignment file: " << alignmentFile <<
+                        " does not exist!\n";
+                    throw std::invalid_argument(ss.str());
+                }
+            }
+
+
+            // Make sure the transcript file exists.
+            if (!bfs::exists(transcriptFile_)) {
+                std::stringstream ss;
+                ss << "The provided transcript file: " << transcriptFile_ <<
+                    " does not exist!\n";
+                throw std::invalid_argument(ss.str());
+            }
+
+            // The alignment file existed, so create the alignment queue
+            size_t numParseThreads = salmonOpts.numParseThreads;
+            std::cerr << "parseThreads = " << numParseThreads << "\n";
+            bq = std::unique_ptr<BAMQueue<FragT>>(new BAMQueue<FragT>(alnFiles, libFmt_, numParseThreads,
+                                                                      salmonOpts.mappingCacheMemoryLimit));
+
+            std::cerr << "Checking that provided alignment files have consistent headers . . . ";
+            if (! salmon::utils::headersAreConsistent(bq->headers()) ) {
+                std::stringstream ss;
+                ss << "\nThe multiple alignment files provided had inconsistent headers.\n";
+                ss << "Currently, we require that if multiple SAM/BAM files are provided,\n";
+                ss << "they must have identical @SQ records.\n";
+                throw std::invalid_argument(ss.str());
+            }
+            std::cerr << "done\n";
+
+            SAM_hdr* header = bq->header();
+
+            // The transcript file existed, so load up the transcripts
+            double alpha = 0.005;
+            for (size_t i = 0; i < header->nref; ++i) {
+                transcripts_.emplace_back(i, header->ref[i].name, header->ref[i].len, alpha);
+            }
+
+            FASTAParser fp(transcriptFile.string());
+
+            fmt::print(stderr, "Populating targets from aln = {}, fasta = {} . . .",
+                       alnFiles.front(), transcriptFile_);
+            fp.populateTargets(transcripts_);
+	    for (auto& txp : transcripts_) {
+		    // Length classes taken from
+		    // ======
+		    // Roberts, Adam, et al.
+		    // "Improving RNA-Seq expression estimates by correcting for fragment bias."
+		    // Genome Biol 12.3 (2011): R22.
+		    // ======
+		    // perhaps, define these in a more data-driven way
+		    if (txp.RefLength <= 1334) {
+			    txp.lengthClassIndex(0);
+		    } else if (txp.RefLength <= 2104) {
+			    txp.lengthClassIndex(0);
+		    } else if (txp.RefLength <= 2988) {
+			    txp.lengthClassIndex(0);
+		    } else if (txp.RefLength <= 4389) {
+			    txp.lengthClassIndex(0);
+		    } else {
+			    txp.lengthClassIndex(0);
+		    }
+	    }
+            fmt::print(stderr, "done\n");
+
+            // Create the cluster forest for this set of transcripts
+            clusters_.reset(new ClusterForest(transcripts_.size(), transcripts_));
+
+            // Initialize the fragment length distribution
+            size_t maxFragLen = salmonOpts.fragLenDistMax;
+            size_t meanFragLen = salmonOpts.fragLenDistPriorMean;
+            size_t fragLenStd = salmonOpts.fragLenDistPriorSD;
+            size_t fragLenKernelN = 4;
+            double fragLenKernelP = 0.5;
+            flDist_.reset(new
+                    FragmentLengthDistribution(
+                    1.0, maxFragLen,
+                    meanFragLen, fragLenStd,
+                    fragLenKernelN,
+                    fragLenKernelP, 1)
+                    );
+
+            alnMod_.reset(new AlignmentModel(1.0, salmonOpts.numErrorBins));
+            alnMod_->setLogger(salmonOpts.jointLog);
+
+            // Start parsing the alignments
+           NullFragmentFilter<FragT>* nff = nullptr;
+           bool onlyProcessAmbiguousAlignments = false;
+           bq->start(nff, onlyProcessAmbiguousAlignments);
+        }
+
+    EquivalenceClassBuilder& equivalenceClassBuilder() {
+        return eqBuilder_;
+    }
+
+    std::vector<Transcript>& transcripts() { return transcripts_; }
+
+    inline bool getAlignmentGroup(AlignmentGroup<FragT>*& ag) { return bq->getAlignmentGroup(ag); }
+
+    //inline t_pool* threadPool() { return threadPool_.get(); }
+
+    inline SAM_hdr* header() { return bq->header(); }
+
+    inline std::vector<FragmentStartPositionDistribution>& fragmentStartPositionDistributions() {
+	    return fragStartDists_;
+    }
+
+    inline FragmentLengthDistribution& fragmentLengthDistribution() {
+        return *flDist_.get();
+    }
+
+    inline AlignmentModel& alignmentModel() {
+        return *alnMod_.get();
+    }
+
+    SequenceBiasModel& sequenceBiasModel() {
+        return seqBiasModel_;
+    }
+
+//    inline tbb::concurrent_queue<FragT*>& fragmentQueue() {
+    inline tbb::concurrent_queue<FragT*>& fragmentQueue() {
+        return bq->getFragmentQueue();
+    }
+
+//    inline tbb::concurrent_bounded_queue<AlignmentGroup<FragT*>*>& alignmentGroupQueue() {
+    inline moodycamel::ConcurrentQueue<AlignmentGroup<FragT*>*>& alignmentGroupQueue() {
+        return bq->getAlignmentGroupQueue();
+    }
+
+    inline BAMQueue<FragT>& getAlignmentGroupQueue() { return *bq.get(); }
+
+    inline size_t upperBoundHits() { return bq->numMappedReads(); }
+    inline size_t numMappedReads() { return bq->numMappedReads(); }
+    inline size_t numUniquelyMappedReads() { return bq->numUniquelyMappedReads(); }
+
+    //const boost::filesystem::path& alignmentFile() { return alignmentFile_; }
+
+    ClusterForest& clusterForest() { return *clusters_.get(); }
+
+    template <typename FilterT>
+    bool reset(bool incPasses=true, FilterT filter=nullptr, bool onlyProcessAmbiguousAlignments=false) {
+        namespace bfs = boost::filesystem;
+
+        for (auto& alignmentFile : alignmentFiles_) {
+            if (!bfs::is_regular_file(alignmentFile)) {
+                return false;
+            }
+        }
+
+        bq->reset();
+        bq->start(filter, onlyProcessAmbiguousAlignments);
+        if (incPasses) {
+            quantificationPasses_++;
+            fmt::print(stderr, "Current iteration = {}\n", quantificationPasses_);
+        }
+        return true;
+    }
+
+    inline LibraryFormat format() { return libFmt_; }
+
+    private:
+    /**
+     * The file from which the alignments will be read.
+     * This can be a SAM or BAM file, and can be a regular
+     * file or a fifo.
+     */
+    std::vector<boost::filesystem::path> alignmentFiles_;
+    /**
+     * The file from which the transcripts are read.
+     * This is expected to be a FASTA format file.
+     */
+    boost::filesystem::path transcriptFile_;
+    /**
+     * Describes the expected format of the sequencing
+     * fragment library.
+     */
+    LibraryFormat libFmt_;
+    /**
+     * The targets (transcripts) to be quantified.
+     */
+    std::vector<Transcript> transcripts_;
+    /**
+     * A pointer to the queue from which the fragments
+     * will be read.
+     */
+    //std::unique_ptr<t_pool, std::function<void(t_pool*)>> threadPool_;
+    std::unique_ptr<BAMQueue<FragT>> bq;
+
+    SequenceBiasModel seqBiasModel_;
+
+    /**
+     * The cluster forest maintains the dynamic relationship
+     * defined by transcripts and reads --- if two transcripts
+     * share an ambiguously mapped read, then they are placed
+     * in the same cluster.
+     */
+    std::unique_ptr<ClusterForest> clusters_;
+
+    /**
+      * The emperical fragment start position distribution
+      */
+    std::vector<FragmentStartPositionDistribution> fragStartDists_;
+
+    /**
+     * The emperical fragment length distribution.
+     *
+     */
+    std::unique_ptr<FragmentLengthDistribution> flDist_;
+    /**
+      * The emperical error model
+      */
+    std::unique_ptr<AlignmentModel> alnMod_;
+
+    /** Keeps track of the number of passes that have been
+     *  made through the alignment file.
+     */
+    size_t quantificationPasses_;
+
+    EquivalenceClassBuilder eqBuilder_;
+};
+
+#endif // ALIGNMENT_LIBRARY_HPP
diff --git a/include/AlignmentModel.hpp b/include/AlignmentModel.hpp
new file mode 100644
index 0000000..45785ff
--- /dev/null
+++ b/include/AlignmentModel.hpp
@@ -0,0 +1,111 @@
+#ifndef ALIGNMENT_MODEL
+#define ALIGNMENT_MODEL
+
+#include <mutex>
+#include <atomic>
+#include <memory>
+
+// logger includes
+#include "spdlog/spdlog.h"
+
+#include "tbb/concurrent_vector.h"
+#include "AtomicMatrix.hpp"
+
+extern "C" {
+#include "io_lib/scram.h"
+#include "io_lib/os.h"
+#undef max
+#undef min
+}
+
+struct UnpairedRead;
+struct ReadPair;
+class Transcript;
+
+class AlignmentModel{
+public:
+    AlignmentModel(double alpha, uint32_t readBins = 4);
+    bool burnedIn();
+    void burnedIn(bool burnedIn);
+
+    /**
+    *  For unpaired reads, update the error model to account
+    *  for errors we've observed in this read pair.
+    */
+    void update(const UnpairedRead&, Transcript& ref, double p, double mass);
+
+    /**
+      * Compute the log-likelihood of the observed unpaired alignment given the
+      * current error model.
+      */
+    double logLikelihood(const UnpairedRead&, Transcript& ref);
+
+    /**
+    *  For paired-end reads, update the error model to account
+    *  for errors we've observed in this read pair.
+    */
+    void update(const ReadPair&, Transcript& ref, double p, double mass);
+
+    /**
+     * Compute the log-likelihood of the observed paire-end alignment given the
+     * current error model.
+     */
+    double logLikelihood(const ReadPair&, Transcript& ref);
+
+
+    bool hasIndel(UnpairedRead& r);
+    bool hasIndel(ReadPair & r);
+
+    void setLogger(std::shared_ptr<spdlog::logger> logger);
+    bool hasLogger();
+
+    void normalize();
+
+private:
+
+    enum AlignmentModelChar {
+        ALN_A = 0,
+        ALN_C = 1,
+        ALN_G = 2,
+        ALN_T = 3,
+        ALN_DASH = 4,
+        ALN_SOFT_CLIP = 5,
+        ALN_HARD_CLIP = 6,
+        ALN_PAD = 7,
+        ALN_REF_SKIP = 8
+    };
+
+    void setBasesFromCIGAROp_(enum cigar_op op, size_t& curRefBase, size_t& curReadBase);
+                              //std::stringstream& readStr, std::stringstream& matchStr,
+                              //std::stringstream& refstr);
+
+    // 11 states --- the 9 listed in "AlignmentModelChar" plus START (9) i
+    // and END (10)
+    constexpr static uint32_t numAlignmentStates() { return 82; }
+    constexpr static uint32_t numStates = 9;
+    constexpr static uint32_t startStateIdx = 81;
+
+    /**
+     * These functions, which work directly on bam_seq_t* types, drive the
+     * update() and logLikelihood() methods above.
+     */
+    void update(bam_seq_t* read, Transcript& ref, double p, double mass, std::vector<AtomicMatrix<double>>& mismatchProfile);
+    double logLikelihood(bam_seq_t* read, Transcript& ref, std::vector<AtomicMatrix<double>>& mismatchProfile);
+    bool hasIndel(bam_seq_t* r);
+
+    // NOTE: Do these need to be concurrent_vectors as before?
+    // Store the mismatch probability tables for the left and right reads
+    std::vector<AtomicMatrix<double>> transitionProbsLeft_;
+    std::vector<AtomicMatrix<double>> transitionProbsRight_;
+
+    bool isEnabled_;
+    //size_t maxLen_;
+    size_t readBins_;
+    std::atomic<bool> burnedIn_;
+    std::shared_ptr<spdlog::logger> logger_;
+    // Maintain a mutex in case the error model wants to talk to the
+    // console / log.
+    std::mutex outputMutex_;
+};
+
+#endif // ERROR_MODEL
diff --git a/include/AtomicMatrix.hpp b/include/AtomicMatrix.hpp
new file mode 100644
index 0000000..0a6f05a
--- /dev/null
+++ b/include/AtomicMatrix.hpp
@@ -0,0 +1,118 @@
+#ifndef ATOMIC_MATRIX
+#define ATOMIC_MATRIX
+
+#include "tbb/concurrent_vector.h"
+#include "tbb/atomic.h"
+
+#include "SalmonMath.hpp"
+
+#include <vector>
+
+template <typename T>
+class AtomicMatrix {
+public:
+    AtomicMatrix(size_t nRow, size_t nCol, T alpha, bool logSpace = true) :
+        storage_(nRow * nCol, logSpace ? std::log(alpha) : alpha),
+        rowsums_(nRow, logSpace ? std::log(nCol * alpha) : nCol * alpha),
+        nRow_(nRow),
+        nCol_(nCol),
+        alpha_(alpha),
+        logSpace_(logSpace) {
+    }
+
+    void incrementUnnormalized(size_t rowInd, size_t colInd, T amt) {
+        using salmon::math::logAdd;
+        size_t k = rowInd * nCol_ + colInd;
+        if (logSpace_) {
+            T oldVal = storage_[k];
+            T retVal = oldVal;
+            T newVal = logAdd(oldVal, amt);
+            do {
+                oldVal = retVal;
+                newVal = logAdd(oldVal, amt);
+                retVal = storage_[k].compare_and_swap(newVal, oldVal);
+            } while (retVal != oldVal);
+
+        } else {
+            T oldVal = storage_[k];
+            T retVal = oldVal;
+            T newVal = oldVal + amt;
+            do {
+                oldVal = retVal;
+                newVal = oldVal + amt;
+                retVal = storage_[k].compare_and_swap(newVal, oldVal);
+            } while (retVal != oldVal);
+        }
+    }
+
+    void computeRowSums() {
+        for (size_t rowInd = 0; rowInd < nRow_; ++rowInd) {
+            T rowSum = salmon::math::LOG_0;
+            for (size_t colInd = 0; colInd < nCol_; ++colInd) {
+                size_t k = rowInd * nCol_ + colInd;
+                rowSum = logAdd(rowSum, storage_[k]);
+            }
+            rowsums_[rowInd] = rowSum;
+        }
+    }
+
+    void increment(size_t rowInd, size_t colInd, T amt) {
+        using salmon::math::logAdd;
+        size_t k = rowInd * nCol_ + colInd;
+        if (logSpace_) {
+            T oldVal = storage_[k];
+            T retVal = oldVal;
+            T newVal = logAdd(oldVal, amt);
+            do {
+                oldVal = retVal;
+                newVal = logAdd(oldVal, amt);
+                retVal = storage_[k].compare_and_swap(newVal, oldVal);
+            } while (retVal != oldVal);
+
+            oldVal = rowsums_[rowInd];
+            retVal = oldVal;
+            newVal = logAdd(oldVal, amt);
+            do {
+                oldVal = retVal;
+                newVal = logAdd(oldVal, amt);
+                retVal = rowsums_[rowInd].compare_and_swap(newVal, oldVal);
+            } while (retVal != oldVal);
+        } else {
+            T oldVal = storage_[k];
+            T retVal = oldVal;
+            T newVal = oldVal + amt;
+            do {
+                oldVal = retVal;
+                newVal = oldVal + amt;
+                retVal = storage_[k].compare_and_swap(newVal, oldVal);
+            } while (retVal != oldVal);
+
+            oldVal = rowsums_[rowInd];
+            retVal = oldVal;
+            newVal = oldVal + amt;
+            do {
+                oldVal = retVal;
+                newVal = oldVal + amt;
+                retVal = rowsums_[rowInd].compare_and_swap(newVal, oldVal);
+            } while (retVal != oldVal);
+        }
+    }
+
+    T operator()(size_t rowInd, size_t colInd, bool normalized = true) {
+        size_t k = rowInd * nCol_ + colInd;
+        if (logSpace_) {
+            return storage_[k] - rowsums_[rowInd];
+        } else {
+            return storage_[k] / rowsums_[rowInd];
+        }
+    }
+
+private:
+    std::vector<tbb::atomic<T>> storage_;
+    std::vector<tbb::atomic<T>> rowsums_;
+    size_t nRow_, nCol_;
+    T alpha_;
+    bool logSpace_;
+};
+
+#endif // ATOMIC_MATRIX
diff --git a/include/BAMQueue.hpp b/include/BAMQueue.hpp
new file mode 100644
index 0000000..67122aa
--- /dev/null
+++ b/include/BAMQueue.hpp
@@ -0,0 +1,147 @@
+#ifndef __BAMQUEUE_HPP__
+#define __BAMQUEUE_HPP__
+
+//#include <boost/lockfree/spsc_queue.hpp>
+//#include <boost/lockfree/queue.hpp>
+#include <tbb/atomic.h>
+#include <iostream>
+#include <fstream>
+#include <atomic>
+#include <vector>
+#include <memory>
+#include <exception>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+
+#include <boost/timer/timer.hpp>
+#include <boost/filesystem.hpp>
+#include <tbb/concurrent_queue.h>
+#include "AlignmentGroup.hpp"
+#include "LibraryFormat.hpp"
+#include "SalmonMath.hpp"
+#include "ReadPair.hpp"
+#include "UnpairedRead.hpp"
+#include "spdlog/spdlog.h"
+#include "concurrentqueue.h"
+#include "readerwriterqueue.h"
+
+extern "C" {
+#include "io_lib/scram.h"
+#include "io_lib/os.h"
+#undef max
+#undef min
+}
+
+
+
+/**
+  * Simple structure holding info about the alignment file.
+  */
+struct AlignmentFile {
+    boost::filesystem::path fileName;
+    std::string readMode;
+    scram_fd* fp;
+    SAM_hdr* header;
+    uint32_t numParseThreads;
+};
+
+/**
+  * A queue from which to draw BAM alignments.  The queue is thread-safe, and
+  * can be written to and read from multiple threads.
+  *
+  * This class is templated on LibT --- the type of read library from which
+  * the provided alignments are being generated.
+  */
+template <typename FragT>
+class BAMQueue {
+public:
+  BAMQueue(std::vector<boost::filesystem::path>& fnames, LibraryFormat& libFmt,
+           uint32_t numParseThreads, uint32_t cacheSize);
+  ~BAMQueue();
+  void forceEndParsing();
+
+  SAM_hdr* header();
+  SAM_hdr* safeHeader();
+
+  std::vector<SAM_hdr*> headers();
+
+  template <typename FilterT>
+  void start(FilterT filt, bool onlyProcessAmbiguousAlignments = false);
+
+  inline bool getAlignmentGroup(AlignmentGroup<FragT*>*& group);
+
+  // Return the number of reads processed so far by the queue
+  size_t numObservedReads();
+  size_t numMappedReads();
+  size_t numUniquelyMappedReads();
+
+  void reset();
+
+  tbb::concurrent_queue<FragT*>& getFragmentQueue();
+  //moodycamel::ConcurrentQueue<FragT*>& getFragmentQueue();
+
+  //tbb::concurrent_bounded_queue<AlignmentGroup<FragT*>*>& getAlignmentGroupQueue();
+  moodycamel::ConcurrentQueue<AlignmentGroup<FragT*>*>& getAlignmentGroupQueue();
+
+private:
+  size_t popNum{0};
+  /** Fill the queue with the appropriate type of alignment
+   * depending on the template paramater T
+   */
+  template <typename FilterT>
+  void fillQueue_(FilterT, bool);
+
+  /** Overload of getFrag_ for paired-end reads */
+  template <typename FilterT>
+  inline bool getFrag_(ReadPair& rpair, FilterT filt);
+  /** Overload of getFrag_ for single-end reads */
+  template <typename FilterT>
+  inline bool getFrag_(UnpairedRead& sread, FilterT filt);
+
+public:
+  bool verbose=false;
+private:
+  std::vector<AlignmentFile> files_;
+  std::string fname_;
+  LibraryFormat libFmt_;
+
+  std::vector<AlignmentFile>::iterator currFile_;
+  scram_fd* fp_ = nullptr;
+  SAM_hdr* hdr_ = nullptr;
+
+  //htsFile* fp_ = nullptr;
+  size_t totalReads_;
+  size_t numUnaligned_;
+  size_t numMappedReads_;
+  size_t numUniquelyMappedReads_;
+  tbb::concurrent_queue<FragT*> fragmentQueue_;
+  //moodycamel::ConcurrentQueue<FragT*> fragmentQueue_;
+
+  //tbb::concurrent_bounded_queue<AlignmentGroup<FragT*>*> alnGroupPool_;
+  moodycamel::ConcurrentQueue<AlignmentGroup<FragT*>*> alnGroupPool_;
+
+  //tbb::concurrent_bounded_queue<AlignmentGroup<FragT*>*> alnGroupQueue_;
+  moodycamel::ReaderWriterQueue<AlignmentGroup<FragT*>*> alnGroupQueue_;
+
+  /*
+  boost::lockfree::spsc_queue<AlignmentGroup<FragT*>*,
+                              boost::lockfree::capacity<65535>> alnGroupQueue_;
+                              */
+  volatile bool doneParsing_;
+  volatile bool exhaustedAlnGroupPool_;
+  std::unique_ptr<std::thread> parsingThread_;
+  std::shared_ptr<spdlog::logger> logger_;
+
+  size_t batchNum_;
+  std::string readMode_;
+/*
+#if not defined(__APPLE__)
+   std::mutex agMutex_;
+    std::condition_variable workAvailable_;
+#endif
+*/
+};
+
+#include "BAMQueue.tpp"
+#endif //__BAMQUEUE_HPP__
diff --git a/include/BAMQueue.tpp b/include/BAMQueue.tpp
new file mode 100644
index 0000000..53aac9f
--- /dev/null
+++ b/include/BAMQueue.tpp
@@ -0,0 +1,819 @@
+#include "BAMQueue.hpp"
+#include "IOUtils.hpp"
+#include <boost/config.hpp> // for BOOST_LIKELY/BOOST_UNLIKELY
+#include <chrono>
+
+template <typename FragT>
+BAMQueue<FragT>::BAMQueue(std::vector<boost::filesystem::path>& fnames, LibraryFormat& libFmt,
+                          uint32_t numParseThreads, uint32_t cacheSize):
+    files_(std::vector<AlignmentFile>()),
+    libFmt_(libFmt), totalReads_(0),
+    numUnaligned_(0), numMappedReads_(0), 
+    numUniquelyMappedReads_(0),
+    //fragmentQueue_(2000000),
+    alnGroupPool_(2000000),
+    alnGroupQueue_(1000000),
+    doneParsing_(false),
+    exhaustedAlnGroupPool_(false) {
+        namespace bfs = boost::filesystem;
+
+        logger_ = spdlog::get("jointLog");
+
+        uint32_t localCacheSize = std::max(uint32_t{2000000}, cacheSize);
+        uint32_t capacity{localCacheSize};
+        for (size_t i = 0; i < capacity; ++i) {
+            // avoid r-value ref until we figure out what's
+            // up with TBB 4.3
+            auto* fragPtr = new FragT;
+            fragmentQueue_.push(fragPtr);
+        }
+
+        size_t groupCapacity = localCacheSize;
+        //alnGroupPool_.set_capacity(groupCapacity);
+        for (size_t i = 0; i < groupCapacity; ++i) {
+            // avoid r-value ref until we figure out what's
+            // up with TBB 4.3
+            auto* agrpPtr = new AlignmentGroup<FragT*>; 
+            alnGroupPool_.enqueue(agrpPtr);
+        }
+
+        bool firstFile = true;
+        for (auto& fname : fnames) {
+            if (bfs::is_regular_file(fname)) {
+               if (bfs::is_empty(fname)) {
+                    logger_->error("file [{}] appears to be empty "
+                        "(i.e. it has size 0).  This is likely an error. "
+                        "Please re-run salmon with a corrected input file.\n\n", 
+                        fname);
+                    std::exit(1);
+               }
+            }
+            readMode_ = "r";
+            if (fname.extension() == ".bam") {
+                readMode_ = "rb";
+            }
+            auto* fp = scram_open(fname.c_str(), readMode_.c_str());
+            // If this is the first file, then we'll be parsing it soon.
+            // set the number of parse threads.
+            if (firstFile) {
+                scram_set_option(fp, CRAM_OPT_NTHREADS, numParseThreads);
+            }
+            auto* header = scram_get_header(fp);
+            sam_hdr_incr_ref(header);
+            // If this isn't the first file, then close it.
+            // We'll open it again when we need it.
+            if (!firstFile) {
+                scram_close(fp);
+                fp = nullptr;
+            }
+            files_.push_back({fname, readMode_, fp, header, numParseThreads});
+            firstFile = false;
+        }
+}
+
+template <typename FragT>
+void BAMQueue<FragT>::reset() {
+  fmt::print(stderr, "Resetting BAMQueue from file(s) [ ");
+  parsingThread_->join();
+  for (auto& file : files_) {
+      fmt::print(stderr, "{} ", file.fileName);
+      // make sure that all of the current files are closed
+      if (file.fp != nullptr) {
+          scram_close(file.fp);
+          // but make sure we still have a reference to the header!
+          if (file.header == nullptr or file.header->ref_count <= 0) {
+              fmt::MemoryWriter errstr;
+              errstr << "The header for file " << file.fileName.c_str() 
+                     << " was deleted.  This should not happen! exiting!\n";
+              logger_->warn() << errstr.str();
+              std::exit(1);
+          }
+      }
+  }
+
+  // re-open the first file
+  auto& file = files_.front();
+  file.fp = scram_open(file.fileName.c_str(), file.readMode.c_str());
+
+  // If we couldn't open the file, then report this and exit.
+  if (file.fp == NULL) {
+    fmt::MemoryWriter errstr;
+    errstr << "ERROR: Failed to open file " << file.fileName.c_str() << ", exiting!\n";
+    logger_->warn() << errstr.str();
+    std::exit(1);
+  }
+  scram_set_option(file.fp, CRAM_OPT_NTHREADS, file.numParseThreads);
+
+  fmt::print(stderr, "] . . . done\n");
+  totalReads_ = 0;
+  numUnaligned_ = 0;
+  numMappedReads_ = 0;
+  numUniquelyMappedReads_ = 0;
+  doneParsing_ = false;
+  batchNum_ = 0;
+}
+
+template <typename FragT>
+BAMQueue<FragT>::~BAMQueue() {
+    fmt::print(stderr, "\nFreeing memory used by read queue . . . ");
+    parsingThread_->join();
+    fmt::print(stderr, "\nJoined parsing thread . . . ");
+  
+    for (auto& file : files_) {
+        fmt::print(stderr, "{} ", file.fileName);
+        // make sure that all of the current files are closed
+        if (file.fp != nullptr) {
+            scram_close(file.fp);
+            file.fp = nullptr;
+       }
+       // free the remaining reference to the header
+       if (file.header == nullptr or file.header->ref_count <= 0) {
+            fmt::MemoryWriter errstr;
+            errstr << "The header for file " << file.fileName.c_str() 
+                << " was deleted.  This should not happen! exiting!\n";
+            logger_->warn() << errstr.str();
+            std::exit(1);
+        } else {
+            sam_hdr_decr_ref(file.header); 
+            file.header = nullptr;
+        }
+    }
+
+    fmt::print(stderr, "\nClosed all files . . . ");
+    // Free the structure holding all of the reads
+    FragT* frag;
+    //while (!fragmentQueue_.empty()) { 
+    while (fragmentQueue_.try_pop(frag)) { 
+        delete frag;
+        frag = nullptr;
+    }
+    //}
+    fmt::print(stderr, "\nEmptied frag queue. . . ");
+
+    AlignmentGroup<FragT*>* grp;
+    //while(!alnGroupPool_.empty()) { alnGroupPool_.pop(grp); delete grp; grp = nullptr; }
+    while(alnGroupPool_.try_dequeue(grp)) { delete grp; grp = nullptr; }
+    fmt::print(stderr, "\nEmptied Alignemnt Group Pool. . ");
+    while(alnGroupQueue_.try_dequeue(grp)) { delete grp; grp = nullptr; }
+    fmt::print(stderr, "\nEmptied Alignment Group Queue. . . ");
+    fmt::print(stderr, "done\n");
+}
+
+/*
+ * Tries _numTries_ times to get work from _workQueue_.  It returns
+ * true immediately if it was able to find work, and false otherwise.
+ */
+template <typename FragT>
+inline bool tryToGetWork(moodycamel::ReaderWriterQueue<AlignmentGroup<FragT*>*>& workQueue,
+                         AlignmentGroup<FragT*>*& group,
+                         uint32_t numTries) {
+    uint32_t attempts{1};
+    bool foundWork = workQueue.try_dequeue(group);
+    while (!foundWork and attempts < numTries) {
+        foundWork = workQueue.try_dequeue(group);
+        ++attempts;
+    }
+    return foundWork;
+}
+
+
+
+
+template <typename FragT>
+inline bool BAMQueue<FragT>::getAlignmentGroup(AlignmentGroup<FragT*>*& group) {
+    while (!doneParsing_) {
+        while (alnGroupQueue_.try_dequeue(group)) {
+            return true;
+        }
+/*
+#if not defined(__APPLE__)
+{
+            std::unique_lock<std::mutex> l(agMutex_);
+            workAvailable_.wait(l, [&group, this]() { 
+                                    return this->alnGroupQueue_.try_dequeue(group) or this->doneParsing_; 
+                                });
+            return !doneParsing_;
+}
+#endif
+    }
+*/
+    }
+
+    while (alnGroupQueue_.try_dequeue(group)) {
+        return true;
+    }
+    return false;
+}
+
+template <typename FragT>
+void BAMQueue<FragT>::forceEndParsing() { doneParsing_ = true; }
+
+template <typename FragT>
+SAM_hdr* BAMQueue<FragT>::header() { return files_.front().header; } 
+
+template <typename FragT>
+std::vector<SAM_hdr*> BAMQueue<FragT>::headers() { 
+    std::vector<SAM_hdr*> hs;
+    for (auto& file : files_) {
+        hs.push_back(file.header);
+    }
+    return hs;
+}
+
+template <typename FragT>
+template <typename FilterT>
+void BAMQueue<FragT>::start(FilterT filt, bool onlyProcessAmbiguousAlignments) {
+    // Start the parsing thread that will fill the queue
+    parsingThread_.reset(new std::thread([this, filt, onlyProcessAmbiguousAlignments]()-> void {
+            this->fillQueue_(filt, onlyProcessAmbiguousAlignments);
+    }));
+}
+
+template <typename FragT>
+tbb::concurrent_queue<FragT*>& BAMQueue<FragT>::getFragmentQueue() {
+//moodycamel::ConcurrentQueue<FragT*>& BAMQueue<FragT>::getFragmentQueue() {
+    return fragmentQueue_;
+}
+
+template <typename FragT>
+//tbb::concurrent_bounded_queue<AlignmentGroup<FragT*>*>& BAMQueue<FragT>::getAlignmentGroupQueue() {
+moodycamel::ConcurrentQueue<AlignmentGroup<FragT*>*>& BAMQueue<FragT>::getAlignmentGroupQueue() {
+    return alnGroupPool_;
+}
+
+inline bool checkProperPairedNames_(const char* qname1, const char* qname2, const uint32_t nameLen) {
+    bool same{true};
+    bool sameEnd{true};
+    if (BOOST_LIKELY(nameLen > 1)) {
+        same = (memcmp(qname1, qname2, nameLen - 1) == 0);
+        sameEnd = qname1[nameLen-1] == qname2[nameLen-1];
+        same = same and (sameEnd or qname1[nameLen-1] == '1' or qname1[nameLen-1] == '2');
+        same = same and (sameEnd or qname2[nameLen-1] == '2' or qname2[nameLen-1] == '1');
+    } else {
+        same = (memcmp(qname1, qname2, nameLen) == 0);
+    }
+    return same;
+}
+
+
+/**
+* This set of functions checks if two reads have the same name.  If the reads
+* are paired-end, it allows them to differ in the last character if they end in 1/2.
+* If the reads are single-end, the strings must be identical.
+*/
+template <typename T>
+inline bool sameReadName_(const char* qname1, const char* qname2, const uint32_t nameLen); // (OSX) clang can't handle this? = delete;
+
+
+// Specialization for unpaired reads.
+template <>
+inline bool sameReadName_<UnpairedRead>(const char* qname1, const char* qname2, const uint32_t nameLen) {
+    return (memcmp(qname1, qname2, nameLen) == 0);
+}
+
+// Specialization for paired reads.
+template <>
+inline bool sameReadName_<ReadPair>(const char* qname1, const char* qname2, const uint32_t nameLen) {
+    return checkProperPairedNames_(qname1, qname2, nameLen);
+}
+
+enum class AlignmentType : uint8_t {
+    UnmappedOrphan = 0,
+    MappedOrphan = 1,
+    MappedDiscordantPair = 2,
+    MappedConcordantPair = 3,
+    UnmappedPair = 4
+};
+
+inline AlignmentType getPairedAlignmentType_(bam_seq_t* aln) {
+    bool readIsMapped = !(bam_flag(aln) & BAM_FUNMAP);
+    bool mateIsMapped = !(bam_flag(aln) & BAM_FMUNMAP);
+
+    if (readIsMapped and mateIsMapped) {
+        if ( bam_flag(aln) & BAM_FPROPER_PAIR ) {
+            if (bam_ref(aln) == bam_mate_ref(aln)) {
+                return AlignmentType::MappedConcordantPair;
+            } else {
+                // NOTE: It seems like some aligners (e.g. SNAP), currently mark
+                // discordant reads as proper pairs in the alignment file.  Here,
+                // we check that both pairs map to the same target.
+                // If the aligner insists they are both mapped, but reports 
+                // alignments to different targets, we will treat them as 
+                // mapped orphans 
+                return AlignmentType::MappedOrphan;
+            }
+        } else {
+            // FIXME
+            // NOTE: Since (currently) discordant pairs can cause us to
+            // drop alignments that should be sampled, treat these guys 
+            // as orphans for the time being.
+            // return AlignmentType::MappedDiscordantPair;
+            return AlignmentType::MappedOrphan;
+        }
+    }
+
+    if (readIsMapped and !mateIsMapped) {
+        return AlignmentType::MappedOrphan;
+    }
+    if (mateIsMapped and !readIsMapped) {
+        return AlignmentType::UnmappedOrphan;
+    }
+    if (!mateIsMapped and !readIsMapped) {
+        return AlignmentType::UnmappedPair;
+    }
+    std::cerr << "\n\n\nEncountered unknown alignemnt type; this should not happen!\n"
+              << "Please file a bug report on GitHub. Exiting.\n";
+    std::exit(1);
+}
+
+inline uint32_t getPairedNameLen(bam_seq_t* read) {
+        uint32_t l = bam_name_len(read);
+        char* r = bam_name(read);
+        if ( l > 2  and r[l-2] == '/') {
+            return l-2;
+        }
+        return l;
+}
+
+template <typename FragT>
+template <typename FilterT>
+inline bool BAMQueue<FragT>::getFrag_(ReadPair& rpair, FilterT filt) {
+    bool haveValidPair{false};
+    bool didRead1{false};
+    bool didRead2{false};
+    rpair.orphanStatus = salmon::utils::OrphanStatus::LeftOrphan;
+    while (!haveValidPair) {
+        // Consume a single read
+        didRead1 = (scram_get_seq(fp_, &rpair.read1) >= 0);
+        AlignmentType alnType;
+        // If we were able to obtain a read, determine what type
+        // of alignment it came from.
+        while (didRead1) {
+            alnType = getPairedAlignmentType_(rpair.read1);
+            // If this read is part of a concordantly mapped pair, or an
+            // unmapped pair, then go get the other read.
+            if ( (alnType == AlignmentType::MappedConcordantPair)
+                    or (alnType == AlignmentType::UnmappedPair)) { break; }
+            
+            bool isFwd{false};
+            uint32_t startPos;
+
+            switch (alnType) {
+                case AlignmentType::UnmappedOrphan:
+                    ++numUnaligned_;
+                    if (filt != nullptr) {
+                        rpair.orphanStatus = salmon::utils::OrphanStatus::LeftOrphan;
+                        filt->processFrag(&rpair);
+                    }
+                    break;
+                    // === end of UnmappedOrphan case
+                case AlignmentType::MappedOrphan:
+                    isFwd = !(bam_flag(rpair.read1) & BAM_FREVERSE);
+                    startPos = bam_pos(rpair.read1); 
+                    rpair.libFmt = salmon::utils::hitType(startPos, isFwd);
+                    rpair.orphanStatus = (!isFwd) ?
+                        salmon::utils::OrphanStatus::LeftOrphan :
+                        salmon::utils::OrphanStatus::RightOrphan;
+                    rpair.logProb = salmon::math::LOG_0;
+                    return true;
+                    // === end of MappedOrphan case
+                case AlignmentType::MappedDiscordantPair:
+                    // The discordant mapped pair case is sort of a nightmare
+                    // it requires arbitrary look-ahead followed by a pairing
+                    // of the ends once all of the records for a read have been
+                    // consumed. For the time being, we don't support discordant
+                    // mappings --- just skip over them.
+                    // Don't even pass them to the filter right now!
+                    /*
+                       if (filt != nullptr) {
+                       rpair.orphanStatus = salmon::utils::OrphanStatus::LeftOrphan;
+                       filt->processFrag(&rpair);
+                       }
+                     */
+                    break;
+                    // === end if MappedDiscordantPair case
+                default:
+                    logger_->error("\n\n\nEncountered unknown alignemnt type; this should not happen!\n"
+                                   "Please file a bug report on GitHub. Exiting.\n");
+                    std::exit(1);
+                    break;
+            }
+            // If this was not a properly mapped orphan read, then grab the next
+            // read.
+            didRead1 = (scram_get_seq(fp_, &rpair.read1) >= 0);
+        }
+
+        didRead2 = (scram_get_seq(fp_, &rpair.read2) >=0);
+
+        // If we didn't get a read, then we've exhausted this file. 
+        // NOTE: I'm not sure about the *or* condition here. In some cases, we
+        // may be discarding a single read, but it won't be properly paired
+        // anyway. Figure out what the right thing is to do here.
+        if (!didRead1 or !didRead2) { 
+            // close the current file
+            scram_close(currFile_->fp);
+            currFile_->fp = nullptr;
+            // increment the file iterator
+            currFile_++;
+            // If this is the last file, then we're done
+            if (currFile_ == files_.end()) { return false; }
+            // Otherwise, start parsing the next file.
+            fp_ = scram_open(currFile_->fileName.c_str(), currFile_->readMode.c_str());
+            hdr_ = currFile_->header;
+            continue;
+        }
+
+        // If we expected a paired read, but didn't find one
+        // then flip out and quit!
+        if (BOOST_UNLIKELY((
+                        !(bam_flag(rpair.read1) & BAM_FPAIRED) or
+                        !(bam_flag(rpair.read2) & BAM_FPAIRED))
+                    )) {
+            fmt::MemoryWriter errmsg;
+            errmsg << "\n\n"
+                    << ioutils::SET_RED << "ERROR: " << ioutils::RESET_COLOR
+                    << "Found unpaired read in a paired-end library. "
+                    << "The read was marked as unpaired in sequencing (not just unmapped)."
+                    << "The two ends of a paired-end read should be adjacent. "
+                    << "Don't know how to proceed; exiting!\n\n";
+            logger_->warn() << errmsg.str();
+            std::exit(-1);
+        }
+        // We've observed two, consecutive paired reads; now check if our reads
+        // have the same name.
+        // The names must first have the same length.
+        //bool sameName = getPairedNameLen(rpair.read1) == getPairedNameLen(rpair.read2);
+        bool sameName = (bam_name_len(rpair.read1) == bam_name_len(rpair.read2));
+        
+        // If the lengths are the same, check the actual strings. Use
+        // memcmp for efficiency since we know the length.
+        if (BOOST_LIKELY(sameName)) {
+            //auto nameLen = getPairedNameLen(rpair.read1);
+            auto nameLen = bam_name_len(rpair.read1);
+            char* qname1 = bam_name(rpair.read1);
+            char* qname2 = bam_name(rpair.read2);
+            sameName = (memcmp(qname1, qname2, nameLen) == 0);
+        }
+        // If we've gotten this far, then the read should be a pair (same name)
+        // and should either be concordantly aligned (proper pair) or both unaligned.
+        if (BOOST_UNLIKELY(
+                    !sameName or ((bam_flag(rpair.read1) & BAM_FPROPER_PAIR) != (bam_flag(rpair.read2) & BAM_FPROPER_PAIR)))) {
+            
+            fmt::MemoryWriter errmsg;
+            errmsg << "\n\n\n";
+            errmsg << "WARNING: Detected suspicious pair --- \n";
+            if (!sameName) {
+                errmsg << "\tThe names are different:\n";
+                errmsg << "\tread1 : " << bam_name(rpair.read1) << "\n";
+                errmsg << "\tread2 : " << bam_name(rpair.read2) << "\n";
+            }
+            if((bam_flag(rpair.read1) & BAM_FPROPER_PAIR) != (bam_flag(rpair.read2) & BAM_FPROPER_PAIR)) {
+                errmsg << "\tThe proper-pair statuses are inconsistent:\n";
+                errmsg << "read1 [" << bam_name(rpair.read1) << "] : " 
+                    << (!(bam_flag(rpair.read1) & BAM_FPROPER_PAIR) ? "no " : "") << "proper-pair; "
+                    << ((bam_flag(rpair.read1) & BAM_FUNMAP) ? "not " : "") << "mapped; mate"
+                    << ((bam_flag(rpair.read1) & BAM_FMUNMAP) ? "not " : "") << "mapped\n\n";
+                errmsg << "read2 : [" << bam_name(rpair.read1) << "] : " 
+                    << (!(bam_flag(rpair.read2) & BAM_FPROPER_PAIR) ? "no " : "") << "proper-pair; "
+                    << ((bam_flag(rpair.read2) & BAM_FUNMAP) ? "not " : "") << "mapped; mate"
+                    << ((bam_flag(rpair.read2) & BAM_FMUNMAP) ? "not " : "") << "mapped\n\n";
+            }
+            logger_->warn() << errmsg.str();
+        }
+
+
+        bool isFwd1 = false;
+        uint32_t startPos1 = 0;
+        bool isFwd2 = false;
+        uint32_t startPos2 = 0;
+
+        switch (alnType) {
+            case AlignmentType::MappedConcordantPair:
+                haveValidPair = true;
+                if (bam_flag(rpair.read1) & BAM_FREAD2) {
+                    std::swap(rpair.read1, rpair.read2);
+                }
+
+                // if the "fragment" is from the forward strand,
+                // the read will map to the reverse strand, and vice-versa
+                isFwd1 = !(bam_flag(rpair.read1) & BAM_FREVERSE);
+                startPos1 = bam_pos(rpair.read1); 
+                isFwd2 = !(bam_flag(rpair.read2) & BAM_FREVERSE);
+                startPos2 = bam_pos(rpair.read2); 
+ 
+                rpair.libFmt = salmon::utils::hitType(startPos1, isFwd1, 
+                                                      startPos2, isFwd2);
+
+                rpair.orphanStatus = salmon::utils::OrphanStatus::Paired;
+                break;
+            case AlignmentType::UnmappedPair:
+                ++numUnaligned_;
+                if ((filt != nullptr) and sameName) {
+                    rpair.orphanStatus = salmon::utils::OrphanStatus::Paired;
+                    filt->processFrag(&rpair);
+                }
+                break;
+            default:
+                logger_->error("\n\n\nEncountered unknown alignemnt type; this should not happen!\n"
+                               "Please file a bug report on GitHub. Exiting.\n");
+                std::exit(1);
+                break;
+        }
+        ++totalReads_;
+    }
+    rpair.logProb = salmon::math::LOG_0;
+    return true;
+}
+
+template <typename FragT>
+template <typename FilterT>
+inline bool BAMQueue<FragT>::getFrag_(UnpairedRead& sread, FilterT filt) {
+    bool haveValidRead{false};
+
+    while (!haveValidRead) {
+        bool didRead = (scram_get_seq(fp_, &sread.read) >= 0);
+        // If we didn't get a read, then we've exhausted this file
+        if (!didRead) { 
+            // close the current file
+            scram_close(currFile_->fp);
+            currFile_->fp = nullptr;
+            currFile_++;
+            // If this is the last file, then we're done
+            if (currFile_ == files_.end()) { return false; }
+            // Otherwise, start parsing the next file.
+            fp_ = scram_open(currFile_->fileName.c_str(), currFile_->readMode.c_str());
+            hdr_ = currFile_->header;
+            continue;
+        }
+
+        if (!(bam_flag(sread.read) & BAM_FDUP) and
+            !(bam_flag(sread.read) & BAM_FQCFAIL) and
+            !(bam_flag(sread.read) & BAM_FUNMAP) and
+            sread.transcriptID() >= 0) {
+            haveValidRead = true;
+        }
+
+        if (!haveValidRead) { 
+            if (filt != nullptr) {
+                filt->processFrag(&sread);
+            }
+            ++numUnaligned_; 
+        }
+        ++totalReads_;
+    }
+
+    sread.logProb = salmon::math::LOG_0;
+    return true;
+}
+
+template <typename FragT>
+size_t BAMQueue<FragT>::numObservedReads(){ return totalReads_; }
+
+template <typename FragT>
+size_t BAMQueue<FragT>::numMappedReads(){ 
+    return numMappedReads_;
+}
+
+template <typename FragT>
+size_t BAMQueue<FragT>::numUniquelyMappedReads(){ 
+    return numUniquelyMappedReads_;
+}
+
+
+template <typename FragT>
+template <typename FilterT>
+void BAMQueue<FragT>::fillQueue_(FilterT filt, bool onlyProcessAmbiguousAlignments) {
+    size_t n{0};
+    size_t numFragAlloc{0};
+    AlignmentGroup<FragT*>* alngroup;
+    //alnGroupPool_.pop(alngroup);
+    while (!alnGroupPool_.try_dequeue(alngroup));
+    bool notified{false};
+
+    currFile_ = files_.begin();
+    fp_ = currFile_->fp;
+    hdr_ = currFile_->header;
+
+    FragT* f;
+    if (!fragmentQueue_.try_pop(f)) {
+        f = new FragT;
+    }
+
+    uint32_t prevLen{1};
+    char* prevReadName = new char[255];
+    bool readAlignsUniquely{false};
+    int32_t prevTranscriptId{std::numeric_limits<int32_t>::min()};
+
+    while(getFrag_(*f, filt)) {
+
+        char* readName = f->getName();
+        uint32_t currLen = f->getNameLength();
+        // if this is a new read
+        if ( (currLen != prevLen) or
+            !sameReadName_<UnpairedRead>(readName, prevReadName, currLen) ) {
+
+            if (alngroup->size() > 0) {
+
+                // If we only wish to process ambiguous alignments,
+                // and the current read aligns uniquely, then return 
+                // what we parsed to the appropriate queue and continue
+                if (onlyProcessAmbiguousAlignments and 
+                        readAlignsUniquely) {
+                   // return the fragments
+                   for (auto aln : alngroup->alignments()) {
+                       fragmentQueue_.push(aln); aln = nullptr;
+                   }
+                   // clear the alignments vector
+                   alngroup->alignments().clear();
+                   // continue to use this alignment group
+                   numUniquelyMappedReads_++;
+                } else {
+                    // push the align group
+                    while(!alnGroupQueue_.try_enqueue(alngroup));
+                    alngroup = nullptr;
+                    if (!alnGroupPool_.try_dequeue(alngroup)) {  
+                        exhaustedAlnGroupPool_ = true;
+                        //alnGroupPool_.pop(alngroup);
+                        while(!alnGroupPool_.try_dequeue(alngroup));
+                    }
+                }
+            }
+            
+            if (readAlignsUniquely) { numUniquelyMappedReads_++; }
+            readAlignsUniquely = true;
+
+            alngroup->addAlignment(f);
+            memcpy(prevReadName, readName, currLen);
+            prevLen = currLen;
+            prevTranscriptId = f->transcriptID();
+            f = nullptr;
+            numMappedReads_++;
+        } else { // otherwise, this is another alignment for the same read
+
+            // If the new alignment for the read is to a 
+            // different transcript, then it's not a unique mapper
+            if (readAlignsUniquely and f->transcriptID() != prevTranscriptId) {
+                readAlignsUniquely = false;
+            }
+
+            alngroup->addAlignment(f);
+            f = nullptr;
+       }
+
+        while (!fragmentQueue_.try_pop(f)) {
+            if (!exhaustedAlnGroupPool_) {
+                f = new FragT;
+                ++numFragAlloc;
+                break;
+            }
+        }
+
+       if (!notified and exhaustedAlnGroupPool_) { 
+          logger_->info("\n\nThe alignment group queue pool has been exhausted.  {} extra fragments were allocated "
+                        "on the heap to saturate the pool.  No new fragments will be allocated\n\n", numFragAlloc);
+          notified = true;
+       }
+       ++n;
+    }
+
+    // If we popped a fragment structure off the queue, but didn't add it 
+    // to an alignment group, then reclaim it here
+    if (f != nullptr) { fragmentQueue_.push(f); f = nullptr; }
+
+    // If the last alignment group is non-empty, then send 
+    // it off to be processed.
+    if (alngroup->size() > 0) { 
+
+        // If we only wish to process ambiguous alignments,
+        // and the current read aligns uniquely, then return 
+        // what we parsed to the appropriate queue and continue
+        if (onlyProcessAmbiguousAlignments and 
+                readAlignsUniquely) {
+            // return the fragments
+            for (auto aln : alngroup->alignments()) {
+                fragmentQueue_.push(aln); aln = nullptr;
+            }
+            // clear the alignments vector
+            alngroup->alignments().clear();
+            // return the alignment group itself 
+            alnGroupPool_.enqueue(alngroup);
+            numUniquelyMappedReads_++;
+        } else {
+            while(!alnGroupQueue_.try_enqueue(alngroup));
+            alngroup = nullptr;
+        }
+    } else { // otherwise, reclaim the alignment group structure here
+        //alnGroupPool_.push(alngroup);
+        alnGroupPool_.enqueue(alngroup);
+    }
+
+    delete [] prevReadName;
+    // We're at the end of the list of input files
+    // and we're done parsing (for now).
+    currFile_ = files_.end();
+    fp_ = nullptr;
+    hdr_ = nullptr;
+    doneParsing_ = true;
+    return;
+}
+
+///////// Proper BAM parsing graveyard
+
+/* 
+        // If the reads don't have the same name, then the pair was not
+        // consecutive in the file --- complain and skip!
+        if (BOOST_UNLIKELY(!sameName)) {
+
+            // Flag stores whether or not we've consumed all of the 
+            // alignment records in the current file.
+            bool consumedFile{false};
+
+            // Consume reads until we find a pair with the same name
+            while (!sameName) {
+                // Complain if this is supposed to be a paired read
+                fmt::print(stderr, "{}WARNING:{} The mate of read [{}] did not "
+                        "appear next to it in the file. The next read was [{}].  "
+                        "Skipping the first read\n\n", 
+                        ioutils::SET_RED, 
+                        ioutils::RESET_COLOR, 
+                        bam1_qname(rpair.read1),
+                        bam1_qname(rpair.read2));
+
+                rpair.read1 = rpair.read2; 
+                didRead2 = (sam_read1(fp_, hdr_, rpair.read2) >= 0);
+                // If we hit the end of the file --- skip to the top of the loop
+                // to see if we need to move on to another file.
+                if (!didRead2) { consumedFile = true; continue; }
+
+                // As above, if we encounter a non-paired read, then, for the
+                // time being, flip out and quit.
+                if (BOOST_UNLIKELY(!(rpair.read2->core.flag & BAM_FPAIRED))) {
+                    fmt::MemoryWriter errmsg;
+                    errmsg << "\n\n" 
+                        << ioutils::SET_RED << "ERROR: " << ioutils::RESET_COLOR 
+                        << "Saw adjacent reads, at least one of which was unpaired. "
+                        << "The two ends of a paired-end read should be adjacent. "
+                        << "Don't know how to proceed; exiting!\n\n";
+                    std::cerr << errmsg.str();
+                    std::exit(-1);
+                } else { std::cerr << "BLARGHHH\n\n\n"; }
+                // As above, check first that the lengths of the names are the
+                // same and then that the names are, in fact, identical.
+                sameName = (rpair.read1->core.l_qname == rpair.read2->core.l_qname);
+                if (BOOST_LIKELY(sameName)) {
+                    auto nameLen = rpair.read1->core.l_qname;
+                    char* qname1 = bam1_qname(rpair.read1);
+                    char* qname2 = bam1_qname(rpair.read2);
+                    sameName = checkProperPairedNames_(qname1, qname2, nameLen);
+                }
+            } // end while (!sameName)
+
+            // If we consumed all of the current file, break to the top of the
+            // loop.
+            if (BOOST_UNLIKELY(consumedFile)) { continue; } 
+        }
+
+        bool read1IsValid{false};
+        if ( !(rpair.read1->core.flag & BAM_FUNMAP) 
+              and (rpair.read1->core.flag & BAM_FPROPER_PAIR)
+             //and !(rpair.read1->core.flag & BAM_FDUP) 
+             //and !(rpair.read1->core.flag & BAM_FQCFAIL)
+            ) {
+            read1IsValid = true;
+        }
+
+        bool read2IsValid{false};
+        if ( !(rpair.read2->core.flag & BAM_FUNMAP) 
+              and (rpair.read2->core.flag & BAM_FPROPER_PAIR)
+             //and !(rpair.read2->core.flag & BAM_FDUP) 
+             //and !(rpair.read2->core.flag & BAM_FQCFAIL)
+            ) {
+            read2IsValid = true;
+        }
+
+        haveValidPair = read1IsValid and read2IsValid and
+                        (rpair.read1->core.tid == rpair.read2->core.tid) and 
+                        sameName;
+        
+        // If the pair was not properly mapped 
+        if (!haveValidPair) { 
+            ++numUnaligned_; 
+            // If we have an active output filter, and the reads were not 
+            // a valid alignment (*but were a proper pair*), then pass them
+            // to the output filter.
+            if ((filt != nullptr) and sameName) {
+                filt->processFrag(&rpair);
+            }
+        } else {
+            // Make sure read1 is read1 and read2 is read2; else swap
+            if (rpair.read1->core.flag & BAM_FREAD2) {
+                std::swap(rpair.read1, rpair.read2);
+            }
+            rpair.orphanStatus = salmon::utils::OrphanStatus::Paired;
+
+            ++propPaired;
+        }
+        */
+
+
+
diff --git a/include/BiasIndex.hpp b/include/BiasIndex.hpp
new file mode 100644
index 0000000..837a6c8
--- /dev/null
+++ b/include/BiasIndex.hpp
@@ -0,0 +1,109 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Sailfish.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#ifndef __BIASINDEX_HPP__
+#define __BIASINDEX_HPP__
+
+#include <fstream>
+#include <iostream>
+#include <unordered_map>
+#include <vector>
+
+
+#include <algorithm>
+
+template <class T>
+void endswap(T *objp)
+{
+  unsigned char *memp = reinterpret_cast<unsigned char*>(objp);
+  std::reverse(memp, memp + sizeof(T));
+}
+
+template <class T>
+void endswap(std::vector<T>& v) {
+	for ( size_t i = 0; i < v.size(); ++i ) { endswap(&v[i]); }
+}
+
+class Indexer {
+    public:
+	Indexer(std::vector<double>::iterator it) : it_(it), hasData_(true) {};
+	Indexer(double val) : val_(val), hasData_(false) {};
+
+	inline double operator[](size_t offset) {
+		return hasData_ ? *(it_+offset) : val_;
+	}
+
+    private:
+    	std::vector<double>::iterator it_;
+    	double val_;
+    	bool hasData_;
+};
+
+class BiasIndex {
+
+	using Bias = double;
+	using Offset = size_t;
+
+public:
+	BiasIndex() : haveBias_(false) {}
+	
+	BiasIndex(const std::string base) : haveBias_(true) {
+		auto dictName = base+".dict";
+		auto binName = base+".bin";
+
+		std::cerr << "reading bias dictionary ... ";
+		using std::string;
+		std::ifstream din(dictName, std::ios::in);
+		
+		size_t numBiasTerms;
+		din >> numBiasTerms;
+
+		string tname;
+		Offset length;
+		Offset offset=0;
+		while ( din >> tname >> length ) {
+			offsetMap_[tname] = offset;
+			offset += length;
+		}
+		din.close();
+		std::cerr << "done\n";
+
+		std::cerr << "reading bias terms ... ";
+		biases_.resize(numBiasTerms);
+
+		std::ifstream bin(binName, std::ios::binary);
+		bin.read( reinterpret_cast<char*>(&biases_.front()), sizeof(double)*numBiasTerms );
+		bin.close();
+		std::cerr << "done\n";
+	}
+
+	inline Indexer getBiases( const std::string& tname ) {
+		return haveBias_ ? Indexer( biases_.begin() + offsetMap_[tname] ) : Indexer(1.0);
+	}
+
+private:
+	bool haveBias_;
+	std::vector<Bias> biases_;
+	std::unordered_map<std::string, size_t> offsetMap_;
+};
+
+#endif //__BIASINDEX_HPP__
\ No newline at end of file
diff --git a/include/BinaryLiteral.hpp b/include/BinaryLiteral.hpp
new file mode 100644
index 0000000..f111634
--- /dev/null
+++ b/include/BinaryLiteral.hpp
@@ -0,0 +1,77 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Sailfish.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#ifndef __BINARY_LITERAL_HPP__
+#define __BINARY_LITERAL_HPP__
+
+//===============================================================
+
+//
+// binary_literal_impl represents a compile-time function that
+// computes the unsigned long long int from a list of characters
+// Digits that MUST be composed of '0' or '1'...
+//
+template <char... Digits>
+struct binary_literal_impl;
+
+// If the next digit is zero, then compute the rest...
+template <char... Digits>
+struct binary_literal_impl<'0', Digits...>
+{
+  static constexpr unsigned long long to_ulonglong()
+  {
+    return binary_literal_impl<Digits...>::to_ulonglong();
+  }
+};
+
+// If the next digit is one, then shift 1 and compute the rest...
+template <char... Digits>
+struct binary_literal_impl<'1', Digits...>
+{
+  static constexpr unsigned long long to_ulonglong()
+  {
+    return (1UL << sizeof...(Digits))
+      | binary_literal_impl<Digits...>::to_ulonglong();
+  }
+};
+
+// Base case: No digits, so return 0...
+template <>
+struct binary_literal_impl<>
+{
+  static constexpr unsigned long long to_ulonglong()
+  {
+    return 0;
+  }
+};
+
+//===============================================================
+
+template <char... Digits>
+constexpr unsigned long long operator "" _binary()
+{
+  return binary_literal_impl<Digits...>::to_ulonglong();
+}
+
+//===============================================================
+
+#endif // __BINARY_LITERAL_HPP__
diff --git a/include/ClusterForest.hpp b/include/ClusterForest.hpp
new file mode 100644
index 0000000..825d343
--- /dev/null
+++ b/include/ClusterForest.hpp
@@ -0,0 +1,158 @@
+#ifndef __CLUSTER_FOREST_HPP__
+#define __CLUSTER_FOREST_HPP__
+
+
+#include <boost/pending/disjoint_sets.hpp>
+
+#include "Transcript.hpp"
+#include "TranscriptCluster.hpp"
+#include "SpinLock.hpp"
+
+#include <unordered_set>
+#include <vector>
+#include <mutex>
+
+/** A forest of transcript clusters */
+class ClusterForest {
+public:
+    ClusterForest(size_t numTranscripts, std::vector<Transcript>& refs) :
+        rank_(std::vector<size_t>(numTranscripts, 0)),
+        parent_(std::vector<size_t>(numTranscripts, 0)),
+        disjointSets_(&rank_[0], &parent_[0]),
+        clusters_(std::vector<TranscriptCluster>(numTranscripts))
+    {
+        // Initially make a unique set for each transcript
+        for(size_t tnum = 0; tnum < numTranscripts; ++tnum) {
+            disjointSets_.make_set(tnum);
+            clusters_[tnum].members_.push_front(tnum);
+            clusters_[tnum].addMass(refs[tnum].mass());
+        }
+    }
+
+    template <typename FragT>
+    void mergeClusters(typename std::vector<FragT>::iterator start,
+                       typename std::vector<FragT>::iterator finish) {
+        // Use a lock_guard to ensure this is a locked (and exception-safe) operation
+#if defined __APPLE__
+        spin_lock::scoped_lock sl(clusterMutex_);
+#else
+        std::lock_guard<std::mutex> lock(clusterMutex_);
+#endif
+        size_t firstCluster, otherCluster;
+        auto firstTranscriptID = start->transcriptID();
+        ++start;
+
+        for (auto it = start; it != finish; ++it) {
+            firstCluster = disjointSets_.find_set(firstTranscriptID);
+            otherCluster = disjointSets_.find_set(it->transcriptID());
+            if (otherCluster != firstCluster)  {
+                disjointSets_.link(firstCluster, otherCluster);
+                auto parentClust = disjointSets_.find_set(it->transcriptID());
+                auto childClust = (parentClust == firstCluster)  ? otherCluster : firstCluster;
+                if (parentClust == firstCluster or parentClust == otherCluster) {
+                    clusters_[parentClust].merge(clusters_[childClust]);
+                    clusters_[childClust].deactivate();
+                } else { std::cerr << "DANGER\n"; }
+            }
+        }
+    }
+
+
+    template <typename FragT>
+    void mergeClusters(typename std::vector<FragT*>::iterator start,
+                       typename std::vector<FragT*>::iterator finish) {
+        // Use a lock_guard to ensure this is a locked (and exception-safe) operation
+#if defined __APPLE__
+        spin_lock::scoped_lock sl(clusterMutex_);
+#else
+        std::lock_guard<std::mutex> lock(clusterMutex_);
+#endif
+
+        size_t firstCluster, otherCluster;
+        auto firstTranscriptID = (*start)->transcriptID();
+        ++start;
+
+        for (auto it = start; it != finish; ++it) {
+            firstCluster = disjointSets_.find_set(firstTranscriptID);
+            otherCluster = disjointSets_.find_set((*it)->transcriptID());
+            if (otherCluster != firstCluster)  {
+                disjointSets_.link(firstCluster, otherCluster);
+                auto parentClust = disjointSets_.find_set((*it)->transcriptID());
+                auto childClust = (parentClust == firstCluster)  ? otherCluster : firstCluster;
+                if (parentClust == firstCluster or parentClust == otherCluster) {
+                    clusters_[parentClust].merge(clusters_[childClust]);
+                    clusters_[childClust].deactivate();
+                } else { std::cerr << "DANGER\n"; }
+            }
+        }
+    }
+
+
+    /*
+    void mergeClusters(AlignmentBatch<ReadPair>::iterator start, AlignmentBatch<ReadPair>::iterator finish) {
+        // Use a lock_guard to ensure this is a locked (and exception-safe) operation
+        std::lock_guard<std::mutex> lock(clusterMutex_);
+        size_t firstCluster, otherCluster;
+        auto firstTranscriptID = start->read1->core.tid;
+        ++start;
+
+        for (auto it = start; it != finish; ++it) {
+            firstCluster = disjointSets_.find_set(firstTranscriptID);
+            otherCluster = disjointSets_.find_set(it->read1->core.tid);
+            if (otherCluster != firstCluster)  {
+                disjointSets_.link(firstCluster, otherCluster);
+                auto parentClust = disjointSets_.find_set(it->read1->core.tid);
+                auto childClust = (parentClust == firstCluster)  ? otherCluster : firstCluster;
+                if (parentClust == firstCluster or parentClust == otherCluster) {
+                    clusters_[parentClust].merge(clusters_[childClust]);
+                    clusters_[childClust].deactivate();
+                } else { std::cerr << "DANGER\n"; }
+            }
+        }
+    }
+*/
+    void updateCluster(size_t memberTranscript, size_t newCount, double logNewMass, bool updateCount) {
+        // Use a lock_guard to ensure this is a locked (and exception-safe) operation
+#if defined __APPLE__
+        spin_lock::scoped_lock sl(clusterMutex_);
+#else
+        std::lock_guard<std::mutex> lock(clusterMutex_);
+#endif
+        auto clusterID = disjointSets_.find_set(memberTranscript);
+        auto& cluster = clusters_[clusterID];
+        if (updateCount) {
+            cluster.incrementCount(newCount);
+        }
+        cluster.addMass(logNewMass);
+    }
+
+    std::vector<TranscriptCluster*> getClusters() {
+        std::vector<TranscriptCluster*> clusters;
+        std::unordered_set<size_t> observedReps;
+        for (size_t i = 0; i < clusters_.size(); ++i) {
+            auto rep = disjointSets_.find_set(i);
+            if (observedReps.find(rep) == observedReps.end()) {
+                if (!clusters_[rep].isActive()) {
+                    std::cerr << "returning a non-active cluster!\n";
+                    std::exit(1);
+                }
+                clusters.push_back(&clusters_[rep]);
+                observedReps.insert(rep);
+            }
+        }
+        return clusters;
+    }
+private:
+    std::vector<size_t> rank_;
+    std::vector<size_t> parent_;
+    boost::disjoint_sets<size_t*, size_t*> disjointSets_;
+    std::vector<TranscriptCluster> clusters_;
+    #if defined __APPLE__
+    spin_lock clusterMutex_;
+    #else
+    std::mutex clusterMutex_;
+    #endif
+};
+
+#endif // __CLUSTER_FOREST_HPP__
+
diff --git a/include/CollapsedEMOptimizer.hpp b/include/CollapsedEMOptimizer.hpp
new file mode 100644
index 0000000..d4aa3f6
--- /dev/null
+++ b/include/CollapsedEMOptimizer.hpp
@@ -0,0 +1,28 @@
+#ifndef COLLAPSED_EM_OPTIMIZER_HPP
+#define COLLAPSED_EM_OPTIMIZER_HPP
+
+#include <unordered_map>
+
+#include "tbb/atomic.h"
+#include "tbb/task_scheduler_init.h"
+
+#include "ReadExperiment.hpp"
+#include "SalmonOpts.hpp"
+
+#include "cuckoohash_map.hh"
+#include "Eigen/Dense"
+
+class CollapsedEMOptimizer {
+    public:
+        using VecType = std::vector<tbb::atomic<double>>;
+        CollapsedEMOptimizer();
+
+        template <typename ExpT>
+        bool optimize(ExpT& readExp,
+                      SalmonOpts& sopt,
+                      double tolerance = 0.01,
+                      uint32_t maxIter = 1000);
+
+};
+
+#endif // COLLAPSED_EM_OPTIMIZER_HPP
diff --git a/include/CollapsedGibbsSampler.hpp b/include/CollapsedGibbsSampler.hpp
new file mode 100644
index 0000000..c90f185
--- /dev/null
+++ b/include/CollapsedGibbsSampler.hpp
@@ -0,0 +1,27 @@
+#ifndef COLLAPSED_GIBBS_SAMPLER_HPP
+#define COLLAPSED_GIBBS_SAMPLER_HPP
+
+#include <unordered_map>
+
+#include "tbb/atomic.h"
+#include "tbb/task_scheduler_init.h"
+
+#include "SalmonOpts.hpp"
+
+#include "cuckoohash_map.hh"
+#include "Eigen/Dense"
+
+class CollapsedGibbsSampler {
+    public:
+        using VecType = std::vector<double>;
+        CollapsedGibbsSampler();
+
+        template <typename ExpT>
+        bool sample(ExpT& readExp,
+                      SalmonOpts& sopt,
+                      uint32_t numSamples = 500);
+
+};
+
+#endif // COLLAPSED_EM_OPTIMIZER_HPP
+
diff --git a/include/CollapsedIterativeOptimizer.hpp.bak b/include/CollapsedIterativeOptimizer.hpp.bak
new file mode 100644
index 0000000..cc9e4bd
--- /dev/null
+++ b/include/CollapsedIterativeOptimizer.hpp.bak
@@ -0,0 +1,2069 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Sailfish.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#ifndef COLLAPSED_ITERATIVE_OPTIMIZER_HPP
+#define COLLAPSED_ITERATIVE_OPTIMIZER_HPP
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <unordered_map>
+#include <map>
+#include <vector>
+#include <unordered_set>
+#include <mutex>
+#include <thread>
+#include <sstream>
+#include <exception>
+#include <random>
+#include <queue>
+#include "btree_map.h"
+
+/** Boost Includes */
+#include <boost/dynamic_bitset/dynamic_bitset.hpp>
+#include <boost/range/irange.hpp>
+#include <boost/program_options.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <boost/heap/fibonacci_heap.hpp>
+#include <boost/accumulators/statistics/stats.hpp>
+#include <boost/accumulators/framework/accumulator_set.hpp>
+#include <boost/accumulators/statistics/p_square_quantile.hpp>
+#include <boost/accumulators/statistics/count.hpp>
+#include <boost/accumulators/statistics/median.hpp>
+#include <boost/accumulators/statistics/weighted_mean.hpp>
+#include <boost/lockfree/queue.hpp>
+#include <boost/thread/thread.hpp>
+#include <boost/math/special_functions/digamma.hpp>
+#include <boost/math/distributions/beta.hpp>
+
+
+//#include <Eigen/Core>
+#include <jellyfish/sequence_parser.hpp>
+#include <jellyfish/parse_read.hpp>
+#include <jellyfish/mer_counting.hpp>
+#include <jellyfish/misc.hpp>
+#include <jellyfish/compacted_hash.hpp>
+
+#include "tbb/concurrent_unordered_set.h"
+#include "tbb/concurrent_vector.h"
+#include "tbb/concurrent_unordered_map.h"
+#include "tbb/concurrent_queue.h"
+#include "tbb/parallel_for.h"
+#include "tbb/parallel_for_each.h"
+#include "tbb/parallel_reduce.h"
+#include "tbb/blocked_range.h"
+#include "tbb/task_scheduler_init.h"
+#include "tbb/partitioner.h"
+
+#include "BiasIndex.hpp"
+#include "ezETAProgressBar.hpp"
+#include "LookUpTableUtils.hpp"
+#include "SalmonMath.hpp"
+
+template <typename ReadHash>
+class CollapsedIterativeOptimizer {
+
+private:
+    /**
+    * Type aliases
+    */
+    using TranscriptID = uint32_t;
+    using KmerID =  uint64_t;
+    using Count = uint32_t;
+    using KmerQuantity = double;
+    using Promiscutity = double;
+    using KmerMap = tbb::concurrent_unordered_map< uint64_t, tbb::concurrent_vector<uint32_t> >;
+
+    struct TranscriptGeneVectors;
+    using TranscriptIDVector = std::vector<TranscriptID>;
+    using KmerIDMap = std::vector<TranscriptIDVector>;
+
+
+    using TranscriptKmerSet = std::tuple<TranscriptID, std::vector<KmerID>>;
+    using StringPtr = std::string*;
+    using TranscriptScore = uint64_t;
+    using HashArray = jellyfish::invertible_hash::array<uint64_t, atomic::gcc, allocators::mmap>;
+    using ReadLength = size_t;
+
+    // Necessary forward declaration
+    struct TranscriptData;
+    using HeapPair = std::tuple<TranscriptScore, TranscriptID>;
+    using Handle = typename boost::heap::fibonacci_heap<HeapPair>::handle_type;
+    using BlockedIndexRange =  tbb::blocked_range<size_t>;
+
+    struct TranscriptGeneVectors {
+        tbb::concurrent_vector<uint32_t> transcripts;
+        tbb::concurrent_vector<uint32_t> genes;
+    };
+
+    struct TranscriptData {
+        TranscriptID id;
+        StringPtr header;
+        std::map<KmerID, KmerQuantity> binMers;
+        KmerQuantity mean;
+        size_t length;
+    };
+
+    class TranscriptInfo {
+      public:
+
+      TranscriptInfo() : binMers(std::unordered_map<KmerID, KmerQuantity>()),
+                         //logLikes(std::vector<KmerQuantity>()),
+                         //weights(std::vector<KmerQuantity>()),
+                         mean(0.0), fracLow(0.0), fracHigh(0.0), length(0), effectiveLength(0) { updated.store(0); /* weightNum.store(0); totalWeight.store(0.0);*/ }
+      TranscriptInfo(TranscriptInfo&& other) {
+        std::swap(binMers, other.binMers);
+        //std::swap(weights, other.weights);
+        //std::swap(logLikes, other.logLikes);
+        //totalWeight.store(other.totalWeight.load());
+        //weightNum.store(other.weightNum.load());
+        updated.store(other.updated.load());
+        mean = other.mean;
+        length = other.length;
+        effectiveLength = other.effectiveLength;
+      }
+        //std::atomic<double> totalWeight;
+        //btree::btree_map<KmerID, KmerQuantity> binMers;
+        //std::vector<KmerQuantity> weights;
+        //std::vector<KmerQuantity> logLikes;
+        //std::atomic<uint32_t> weightNum;
+        std::atomic<uint32_t> updated;
+        std::unordered_map<KmerID, KmerQuantity> binMers;
+        KmerQuantity mean;
+        KmerQuantity fracLow, fracHigh;
+        ReadLength length;
+        ReadLength effectiveLength;
+    };
+
+    // This struct represents a "job" (transcript) that needs to be processed
+    struct TranscriptJob {
+        StringPtr header;
+        StringPtr seq;
+        TranscriptID id;
+    };
+
+    struct TranscriptResult {
+        TranscriptData *data;
+        TranscriptKmerSet *ks;
+    };
+
+    struct BinmerUpdates {
+        std::vector<KmerID> zeroedBinmers;
+        std::vector<KmerID> updatedBinmers;
+    };
+
+    bool loggedCounts_;
+    uint32_t numThreads_;
+    size_t merLen_;
+    ReadHash & readHash_;
+    BiasIndex& biasIndex_;
+    std::string kmerEquivClassFname_;
+
+    // The number of occurences above whcih a kmer is considered promiscuous
+    size_t promiscuousKmerCutoff_ {std::numeric_limits<size_t>::max()};
+
+    // Map each kmer to the set of transcripts it occurs in
+    KmerIDMap transcriptsForKmer_;
+
+    // The actual data for each transcript
+    std::vector<TranscriptInfo> transcripts_;
+
+    TranscriptGeneMap& transcriptGeneMap_;
+
+    tbb::concurrent_unordered_set<uint64_t> genePromiscuousKmers_;
+
+    std::vector<Promiscutity> kmerGroupPromiscuities_;
+    std::vector<Promiscutity> kmerGroupBiases_;
+    std::vector<KmerQuantity> kmerGroupCounts_;
+    std::vector<Count> kmerGroupSizes_;
+    
+
+    /**
+     * logs all counts in the kmerGroupCounts_ variable.  Groups with a 
+     * count of zero are assigned the special value LOG_0.
+     */
+    inline void logKmerGroupCounts_() {
+        std::for_each(kmerGroupCounts_.begin(), kmerGroupCounts_.end(),
+                      [](KmerQuantity& count) {
+                          if (count > 0) { 
+                              count = std::log(count);
+                          } else {
+                              count = salmon::math::LOG_0;
+                          }
+                      });
+    }
+
+    /**
+     * Compute the "Inverse Document Frequency" (IDF) of a kmer within a set of transcripts.
+     * The inverse document frequency is the log of the number of documents (i.e. transcripts)
+     * divded by the number of documents containing this term (i.e. kmer).
+     * @param  k [kmer id for which the IDF should be computed]
+     * @return   [IDF(k)]
+     */
+    inline double _idf( uint64_t k ) {
+        double df = transcriptsForKmer_[k].size();
+        return (df > 0.0) ? std::log(transcripts_.size() / df) : 0.0;
+    }
+
+    /**
+     * Returns true if this kmer should be considered in our estimation, false
+     * otherwise
+     * @param  mer [kmer id to test for consideration]
+     * @return     [true if we consider the kmer with this id, false otherwise]
+     */
+    inline bool _considered( uint64_t mer ) {
+        // The kmer is only considered if it exists in the transcript set
+        // (i.e. it's possible to cover) and it's less prmiscuous than the
+        // cutoff.
+        return true;
+    }
+
+    /**
+     * The weight attributed to each appearence of the kmer with the given ID.
+     * If the kmer with ID k occurs in m different transcripts, then
+     * weight_(k) = 1 / m.
+     * @param  k [The ID of the kmer whose weight is to be computed]
+     * @return   [The weight of each appearance of the kmer with the given ID]
+     */
+    KmerQuantity weight_( KmerID k ) {
+        return 1.0 / (kmerGroupPromiscuities_[k] );
+    }
+
+    KmerQuantity _computeMedian( const TranscriptInfo& ti ) {
+
+      using namespace boost::accumulators;
+      using Accumulator = accumulator_set<double, stats<tag::median(with_p_square_quantile)>>;
+
+      Accumulator acc;
+      for (auto binmer : ti.binMers) {
+        acc(binmer.second);
+      }
+
+      return median(acc);
+    }
+
+    /**
+     * Computes the sum of kmer counts within the transcript given by ti, but clamping
+     * all non-zero counts to the given quantile.  For example, if quantile was 0.25, and
+     * x and y represented the 1st and 3rd quantile of kmer counts, then every nonzero count c
+     * would be transformed as c = max(x, min(y,c));
+     *
+     * @param  ti       [description]
+     * @param  quantile [description]
+     * @return          [description]
+     */
+    KmerQuantity _computeSumQuantile( const TranscriptInfo& ti, double quantile ) {
+        using namespace boost::accumulators;
+        using accumulator_t = accumulator_set<double, stats<tag::p_square_quantile> >;
+        //using tail_t = accumulator_set<double, stats<tag::tail
+        KmerQuantity sum = 0.0;
+
+
+        accumulator_t accLow(quantile_probability = quantile);
+        accumulator_t accHigh(quantile_probability = 1.0-quantile);
+        for ( auto binmer : ti.binMers ) {
+            accLow(binmer.second);
+            accHigh(binmer.second);
+        }
+
+        auto cutLow = p_square_quantile(accLow);
+        auto cutHigh = p_square_quantile(accHigh);
+
+        for ( auto binmer : ti.binMers ) {
+            KmerQuantity res = std::min( cutHigh, std::max(cutLow, binmer.second ) );
+            if (res != binmer.second) {
+              std::cerr << "cutLow = " << cutLow << ", cutHigh = " << cutHigh << " :: ";
+              std::cerr << "res = " << res << ", binmer.second = " << binmer.second << "\n";
+            }
+            sum += std::min( cutHigh, std::max(cutLow, binmer.second ) );
+        }
+        return sum;
+    }
+
+    KmerQuantity _computeSum( const TranscriptInfo& ti ) {
+        KmerQuantity sum = 0.0;
+        for ( auto binmer : ti.binMers ) {
+          sum += kmerGroupBiases_[binmer.first] * binmer.second;
+        }
+        return sum;
+    }
+
+    KmerQuantity _computeSumClamped( const TranscriptInfo& ti ) {
+        if (ti.binMers.size() < 5) {return _computeSum(ti);}
+        KmerQuantity sum = 0.0;
+
+        auto maxQuant = std::numeric_limits<KmerQuantity>::max();
+        auto minQuant = 0.0;
+
+        auto startValue = ti.binMers.begin()->second;
+        KmerQuantity lowestCount = startValue;
+        KmerQuantity secondLowestCount = startValue;
+
+        KmerQuantity highestCount = startValue;
+        KmerQuantity secondHighestCount = startValue;
+
+        for ( auto binmer : ti.binMers ) {
+          auto prevLowest = lowestCount;
+          lowestCount = std::min(binmer.second, lowestCount);
+          if (lowestCount < prevLowest) { secondLowestCount = prevLowest; }
+          auto prevHighest = highestCount;
+          highestCount = std::max(binmer.second, highestCount);
+          if (highestCount > prevHighest) { secondHighestCount = prevHighest; }
+        }
+        //std::cerr << lowestCount << ", " << secondLowestCount << ", " << highestCount << ", " << secondHighestCount << "\n";
+
+        for ( auto binmer : ti.binMers ) {
+          if (binmer.second > lowestCount and binmer.second < highestCount) {
+            sum += kmerGroupBiases_[binmer.first] * binmer.second;
+          } else if (binmer.second >= highestCount ) {
+            sum += secondHighestCount;
+          } else if (binmer.second <= lowestCount ) {
+            sum += secondLowestCount;
+          }
+        }
+
+        return sum;
+    }
+
+
+    bool _discard( const TranscriptInfo& ti) {
+        if ( ti.mean == 0.0 ) {
+            return false;
+        } else {
+            ti.mean = 0.0;
+            ti.binMers.clear();
+            return true;
+        }
+    }
+
+    KmerQuantity _computeSumVec( const TranscriptInfo& ti ) {
+        KmerQuantity sum = 0.0;
+        for ( auto w : ti.weights ) {
+          sum += w;
+        }
+        return sum;
+    }
+
+    KmerQuantity _computeClampedMean( const TranscriptInfo& ti ) {
+        return (ti.effectiveLength > 0.0) ? (_computeSumClamped(ti) / ti.effectiveLength) : 0.0;
+    }
+
+    KmerQuantity _computeMean( const TranscriptInfo& ti ) {
+        return (ti.effectiveLength > 0.0) ? (_computeSum(ti) / ti.effectiveLength) : 0.0;
+        //return (ti.effectiveLength > 0.0) ? (ti.totalWeight.load() / ti.effectiveLength) : 0.0;
+        //return (ti.effectiveLength > 0.0) ? (_computeSumVec(ti) / ti.effectiveLength) : 0.0;
+    }
+
+    KmerQuantity _computeWeightedMean( const TranscriptInfo& ti ) {
+        using namespace boost::accumulators;
+        accumulator_set<double, stats<tag::count, tag::weighted_mean>, double> acc;
+
+        for ( auto binmer : ti.binMers ) {
+          if ( this->genePromiscuousKmers_.find(binmer.first) == this->genePromiscuousKmers_.end() ){
+            acc(binmer.second, weight=kmerGroupBiases_[binmer.first] * weight_(binmer.first));
+          }
+        }
+
+        auto nnz = count(acc);
+
+        if ( nnz < ti.effectiveLength ) {
+            acc(0.0, weight=ti.effectiveLength-nnz);
+        }
+
+        auto sum = sum_of_weights(acc);
+        return sum > 0.0 ? weighted_mean(acc) : 0.0;
+    }
+
+    double _effectiveLength( const TranscriptInfo& ts ) {
+        double length = 0.0;
+        for ( auto binmer : ts.binMers ) {
+            length += weight_(binmer.first);
+        }
+        return length;
+    }
+
+    template <typename T>
+    T dotProd_(std::vector<T>& u, std::vector<T>& v) {
+
+      auto dot = tbb::parallel_reduce(
+        BlockedIndexRange(size_t(0), v.size()),
+            T(0.0),  // identity element for summation
+            [&]( const BlockedIndexRange& r, T current_sum ) -> T {
+             for (size_t i=r.begin(); i!=r.end(); ++i) {
+               current_sum += (u[i]*v[i]);
+             }
+             return current_sum; // body returns updated value of the accumulator
+             },
+             []( double s1, double s2 ) -> double {
+                return s1+s2;       // "joins" two accumulated values
+      });
+
+      return dot;
+    }
+
+    void normalizeTranscriptMeans_(){
+        //auto sumMean = 0.0;
+        //for ( auto ti : transcripts_ ) { sumMean += ti.mean; }
+
+        auto sumMean = tbb::parallel_reduce(
+            BlockedIndexRange(size_t(0), transcripts_.size()),
+            double(0.0),  // identity element for summation
+            [&, this]( const BlockedIndexRange& r, double current_sum ) -> double {
+                 for (size_t i=r.begin(); i!=r.end(); ++i) {
+                     double x = this->transcripts_[i].mean;
+                     current_sum += x;
+                 }
+                 return current_sum; // body returns updated value of the accumulator
+             },
+             []( double s1, double s2 ) -> double {
+                 return s1+s2;       // "joins" two accumulated values
+             });
+
+        // compute the new mean for each transcript
+        tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcripts_.size())),
+            [this, sumMean](const BlockedIndexRange& range) -> void {
+              for(size_t tid = range.begin(); tid != range.end(); ++tid) {
+                this->transcripts_[tid].mean /= sumMean;
+              }
+            });
+
+    }
+
+    template <typename T>
+    T psumAccumulate(const tbb::blocked_range<T*>& r, T value) {
+            return std::accumulate(r.begin(),r.end(),value);
+    }
+
+    template <typename T>
+    T psum_(std::vector<T>& v) {
+      auto func = std::bind( std::mem_fn(&CollapsedIterativeOptimizer<ReadHash>::psumAccumulate<T>),
+                             this, std::placeholders::_1, std::placeholders::_2 );
+      auto sum = tbb::parallel_reduce(
+        tbb::blocked_range<T*>(&v[0], &v[v.size()]),
+          T{0},  // identity element for summation
+          func,
+          std::plus<T>()
+        );
+      return sum;
+    }
+
+    template <typename T>
+    T pdiff_(std::vector<T>& v0, std::vector<T>& v1) {
+        auto diff = tbb::parallel_reduce(
+        BlockedIndexRange(size_t(0), v0.size()),
+          double(0.0),  // identity element for difference
+          [&](const BlockedIndexRange& r, double currentDiff ) -> double {
+            for (size_t i=r.begin(); i!=r.end(); ++i) {
+              currentDiff += v0[i] - v1[i];
+            }
+            return currentDiff; // body returns updated value of the accumulator
+          },
+          []( double s1, double s2 ) -> double {
+               return s1+s2;       // "joins" two accumulated values
+          }
+        );
+
+        return diff;
+    }
+
+    template <typename T>
+    T pabsdiff_(std::vector<T>& v0, std::vector<T>& v1) {
+        auto diff = tbb::parallel_reduce(
+        BlockedIndexRange(size_t(0), v0.size()),
+          double(0.0),  // identity element for difference
+          [&]( const BlockedIndexRange& r, double currentDiff ) -> double {
+            for (size_t i=r.begin(); i!=r.end(); ++i) {
+              currentDiff = std::abs(v0[i] - v1[i]);
+            }
+            return currentDiff; // body returns updated value of the accumulator
+          },
+          []( double s1, double s2 ) -> double {
+               return s1+s2;       // "joins" two accumulated values
+          }
+        );
+
+        return diff;
+    }
+
+    template <typename T>
+    std::vector<T> relAbsDiff_(std::vector<T>& v0, std::vector<T>& v1, T minVal) {
+        std::vector<T> relDiff(v0.size(), T());
+        // compute the new mean for each transcript
+        tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(v0.size())),
+                          [&v0, &v1, &relDiff, minVal](const BlockedIndexRange& range ) -> void {
+                              for (auto tid : boost::irange(range.begin(), range.end())) {
+                                  T oldVal = v0[tid];
+                                  T newVal = v1[tid];
+                                  if (oldVal > minVal or newVal > minVal) {
+                                      relDiff[tid] = std::abs(newVal - oldVal) / oldVal;
+                                  }
+                              }
+                          });
+        return relDiff;
+    }
+
+    void normalize_(std::vector<double>& means) {
+        auto sumMean = psum_(means);
+        auto invSumMean = 1.0 / sumMean;
+
+        // compute the new mean for each transcript
+        tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcripts_.size())),
+            [&means, invSumMean](const BlockedIndexRange& range ) -> void {
+              for (auto tid : boost::irange(range.begin(), range.end())) {
+                means[tid] *= invSumMean;
+              }
+            });
+    }
+
+    double averageCount(const TranscriptInfo& ts){
+        if ( ts.binMers.size() == 0 ) { return 0.0; }
+        double sum = 0.0;
+        for ( auto binmer : ts.binMers ) {
+            sum += kmerGroupBiases_[binmer.first] * binmer.second;
+        }
+        return sum / ts.binMers.size();
+
+    }
+
+    /**
+     * This function should be run only after <b>after</b> an EM loop.
+     * It estimates kmer specific biases based on how much a kmer's count
+     * deviates from it's corresponding transcript's mean
+     */
+    void computeKmerFidelities_() {
+
+      // For each transcript, compute it's overall fidelity.  This is related
+      // to the variance of coverage across the transcript.  The more uniform
+      // the coverage, the more we believe the transcript.
+        std::vector<double> transcriptFidelities(transcripts_.size(), 0.0);
+        tbb::parallel_for( BlockedIndexRange(size_t(0), transcripts_.size()),
+            [this, &transcriptFidelities]( const BlockedIndexRange& range ) -> void {
+               for (auto tid = range.begin(); tid != range.end(); ++tid) {
+                double sumDiff = 0.0;
+                //if (tid >= this->transcripts_.size()) { std::cerr << "attempting to access transcripts_ out of range\n";}
+                auto& ts = this->transcripts_[tid];
+
+                //std::cerr << "transcript " << tid << "\n";
+                for ( auto& b : ts.binMers ) {
+                    //if (b.first >= this->kmerGroupBiases_.size()) { std::cerr << "attempting to access kmerGroupBiases_ out of range\n";}
+                    auto scaledMean = this->kmerGroupSizes_[b.first] * ts.mean;
+                    auto diff = std::abs(b.second - scaledMean);
+                    sumDiff += diff;//*diff;
+                }
+                // The rest of the positions have 0 coverage have an error
+                // of |0 - \mu_t| = \mu_t.  There are l(t) - ts.binMers.size() of these.
+                sumDiff += ts.mean * (ts.length - ts.binMers.size());
+                auto fidelity = (ts.length > 0.0) ? sumDiff / ts.length : 0.0;
+                fidelity = 1.0 / (fidelity + 1.0);
+                //if (tid >= transcriptFidelities.size()) { std::cerr << "attempting to access transcriptFidelities out of range\n";}
+                transcriptFidelities[tid] = fidelity;
+                //std::cerr << "fidelity (" << tid << ") = " << fidelity << "\n";
+               }
+            });
+
+        tbb::parallel_for(BlockedIndexRange(size_t(0), transcriptsForKmer_.size()),
+            [this, &transcriptFidelities](const BlockedIndexRange& range) -> void {
+              // Each transcript this kmer group appears in votes on the bias of this kmer.
+              // Underrepresented kmers get bias values > 1.0 while overrepresented kmers get
+              // bias values < 1.0.  The vote of each transcript is weigted by it's fidelity
+                for (auto kid = range.begin(); kid != range.end(); ++kid) {
+                  double totalBias = 0.0;
+                  double totalFidelity = 0.0;
+                  for( auto tid : this->transcriptsForKmer_[kid] ) {
+                    auto& transcript = this->transcripts_[tid];
+                    auto fidelity = transcriptFidelities[tid];
+                    auto totalMean = transcript.mean * this->kmerGroupSizes_[kid];
+                    auto curAlloc = transcript.binMers[kid];
+                    totalBias += (curAlloc > 0.0) ? fidelity * (totalMean / curAlloc) : 0.0;
+                    totalFidelity += fidelity;
+                  }
+                  double bias = totalBias / totalFidelity;
+                  double alpha = 0.25; //std::min( 1.0, 10.0*averageFidelity / confidence);
+                  double prevBias = this->kmerGroupBiases_[kid];
+                  this->kmerGroupBiases_[kid] = alpha * bias + (1.0 - alpha) * prevBias;
+              }
+            }
+        );
+
+
+    }
+
+    double logLikelihood_(std::vector<double>& means) {
+
+      const auto numTranscripts = transcripts_.size();
+      std::vector<double> likelihoods(numTranscripts, 0.0);
+
+        // Compute the log-likelihood
+        tbb::parallel_for(BlockedIndexRange(size_t(0), numTranscripts),
+          // for each transcript
+          [&likelihoods, &means, this](const BlockedIndexRange& range) ->void {
+            auto epsilon = 1e-40;
+            for (auto tid = range.begin(); tid != range.end(); ++tid) {
+              auto& ti = transcripts_[tid];
+              double relativeAbundance = means[tid];
+
+              if (ti.binMers.size() > 0 ) { likelihoods[tid] = 1.0; }
+              // For each kmer in this transcript
+              for ( auto& binmer : ti.binMers ) {
+                likelihoods[tid] *=  binmer.second /
+                           (this->kmerGroupBiases_[binmer.first] * this->kmerGroupCounts_[binmer.first]);
+              }
+              likelihoods[tid] = (relativeAbundance > epsilon and likelihoods[tid] > epsilon) ?
+                                 std::log(relativeAbundance * likelihoods[tid]) : 0.0;
+            }
+          });
+
+      return psum_(likelihoods);
+
+    }
+
+    double expectedLogLikelihood_(std::vector<double>& means) {
+      // alpha in arXiv:1104.3889v2
+      std::vector<double> sampProbs(means.size(), 0.0);
+
+      tbb::parallel_for(BlockedIndexRange(size_t(0), means.size()),
+          // for each transcript
+          [&sampProbs, &means, this](const BlockedIndexRange& range) ->void {
+            for (auto tid = range.begin(); tid != range.end(); ++tid) {
+              auto& ti = transcripts_[tid];
+              double relativeAbundance = means[tid];
+              sampProbs[tid] = ti.length * relativeAbundance;
+            }});
+
+      normalize_(sampProbs);
+
+      return (means.size() > 0) ? logLikelihood2_(sampProbs) / means.size() : 0.0;
+    }
+
+    double logLikelihood2_(std::vector<double>& sampProbs) {
+
+      std::vector<double> likelihoods(transcriptsForKmer_.size(), 0.0);
+
+        // Compute the log-likelihood
+        tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcriptsForKmer_.size())),
+          // for each transcript
+          [&likelihoods, &sampProbs, this](const BlockedIndexRange& range) ->void {
+            for (auto kid = range.begin(); kid != range.end(); ++kid) {
+              double kmerLikelihood = 0.0;
+              KmerQuantity totalKmerMass = 0.0;
+              for (auto& tid : this->transcriptsForKmer_[kid]) {
+                double kmerMass{this->transcripts_[tid].binMers[kid]};
+                kmerLikelihood += kmerMass * (sampProbs[tid] / this->transcripts_[tid].length);
+                totalKmerMass += kmerMass;
+              }
+              // If
+              if (totalKmerMass > 0) {
+                  if (kmerLikelihood < 1e-20) {
+                      std::cerr << "kmer group: " << kid << " has probability (" << kmerLikelihood << "); too low\n";
+                  } else {
+                      likelihoods[kid] = std::log(kmerLikelihood);
+                  }
+              }
+            }
+          });
+
+      return psum_(likelihoods);
+
+    }
+
+
+    // Since there's no built in hash code for vectors
+  template <typename T>
+  class my_hasher{
+  public:
+    size_t operator()(const T& x) const {
+        if (x.size() == 0 ) { return 0; }
+        size_t seed = x[0];
+        for (auto i : boost::irange(size_t(1), x.size())) {
+            boost::hash_combine(seed, static_cast<size_t>(x[i]));
+        }
+        return seed;
+    }
+  };
+
+/**
+ * Collapses all kmer which share the same transcript multiset.  Such kmers can be
+ * treated as a "batch" with a count whose value is the sum of individual "batch"
+ * members.
+ * @param  isActiveKmer       [A bitvector which designates, for each kmer,
+ *                             whether or not that kmer is active in the current
+ *                             read set.]
+ */
+ void collapseKmers_( boost::dynamic_bitset<>& isActiveKmer ) {
+
+    auto numTranscripts = transcriptGeneMap_.numTranscripts();
+
+    /**
+     * Map from a vector of transcript IDs to the list of kmers that have this
+     * transcript list.  This allows us to collapse all kmers that exist in the
+     * exact same set of transcripts into a single kmer group.
+     */
+    tbb::concurrent_unordered_map< TranscriptIDVector,
+                                   tbb::concurrent_vector<KmerID>,
+                                   my_hasher<std::vector<TranscriptID>> > m;
+
+     // Asynchronously print out the progress of our hashing procedure
+     std::atomic<size_t> prog{0};
+     std::thread t([this, &prog]() -> void {
+        ez::ezETAProgressBar pb(this->transcriptsForKmer_.size());
+        pb.start();
+        size_t prevProg{0};
+        while ( prevProg < this->transcriptsForKmer_.size() ) {
+            if (prog > prevProg) {
+                auto diff = prog - prevProg;
+                pb += diff;
+                prevProg += diff;
+            }
+            boost::this_thread::sleep_for(boost::chrono::seconds(1));
+        }
+        if (!pb.isDone()) { pb.done(); }
+     });
+
+     //For every kmer, compute it's kmer group.
+     tbb::parallel_for(BlockedIndexRange(size_t(0), transcriptsForKmer_.size()),
+        [&](const BlockedIndexRange& range ) -> void {
+          for (auto j = range.begin(); j != range.end(); ++j) {
+            if (isActiveKmer[j]) {
+              m[ transcriptsForKmer_[j]  ].push_back(j);
+            }
+            ++prog;
+          }
+     });
+
+     // wait for the parallel hashing to finish
+     t.join();
+
+     // TESTING
+     std::ofstream eqFile("KMER_EQUIV_CLASSES.txt");
+     for (auto& kv : m ) {
+       for (auto e : kv.second) { eqFile << e << "\t"; }
+       eqFile << "\n";
+     }
+     eqFile.close();
+     // END TESTING
+
+     std::cerr << "Out of " << transcriptsForKmer_.size() << " potential kmers, "
+               << "there were " << m.size() << " distinct groups\n";
+
+     size_t totalKmers = 0;
+     size_t index = 0;
+     std::vector<KmerQuantity> kmerGroupCounts(m.size());
+     std::vector<Promiscutity> kmerGroupPromiscuities(m.size());
+     std::vector<TranscriptIDVector> transcriptsForKmer(m.size());
+     kmerGroupSizes_.resize(m.size(), 0);
+
+     using namespace boost::accumulators;
+     std::cerr << "building collapsed transcript map\n";
+     for ( auto& kv : m ) {
+
+        // For each transcript covered by this kmer group, add this group to the set of kmer groups contained in
+        // the transcript.  For efficiency, we also compute the kmer promiscuity values for each kmer
+        // group here --- the promiscuity of a kmer group is simply the number of distinct transcripts in
+        // which this group of kmers appears.
+        auto prevTID = std::numeric_limits<TranscriptID>::max();
+        KmerQuantity numDistinctTranscripts = 0.0;
+        for ( auto& tid : kv.first ) {
+          transcripts_[tid].binMers[index] += 1;
+          // Since the transcript IDs are sorted we just have to check
+          // if this id is different from the previous one
+          if (tid != prevTID) { numDistinctTranscripts += 1.0; }
+          prevTID = tid;
+        }
+        // Set the promiscuity and the set of transcripts for this kmer group
+        kmerGroupPromiscuities[index] = numDistinctTranscripts;
+        transcriptsForKmer[index] = kv.first;
+
+        // Aggregate the counts attributable to each kmer into its repective
+        // group's counts.
+        for (auto kid : kv.second) {
+            kmerGroupCounts[index] += readHash_.atIndex(kid);
+        }
+        kmerGroupSizes_[index] = kv.second.size();
+
+        // Update the total number of kmers we're accounting for
+        // and the index of the current kmer group.
+        totalKmers += kv.second.size();
+        ++index;
+      }
+
+      std::cerr << "Verifying that the unique set encodes " << totalKmers << " kmers\n";
+      std::cerr << "collapsedCounts.size() = " << transcriptsForKmer.size() << "\n";
+
+      // update the relevant structures holding info for the full kmer
+      // set with those holding the info for our collapsed kmer sets
+      std::swap(kmerGroupPromiscuities, kmerGroupPromiscuities_);
+      std::swap(kmerGroupCounts, kmerGroupCounts_);
+      std::swap(transcriptsForKmer, transcriptsForKmer_);
+
+      /*
+      uint64_t groupCounts = 0;
+      for(auto c : kmerGroupCounts_) { groupCounts += c; }
+      auto tmp = psum_(kmerGroupCounts_);
+      std::cerr << "groupCount(" << groupCounts << ") - parallelCount(" << tmp << ") = " << groupCounts - tmp << "\n";
+      std::atomic<uint64_t> individualCounts{0};
+      tbb::parallel_for(size_t{0}, readHash_.size(),
+        [&](size_t i) {
+          if (isActiveKmer[i]) {
+            individualCounts += readHash_.atIndex(i);
+          } });
+      auto diff = groupCounts - individualCounts;
+      std::cerr << "groupTotal(" << groupCounts << ") - totalNumKmers(" << individualCounts << ") = " << diff << "\n";
+      */
+  }
+
+
+  /**
+   * NEW! Assuming that equivalence classes were computed in the index
+   **/
+  void prepareCollapsedMaps_(
+                            const std::string& kmerEquivClassFname,
+                            bool discardZeroCountKmers) {
+
+    using std::cerr;
+
+    cerr << "reading Kmer equivalence classes \n";
+
+    auto memberships = LUTTools::readKmerEquivClasses(kmerEquivClassFname);
+    size_t numKmers{readHash_.size()};
+    size_t numKmerClasses{(*std::max_element(memberships.begin(), memberships.end())) + 1};
+    boost::dynamic_bitset<> isActiveKmer(numKmers);
+
+    kmerGroupCounts_.resize(numKmerClasses, 0.0);
+    kmerGroupSizes_.resize(numKmerClasses, 0.0);
+    kmerGroupPromiscuities_.resize(numKmerClasses, 0.0);
+
+    cerr << "updating Kmer group counts\n";
+    // Update the kmer group counts using the information from the read hash
+    for (auto kid : boost::irange(size_t{0}, numKmers)) {
+
+      size_t count = readHash_.atIndex(kid);
+      if (!discardZeroCountKmers or count != 0) {
+        auto kmerClassID = memberships[kid];
+        isActiveKmer[kmerClassID] = true;
+        kmerGroupCounts_[kmerClassID] += count;
+        kmerGroupSizes_[kmerClassID]++;
+      }
+
+    }
+
+    cerr << "updating transcript map\n";
+    for (auto kmerClassID : boost::irange(size_t{0}, numKmerClasses)) {
+
+      // For each transcript covered by this kmer group, add this group to the set of kmer groups contained in
+      // the transcript.  For efficiency, we also compute the kmer promiscuity values for each kmer
+      // group here --- the promiscuity of a kmer group is simply the number of distinct transcripts in
+      // which this group of kmers appears.
+      auto prevTID = std::numeric_limits<TranscriptID>::max();
+      KmerQuantity numDistinctTranscripts = 0.0;
+
+      //cerr << "numKmerClasses = " << numKmerClasses << ", kmerClassID = " << kmerClassID << ", transcriptsForKmer_.size() = " << transcriptsForKmer_.size() << "\n";
+      for (auto& tid : transcriptsForKmer_[kmerClassID]) {
+        transcripts_[tid].binMers[kmerClassID] += 1;
+        // Since the transcript IDs are sorted we just have to check
+        // if this id is different from the previous one
+        if (tid != prevTID) { numDistinctTranscripts += 1.0; }
+        prevTID = tid;
+      }
+      // Set the promiscuity and the set of transcripts for this kmer group
+      kmerGroupPromiscuities_[kmerClassID] = numDistinctTranscripts;
+      //kmerGroupPromiscuities_[kmerClassID] = transcriptsForKmer_[kmerClassID].size();
+    }
+    cerr << "done\n";
+
+    //promiscuousKmerCutoff_ = 50;
+    for (auto kmerClassID : boost::irange(size_t{0}, numKmerClasses)) {
+        if (kmerGroupPromiscuities_[kmerClassID] > promiscuousKmerCutoff_ ) {
+            kmerGroupCounts_[kmerClassID] = 0;
+        }
+    }
+    // By now we should have properly filled out the vairables
+    // kmerGroupPromiscuities_
+    // kmerGroupCounts_
+    // transcripts_
+  }
+
+
+  /**
+   * This function should be called before performing any optimization procedure.
+   * It builds all of the necessary data-structures which are used during the transcript
+   * quantification procedure.
+   * @param  klutfname [The name of the file containing the kmer lookup table.]
+   * @param  tlutfname [The name of the file containing the transcript lookup table.]
+   */
+    void initialize_(
+        const std::string& klutfname,
+        const std::string& tlutfname,
+        const std::string& kmerEquivClassFname,
+        const bool discardZeroCountKmers) {
+
+        // So we can concisely identify each transcript
+        TranscriptID transcriptIndex {0};
+
+        size_t numTranscripts = transcriptGeneMap_.numTranscripts();
+        size_t numKmers = readHash_.size();
+        auto merSize = readHash_.kmerLength();
+
+        size_t numActors = numThreads_;
+        std::vector<std::thread> threads;
+
+        transcripts_.resize(transcriptGeneMap_.numTranscripts());
+
+        // Get the kmer look-up-table from file
+        LUTTools::readKmerLUT(klutfname, transcriptsForKmer_);
+
+/*        size_t numContainingTranscripts = transcriptsForKmer_[kid].size();
+          assert(numContainingTranscripts > 0);
+          if (numContainingTranscripts == 1 or
+              std::all_of(transcriptsForKmer_[kid].begin(), transcriptsForKmer_[kid].end(), [this, kid](TranscriptID tid) -> bool {
+                return tid == this->transcriptsForKmer_[kid][0]; } )) {
+              uniquelyAnchoredTranscripts.insert(transcriptsForKmer_[kid][0]);
+          }
+
+        }
+*/
+        //std::cerr << "\nIn the original set, there were " << uniquelyAnchoredTranscripts.size() << " transcripts with unique nonzero anchors\n";
+
+        std::cerr << "\n";
+        //  collapseKmers_(isActiveKmer); // equiv-classes
+        prepareCollapsedMaps_(kmerEquivClassFname, discardZeroCountKmers);
+
+
+        // we have no k-mer-specific biases currently
+        kmerGroupBiases_.resize(transcriptsForKmer_.size(), 1.0);
+
+        // Get transcript lengths
+        std::ifstream ifile(tlutfname, std::ios::binary);
+        size_t numRecords {0};
+        ifile.read(reinterpret_cast<char *>(&numRecords), sizeof(numRecords));
+
+        std::cerr << "Transcript LUT contained " << numRecords << " records\n";
+        for (auto i : boost::irange(size_t(0), numRecords)) {
+            auto ti = LUTTools::readTranscriptInfo(ifile);
+            // copy over the length, then we're done.
+            transcripts_[ti->transcriptID].length = ti->length;
+            transcripts_[ti->transcriptID].effectiveLength = ti->length - merSize + 1;
+        }
+        ifile.close();
+        // --- done ---
+
+       // tbb::parallel_for( size_t(0), size_t(transcripts_.size()),
+       //     [this]( size_t idx ) {
+       //         auto& transcript = this->transcripts_[idx];
+       //         transcript.effectiveLength = transcript.effectiveLength - transcript.binMers.size();
+       //         for (auto binmer : transcript.binMers) {
+       //          transcript.effectiveLength += this->_weight(binmer.first);
+       //         }
+       // });
+
+        size_t numRes = 0;
+        std::cerr << "\n\nRemoving duplicates from kmer transcript lists ... ";
+        tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcriptsForKmer_.size())),
+            [&numRes, this](const BlockedIndexRange& range) -> void {
+                for (auto idx = range.begin(); idx != range.end(); ++idx) {
+                  auto& transcripts = this->transcriptsForKmer_[idx];
+                  // should already be sorted -- extra check can be removed eventually
+                  std::is_sorted(transcripts.begin(), transcripts.end());
+                  // Uniqify the transcripts
+                  auto it = std::unique(transcripts.begin(), transcripts.end());
+                  transcripts.resize(std::distance(transcripts.begin(), it));
+                  ++numRes;
+                }
+         });
+
+         std::cerr << "done\n";
+
+         std::cerr << "Computing kmer group promiscuity rates\n";
+         /* -- done
+         kmerGroupPromiscuities_.resize(transcriptsForKmer_.size());
+         tbb::parallel_for( size_t{0}, kmerGroupPromiscuities_.size(),
+            [this]( KmerID kid ) -> void { this->kmerGroupPromiscuities_[kid] = this->_weight(kid); }
+         );
+         */
+
+        tbb::parallel_for(BlockedIndexRange(size_t{0}, transcripts_.size()),
+          [&, this](const BlockedIndexRange& range) -> void {
+            for (auto tid = range.begin(); tid != range.end(); ++tid) {
+              auto& ti = this->transcripts_[tid];
+              for (auto& binmer : ti.binMers) {
+                if (binmer.second > promiscuousKmerCutoff_) {
+                  ti.effectiveLength -= 1.0;
+                }
+              }
+            }
+        });
+
+        /**
+         * gene-promiscuous kmers can never be added to a transcript's counts, so
+         * it's unfair to consider them in the transcripts effective length.
+         */
+        /*
+        std::for_each( genePromiscuousKmers_.begin(), genePromiscuousKmers_.end(),
+            [this]( KmerID kmerId ) -> void {
+                for ( auto tid : transcriptsForKmer_[kmerId] ) {
+                    transcripts_[tid].effectiveLength -= 1.0;
+                }
+            });
+        */
+        std::cerr << "done\n";
+
+        //return mappedReads;
+    }
+
+    void _dumpCoverage( const boost::filesystem::path &cfname) {
+        auto memberships = LUTTools::readKmerEquivClasses(kmerEquivClassFname_);
+
+        size_t numTrans = transcripts_.size();
+        size_t numProc = 0;
+        std::ifstream kmerStructFile("zm_transcript.map", std::ios::in | std::ios::binary);
+        std::unordered_map<std::string, std::vector<KmerID>> kstruct;
+        uint64_t tlen{0};
+        uint32_t nameLen{0};
+        uint64_t tid{0};
+        for (size_t i = 0; i < numTrans; ++i) {
+            kmerStructFile.read(reinterpret_cast<char*>(&nameLen), sizeof(nameLen));
+            std::string name(nameLen, ' ');
+            kmerStructFile.read(reinterpret_cast<char*>(&name[0]), nameLen * sizeof(name[0]));
+            kmerStructFile.read(reinterpret_cast<char*>(&tlen), sizeof(tlen));
+            kstruct[name].resize(tlen);
+            auto& tvec = kstruct[name];
+            kmerStructFile.read(reinterpret_cast<char*>(&tvec[0]), tlen * sizeof(kstruct[name][0]));
+        }
+
+        kmerStructFile.close();
+
+       std::ofstream ofile(cfname.native());
+
+        ofile << "# numtranscripts_\n";
+        ofile << "# transcript_name_{1} num_kmer_classes{1} class_1_count class_2_count ... class_{num_eq_clases}_count\n";
+        ofile << "# ... \n";
+
+        ofile << transcripts_.size() << "\n";
+
+        std::cerr << "Dumping coverage statistics to " << cfname << "\n";
+
+
+        tbb::concurrent_queue<StringPtr> covQueue;
+
+        tbb::parallel_for(BlockedIndexRange(size_t{0}, transcripts_.size()),
+             [this, &covQueue, &memberships, &kstruct] (const BlockedIndexRange& range) -> void {
+                for (auto index = range.begin(); index != range.end(); ++index) {
+                  auto& td = this->transcripts_[index];
+                  const auto& name = this->transcriptGeneMap_.transcriptName(index);
+
+                  std::stringstream ostream;
+                  ostream << this->transcriptGeneMap_.transcriptName(index) << " " << kstruct[name].size();
+
+                  std::map<uint64_t, double> kmerClassRelativeMass;
+
+                  for (auto bm : kstruct[name]) {
+                      auto kclass = memberships[bm];
+                      double totalMass = 0.0;
+                      for (auto tid : this->transcriptsForKmer_[kclass]) {
+                          totalMass += this->transcripts_[tid].mean;
+                      }
+                      kmerClassRelativeMass[kclass] = (totalMass > 0.0) ? td.mean / totalMass : 0.0;
+                  }
+
+                  for (auto bm : kstruct[name]) {
+                      auto kclass = memberships[bm];
+                      //double classSize = this->kmerGroupSizes_[kclass];
+                      //double mass = td.binMers[kclass];
+                      //ostream << " " << mass / classSize;
+                      auto count = readHash_.atIndex(bm);
+                      ostream << " " << count * kmerClassRelativeMass[kclass];
+                  }
+                  ostream << "\n";
+                  std::string* ostr = new std::string(ostream.str());
+                  covQueue.push(ostr);
+                  // for boost lockfree
+                  // while(!covQueue.push(ostr));
+              }
+            }
+        );
+
+        ez::ezETAProgressBar pb(transcripts_.size());
+        pb.start();
+
+        std::string* sptr = nullptr;
+        while ( numProc < numTrans ) {
+            while( covQueue.try_pop(sptr) ) {
+                ofile << (*sptr);
+                ++pb;
+                ++numProc;
+                delete sptr;
+            }
+        }
+
+        ofile.close();
+
+    }
+
+    void filterByCoverage_( double coverageCutoff, std::vector<double>& means  ) {
+
+        tbb::parallel_for(BlockedIndexRange(size_t{0}, transcripts_.size()),
+                          [this, coverageCutoff, &means] (const BlockedIndexRange& range) -> void {
+                              for (auto index = range.begin(); index != range.end(); ++index) {
+                                  double numCovered{1.0};
+                                  double totalNumKmers{1.0};
+                                  bool unAnchored{true};
+                                  const auto& td = this->transcripts_[index];
+
+                                  for ( auto bm : td.binMers ) {
+                                      double numKmersToCover = this->kmerGroupSizes_[bm.first];
+                                      numCovered += (bm.second > 0.0) ? numKmersToCover : 0.0;
+                                      totalNumKmers += numKmersToCover;
+                                      //if (bm.second > 1000 * numKmersToCover and (std::abs(bm.second - this->kmerGroupCounts_[bm.first]) < 1e-5)) {
+                                      //    unAnchored = false;
+                                      //    break;
+                                      //}
+
+                                  }
+
+                              if (unAnchored and ((numCovered / totalNumKmers) < coverageCutoff)) {
+                                  means[index] = this->transcripts_[index].mean = 0.0;
+                              }
+                              }
+                          }
+                          );
+
+    }
+
+    void EMUpdate_( const std::vector<double>& meansIn, std::vector<double>& meansOut ) {
+      assert(meansIn.size() == meansOut.size());
+
+      auto reqNumJobs = transcriptsForKmer_.size();
+
+      std::atomic<size_t> numJobs{0};
+      std::atomic<size_t> completedJobs{0};
+
+      // Print out our progress
+      auto pbthread = std::thread(
+        [&completedJobs, reqNumJobs]() -> bool {
+          auto prevNumJobs = 0;
+          ez::ezETAProgressBar show_progress(reqNumJobs);
+          show_progress.start();
+          while ( prevNumJobs < reqNumJobs ) {
+            if ( prevNumJobs < completedJobs ) {
+              show_progress += completedJobs - prevNumJobs;
+            }
+            prevNumJobs = completedJobs.load();
+            boost::this_thread::sleep_for(boost::chrono::seconds(1));
+          }
+          if (!show_progress.isDone()) { show_progress.done(); }
+          return true;
+        });
+
+        //  E-Step : reassign the kmer group counts proportionally to each transcript
+        tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcriptsForKmer_.size())),
+          // for each kmer group
+          [&completedJobs, &meansIn, &meansOut, this](const BlockedIndexRange& range) -> void {
+            for (auto kid : boost::irange(range.begin(), range.end())) {
+                auto kmer = kid;
+                // for each transcript containing this kmer group
+                auto& transcripts = this->transcriptsForKmer_[kmer];
+
+                double totalMass = 0.0;
+                for ( auto tid : transcripts ) {
+                    totalMass += meansIn[tid];
+                    /** TRY SPARSE
+                    auto& trans = this->transcripts_[tid];
+                    totalMass += trans.effectiveLength ? (meansIn[tid] - (trans.binMers[kmer] / trans.effectiveLength)) : 0.0;
+                    **/
+                }
+
+                double norm = (totalMass > 0.0) ? 1.0 / totalMass : 0.0;
+                for ( auto tid : transcripts ) {
+                  auto& trans = this->transcripts_[tid];
+                  auto lastIndex = trans.binMers.size()  - 1;
+                  // binMer based
+                  //auto idx = trans.weightNum++;
+                  //auto kmerIt = trans.binMers.find(kmer);
+
+                  trans.binMers[kmer] = meansIn[tid] * norm *
+                                        kmerGroupBiases_[kmer] * this->kmerGroupCounts_[kmer];
+                  /** TRY SPARSE
+                  double myMass = trans.effectiveLength ? (meansIn[tid] - (trans.binMers[kmer] / trans.effectiveLength)) : 0.0;
+                  trans.binMers[kmer] = myMass * norm *
+                      kmerGroupBiases_[kmer] * this->kmerGroupCounts_[kmer];
+                  **/
+
+                  // If we've seen all of the k-mers that appear in this transcript,
+                  // then we can compute it's new estimated abundance
+                  if (trans.updated++ == lastIndex) {
+                    trans.mean = meansOut[tid] = this->_computeMean(trans);
+                    //trans.mean = meansOut[tid] = this->_computeClampedMean(trans);
+
+                    //trans.weightNum.store(0);
+                    trans.updated.store(0);
+                  }
+                        //this->transcripts_[tid].binMers[kmer] =
+                        //meansIn[tid] * norm * kmerGroupBiases_[kmer] * this->kmerGroupCounts_[kmer];
+
+
+
+
+                        /*
+                        auto idx = trans.weightNum++;
+                        auto weight = meansIn[tid] * norm * kmerGroupBiases_[kmer] * this->kmerGroupCounts_[kmer];
+                        trans.logLikes[idx] = (weight > 0.0) ? std::log(weight / this->kmerGroupCounts_[kmer] ) : 0.0;
+                        trans.weights[idx] = weight;
+                        ++trans.updated;
+                        if (idx == trans.weights.size()-1) {
+                          while (trans.updated.load() < trans.weightNum.load() ) {}
+                          meansOut[tid] = this->_computeMean(trans);
+                          trans.weightNum.store(0);
+                          trans.updated.store(0);
+                        }*/
+
+                        /*
+                        auto& weight = trans.totalWeight;
+                        auto current = weight.load();
+                        auto updated = current + delta;
+                        while( !weight.compare_exchange_strong(current, updated) ) {
+                          current = weight.load(); updated = current + delta;
+                        }
+                        */
+                  }
+
+
+              ++completedJobs;
+            } // for kid in range
+
+          });
+
+          // wait for all kmer groups to be processed
+          pbthread.join();
+
+          /*
+          double delta = 0.0;
+          double norm = 1.0 / transcripts_.size();
+          size_t discard = 0;
+
+
+          // M-Step
+          // compute the new mean for each transcript
+          tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcripts_.size())),
+            [this, &meansOut](const BlockedIndexRange& range) -> void {
+              for (auto tid : boost::irange(range.begin(), range.end())) {
+                auto& ts = this->transcripts_[tid];
+                auto tsNorm = 1.0;//(ts.effectiveLength > 0.0) ? 1.0 / std::sqrt(ts.effectiveLength) : 1.0;
+                meansOut[tid] = tsNorm * this->_computeMean( ts );
+                ts.weightNum.store(0);
+              }
+          });
+          */
+
+          normalize_(meansOut);
+    }
+
+
+public:
+    /**
+     * Construct the solver with the read and transcript hashes
+     */
+    CollapsedIterativeOptimizer( ReadHash &readHash, TranscriptGeneMap& transcriptGeneMap,
+                                 BiasIndex& biasIndex, uint32_t numThreads ) :
+                                 readHash_(readHash), merLen_(readHash.kmerLength()),
+                                 transcriptGeneMap_(transcriptGeneMap), biasIndex_(biasIndex),
+                                 numThreads_(numThreads) {}
+
+
+    KmerQuantity optimize(const std::string& klutfname,
+                          const std::string& tlutfname,
+                          const std::string& kmerEquivClassFname,
+                          size_t numIt,
+                          double minMean,
+                          double maxDelta) {
+
+        kmerEquivClassFname_ = kmerEquivClassFname;
+        const bool discardZeroCountKmers = true;
+        initialize_(klutfname, tlutfname, kmerEquivClassFname, discardZeroCountKmers);
+
+        KmerQuantity globalError {0.0};
+        bool done {false};
+        std::atomic<size_t> numJobs {0};
+        std::atomic<size_t> completedJobs {0};
+        std::vector<KmerID> kmerList( transcriptsForKmer_.size(), 0 );
+        size_t idx = 0;
+
+        tbb::task_scheduler_init tbb_init(numThreads_);
+
+        std::cerr << "Computing initial coverage estimates ... ";
+
+        std::vector<double> means0(transcripts_.size(), 0.0);
+        std::vector<double> means1(transcripts_.size(), 0.0);
+        std::vector<double> means2(transcripts_.size(), 0.0);
+        std::vector<double> meansPrime(transcripts_.size(), 0.0);
+
+        std::vector<double> r(transcripts_.size(), 0.0);
+        std::vector<double> v(transcripts_.size(), 0.0);
+        std::atomic<size_t> uniquelyAnchoredTranscripts{0};
+        std::atomic<size_t> nonZeroTranscripts{0};
+        
+        loggedCounts_ = true;
+        logKmerGroupCounts_();
+
+        // Compute the initial mean for each transcript
+        tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcriptGeneMap_.numTranscripts())),
+        [this, &uniquelyAnchoredTranscripts, &nonZeroTranscripts, &means0](const BlockedIndexRange& range) -> void {
+            for (auto tid = range.begin(); tid != range.end(); ++tid) {
+              auto& transcriptData = this->transcripts_[tid];
+              KmerQuantity total = 0.0;
+              //transcriptData.weights.resize(transcriptData.binMers.size());
+              //transcriptData.logLikes.resize(transcriptData.binMers.size());
+
+              bool notTotallyPromiscuous{false};
+              bool nonZero{false};
+
+              for ( auto & kv : transcriptData.binMers ) {
+                auto kmer = kv.first;
+                if ( this->genePromiscuousKmers_.find(kmer) == this->genePromiscuousKmers_.end() ){
+                    // count is the number of times kmer appears in transcript (tid)
+                    auto logCount = std::log(kv.second);
+                    kv.second = logCount * std::log(this->kmerGroupCounts_[kmer]) * std::log(this->weight_(kmer));
+                    if (kv.second > 0.0 and kmerGroupPromiscuities_[kmer] == 1) { notTotallyPromiscuous = true; }
+                    if (kv.second > 0.0) { nonZero = true; }
+                    //transcriptData.weights[transcriptData.weightNum++] = kv.second;
+                    //total += kv.second;
+                }
+              }
+              //transcriptData.totalWeight.store(total);
+              //transcriptData.weightNum.store(0);
+
+              transcriptData.mean = means0[tid] = this->_computeMean(transcriptData);
+              //transcriptData.mean = means0[tid] = this->_computeClampedMean(transcriptData);
+
+              //transcriptData.mean = notTotallyPromiscuous ? means0[tid] = this->_computeMean(transcriptData) : 0.0;
+              if (notTotallyPromiscuous) { ++uniquelyAnchoredTranscripts; }
+              if (nonZero) { ++nonZeroTranscripts; }
+              //transcriptData.mean = means0[tid] = this->_computeWeightedMean(transcriptData);
+              //transcriptData.mean = means0[tid] = 1.0 / this->transcripts_.size();//this->_computeMean(transcriptData);
+              //transcriptData.mean = this->_computeWeightedMean(transcriptData);
+              //transcriptData.mean = distribution(generator);
+              //this->_computeWeightedMean( transcriptData );
+            }
+        }
+        );
+        normalizeTranscriptMeans_();
+        normalize_(means0);
+
+        std::cerr << "\nThere were " << uniquelyAnchoredTranscripts.load() << " uniquely anchored transcripts\n";
+        std::cerr << "There were " << nonZeroTranscripts.load() << " transcripts with at least one overlapping k-mer\n";
+        std::cerr << "done\n";
+        size_t outerIterations = 1;
+        /*
+        for ( size_t iter = 0; iter < numIt; ++iter ) {
+          std::cerr << "EM iteraton: " << iter << "\n";
+          EMUpdate_(means0, means1);
+          std::swap(means0, means1);
+        }
+        */
+
+        /**
+         * Defaults for these values taken from the R implementation of
+         * [SQUAREM](http://cran.r-project.org/web/packages/SQUAREM/index.html).
+         */
+        double minStep0, minStep, maxStep0, maxStep, mStep, nonMonotonicity;
+        minStep0 = 1.0; minStep = 1.0;
+        maxStep0 = 1.0; maxStep = 1.0;
+        mStep = 4.0;
+        nonMonotonicity = 1.0;
+
+        double negLogLikelihoodOld = std::numeric_limits<double>::infinity();
+        double negLogLikelihoodNew = std::numeric_limits<double>::infinity();
+
+        std::function<bool(std::vector<double>&, std::vector<double>&)> hasConverged;
+        if (std::isfinite(maxDelta)) {
+            hasConverged = [maxDelta, this] (std::vector<double>& v0, std::vector<double>& v1) -> bool {
+                double minVal = 1e-7;
+                auto relDiff = this->relAbsDiff_(v0, v1, minVal);
+                double maxRelativeChange = *std::max_element( relDiff.begin(), relDiff.end() );
+                std::cerr << "max relative change: " << maxRelativeChange << "\n";
+                return maxRelativeChange < maxDelta;
+            };
+        } else {
+            hasConverged = [] (std::vector<double>& v0, std::vector<double>& v1) -> bool {
+                std::cerr << "no data-driven convergence criterion specified\n";
+                return false;
+            };
+        }
+
+        std::string clearline = "                                                                                \r\r";
+
+        // Until we've reached the specified maximum number of iterations, or hit ourt
+        // tolerance threshold
+        for ( size_t iter = 0; iter < numIt; ++iter ) {
+            std::string jumpBack = "\x1b[A";
+            std::cerr << clearline << "SQUAREM iteraton [" << iter << "]\n";
+            jumpBack += "\x1b[A";
+
+
+          // Theta_1 = EMUpdate(Theta_0)
+          std::cerr << clearline << "1/3\n";
+          EMUpdate_(means0, means1);
+          jumpBack += "\x1b[A\x1b[A";
+
+          if (!std::isfinite(negLogLikelihoodOld)) {
+            negLogLikelihoodOld = -expectedLogLikelihood_(means0);
+          }
+
+          // Theta_2 = EMUpdate(Theta_1)
+          std::cerr << clearline << "2/3\n";
+          EMUpdate_(means1, means2);
+          jumpBack += "\x1b[A\x1b[A";
+
+          // Check for data-driven convergence criteria
+          if (hasConverged(means1, means2)) {
+              std::cerr << "convergence criteria met; terminating SQUAREM\n";
+              break;
+          }
+
+          double delta = pabsdiff_(means1, means2);
+          std::cerr << clearline << "delta = " << delta << "\n";
+          jumpBack += "\x1b[A";
+
+          // r = Theta_1 - Theta_0
+          // v = (Theta_2 - Theta_1) - r
+          tbb::parallel_for(BlockedIndexRange(size_t(0), transcripts_.size()),
+            [&means0, &means1, &means2, &r, &v](const BlockedIndexRange& range) -> void {
+              for (auto tid = range.begin(); tid != range.end(); ++tid) {
+                r[tid] = means1[tid] - means0[tid];
+                v[tid] = (means2[tid] - means1[tid]) - r[tid];
+              }
+            }
+          );
+
+          double rNorm = std::sqrt(dotProd_(r,r));
+          double vNorm = std::sqrt(dotProd_(v,v));
+          double alphaS = rNorm / vNorm;
+
+          alphaS = std::max(minStep, std::min(maxStep, alphaS));
+
+          tbb::parallel_for(BlockedIndexRange(size_t(0), transcripts_.size()),
+            [&r, &v, alphaS, &means0, &meansPrime](const BlockedIndexRange& range) -> void {
+              for (auto tid = range.begin(); tid != range.end(); ++tid) {
+                meansPrime[tid] = std::max(0.0, means0[tid] + 2*alphaS*r[tid] + (alphaS*alphaS)*v[tid]);
+              }
+            }
+          );
+
+          // Stabilization step
+          if ( std::abs(alphaS - 1.0) > 0.01) {
+            std::cerr << clearline << "alpha = " << alphaS << ". ";
+            std::cerr << "Performing a stabilization step.\n";
+            EMUpdate_(meansPrime, meansPrime);
+            jumpBack += "\x1b[A\x1b[A";
+          }
+
+          /** Check for an error in meansPrime **/
+
+          /** If there is **/
+          if (std::isfinite(nonMonotonicity)) {
+            negLogLikelihoodNew = -expectedLogLikelihood_(meansPrime);
+          } else {
+            negLogLikelihoodNew = negLogLikelihoodOld;
+          }
+
+          if (negLogLikelihoodNew > negLogLikelihoodOld + nonMonotonicity) {
+            std::swap(meansPrime, means2);
+            negLogLikelihoodNew = -expectedLogLikelihood_(meansPrime);
+            if (alphaS == maxStep) { maxStep = std::max(maxStep0, maxStep/mStep); }
+            alphaS = 1.0;
+          }
+          std::cerr << clearline << "alpha = " << alphaS << ", ";
+
+          if (alphaS == maxStep) { maxStep = mStep * maxStep; }
+          if (minStep < 0 and alphaS == minStep) { minStep = mStep * minStep; }
+          std::swap(meansPrime, means0);
+
+          if (!std::isnan(negLogLikelihoodNew)) {
+            std::cerr << "negLogLikelihood = " << negLogLikelihoodNew << "\n";
+            jumpBack += "\x1b[A";
+            negLogLikelihoodOld = negLogLikelihoodNew;
+
+          }
+
+          bool doFilter{true};
+          if ( doFilter and ((iter == numIt - 1) or (iter > 0 and iter % 10 == 0))) {
+              double itFrac = iter / static_cast<double>(numIt);
+              // cutoff =  e^{x^2 - 0.5}
+              double cutoff = std::min(1.0, std::exp((itFrac*itFrac) - 0.12));//std::max(0.5, std::min(1.0, (iter + 10) / static_cast<double>(numIt)));
+              std::cerr << clearline << "Iteration[" << iter << "] :: Filter cutoff = " << cutoff << "\n";
+              jumpBack += "\x1b[A";
+              filterByCoverage_(cutoff, means0);
+          }
+
+          // If it's not the final iteration, then jump back and clear the console.
+          if (iter < numIt - 1) { std::cerr << jumpBack; }
+
+
+          /** Filter out super low-abundance transcripts **/
+          /*          double epsilon = 10e-8;
+          tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcripts_.size())),
+                            [&means0, epsilon, this](const BlockedIndexRange& range) -> void {
+                                 for (auto tid : boost::irange(range.begin(), range.end())) {
+                                     if (means0[tid] < epsilon) {
+                                         means0[tid] = 0.0;
+                                         this->transcripts_[tid].mean = 0.0;
+                                     }
+                                 }
+                             });
+          */
+
+          // if (iter > 0 and iter % 5 == 0 and iter < numIt - 1) {
+          //   std::cerr << "updating kmer biases: . . . ";
+          //   tbb::parallel_for( BlockedIndexRange(size_t(0), size_t(transcripts_.size())),
+          //     [&means0, this](const BlockedIndexRange& range) -> void {
+          //       for (auto tid : boost::irange(range.begin(), range.end())) {
+          //         this->transcripts_[tid].mean = means0[tid];
+          //       }
+          //     });
+
+          //   computeKmerFidelities_();
+          //   std::cerr << "done\n";
+          // }
+
+
+        }
+
+
+        /*
+        for ( size_t oiter = 0; oiter < outerIterations; ++oiter ) {
+            for ( size_t iter = 0; iter < numIt; ++iter ) {
+
+                auto reqNumJobs = transcriptsForKmer_.size();
+                std::cerr << "iteraton: " << iter << "\n";
+
+                globalError = 0.0;
+                numJobs = 0;
+                completedJobs = 0;
+
+                // Print out our progress
+                auto pbthread = std::thread(
+                    [&completedJobs, reqNumJobs]() -> bool {
+                        auto prevNumJobs = 0;
+                        ez::ezETAProgressBar show_progress(reqNumJobs);
+                        show_progress.start();
+                        while ( prevNumJobs < reqNumJobs ) {
+                            if ( prevNumJobs < completedJobs ) {
+                                show_progress += completedJobs - prevNumJobs;
+                            }
+                            prevNumJobs = completedJobs.load();
+                            boost::this_thread::sleep_for(boost::chrono::seconds(1));
+                        }
+                        return true;
+                    });
+
+                //  E-Step : reassign the kmer group counts proportionally to each transcript
+                tbb::parallel_for( size_t(0), size_t(transcriptsForKmer_.size()),
+                    // for each kmer group
+                    [&kmerList, &completedJobs, this]( size_t kid ) {
+                        auto kmer = kid;
+                        if ( this->genePromiscuousKmers_.find(kmer) == this->genePromiscuousKmers_.end() ){
+
+                            // for each transcript containing this kmer group
+                            auto &transcripts = this->transcriptsForKmer_[kmer];
+                            if ( transcripts.size() > 0 ) {
+
+                                double totalMass = 0.0;
+                                for ( auto tid : transcripts ) {
+                                    totalMass += this->transcripts_[tid].mean;
+                                }
+
+                                if ( totalMass > 0.0 ) {
+                                    double norm = 1.0 / totalMass;
+                                    for ( auto tid : transcripts ) {
+                                        if ( this->transcripts_[tid].mean > 0.0 ) {
+                                            this->transcripts_[tid].binMers[kmer] =
+                                            this->transcripts_[tid].mean * norm * kmerGroupBiases_[kmer] * this->kmerGroupCounts_[kmer];
+                                        }
+                                    }
+                                }
+
+                            }
+                        }
+                        ++completedJobs;
+                    });
+
+                // wait for all kmer groups to be processed
+                pbthread.join();
+
+                // reset the job counter
+                completedJobs = 0;
+
+                double delta = 0.0;
+                double norm = 1.0 / transcripts_.size();
+
+                std::vector<KmerQuantity> prevMeans( transcripts_.size(), 0.0 );
+                tbb::parallel_for( size_t(0), size_t(transcripts_.size()),
+                    [this, &prevMeans]( size_t tid ) -> void { prevMeans[tid] = this->transcripts_[tid].mean; });
+
+                std::cerr << "\ncomputing new means ... ";
+                size_t discard = 0;
+
+                // M-Step
+                // compute the new mean for each transcript
+                tbb::parallel_for( size_t(0), size_t(transcripts_.size()),
+                    [this, iter, numIt, norm, minMean, &discard]( size_t tid ) -> void {
+                        auto& ts = this->transcripts_[tid];
+                        auto tsNorm = 1.0;//(ts.effectiveLength > 0.0) ? 1.0 / std::sqrt(ts.effectiveLength) : 1.0;
+                        //ts.mean = tsNorm * this->_computeWeightedMean( ts );
+                        //ts->mean = tsNorm * this->averageCount( ts );
+                        ts.mean = tsNorm * this->_computeMean( ts );
+                        //ts.mean = tsNorm * this->_computeMedian( ts );
+                });
+
+                normalizeTranscriptMeans_();
+                for( auto tid : boost::irange(size_t{0}, prevMeans.size()) ){
+                    delta += std::abs( transcripts_[tid].mean - prevMeans[tid] );
+                }
+
+                std::cerr << "done\n";
+                std::cerr << "total variation in mean = " << delta << "\n";
+                std::cerr << "discarded " << discard << " transcripts in this round whose mean was below " << minMean << "\n";
+            }
+
+            std::cerr << "end of outer iteration " << oiter << " recomputing biases\n";
+            // Thresholding
+            tbb::parallel_for( size_t(0), size_t(transcripts_.size()),
+                [this, minMean](size_t tid) -> void {
+                    auto& ts = this->transcripts_[tid];
+                    if (ts.mean < minMean) {
+                        ts.mean = 0.0;
+                        for (auto& kv : ts.binMers) {
+                            kv.second = 0.0;
+                        }
+                    }
+            });
+            //computeKmerFidelities_();
+        }
+        */
+
+    }
+
+
+    void writeAbundances(const boost::filesystem::path& outputFilePath,
+                         const std::string& headerLines,
+                         double minAbundance,
+                         bool haveCI) {
+
+        haveCI = false;
+
+        auto writeCoverageInfo = true;
+        if ( writeCoverageInfo ) {
+            boost::filesystem::path covPath = outputFilePath.parent_path();
+            covPath /= "equivClassCoverage.txt";
+            _dumpCoverage( covPath );
+        }
+
+        std::cerr << "Writing output\n";
+        ez::ezETAProgressBar pb(transcripts_.size());
+        pb.start();
+
+        auto estimatedGroupTotal = psum_(kmerGroupCounts_);
+        auto totalNumKmers = readHash_.totalLength() * (std::ceil(readHash_.averageLength()) - readHash_.kmerLength() + 1);
+        std::vector<KmerQuantity> fracTran(transcripts_.size(), 0.0);
+        std::vector<KmerQuantity> fracTranLow(transcripts_.size(), 0.0);
+        std::vector<KmerQuantity> fracTranHigh(transcripts_.size(), 0.0);
+
+        std::vector<KmerQuantity> numKmersFromTranscript(transcripts_.size(), 0.0);
+
+        // Compute nucleotide fraction (\nu_i in RSEM)
+        tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcripts_.size())),
+           [this, &numKmersFromTranscript, &fracTran, totalNumKmers, estimatedGroupTotal](const BlockedIndexRange& range) -> void {
+            for (auto tid = range.begin(); tid != range.end(); ++tid) {
+              auto& ts = this->transcripts_[tid];
+              //fracTran[tid] = this->_computeMean(ts);
+
+              // EM
+              numKmersFromTranscript[tid] = this->_computeSum(ts);
+
+              // VB
+              //numKmersFromTranscript[tid] = ts.mean * estimatedGroupTotal;//this->_computeSum(ts);
+              //fracTran[tid] = this->_computeClampedMean( ts );
+            }
+        });
+        // noise transcript
+        // fracTran[transcripts_.size()] = totalNumKmers - estimatedGroupTotal;
+
+        // Normalize to obtain the fraction of kmers that originated from each transcript
+        normalize_(numKmersFromTranscript);
+        auto& fracNuc = numKmersFromTranscript;
+
+
+        // Compute transcript fraction (\tau_i in RSEM)
+        tbb::parallel_for(BlockedIndexRange(size_t(0), transcripts_.size()),
+          [&, this](const BlockedIndexRange& range) -> void {
+            for (auto i = range.begin(); i != range.end(); ++i) {
+              auto& ts = transcripts_[i];
+
+              fracTran[i] = this->_computeMean(ts);
+
+              //fracTran[i] = (ts.effectiveLength > 0) ? ts.mean / ts.effectiveLength : 0.0;
+              fracTranLow[i] = (ts.effectiveLength > 0) ? ts.fracLow / ts.effectiveLength : 0.0;
+              fracTranHigh[i] = (ts.effectiveLength > 0) ? ts.fracHigh / ts.effectiveLength : 0.0;
+
+            }
+          });
+        //normalize_(fracTran);
+
+        auto fracNorm = 1.0 / psum_(fracTran);
+        tbb::parallel_for(BlockedIndexRange(size_t(0), transcripts_.size()),
+                          [&, fracNorm](const BlockedIndexRange& range) -> void {
+                              for (auto i = range.begin(); i != range.end(); ++i) {
+                                  auto& ts = transcripts_[i];
+                                  fracTran[i] *= fracNorm;
+                                  fracTranLow[i] *= fracNorm;
+                                  fracTranHigh[i] *= fracNorm;
+                              }
+        });
+
+
+        std::ofstream ofile( outputFilePath.string() );
+        size_t index = 0;
+        double million = std::pow(10.0, 6);
+        double billion = std::pow(10.0, 9);
+        double estimatedReadLength = readHash_.averageLength();
+        double kmersPerRead = ((estimatedReadLength - merLen_) + 1);
+        double totalMappedKmers = std::accumulate(numKmersFromTranscript.begin(),
+                                                  numKmersFromTranscript.end(),
+                                                  0.0);
+        double totalMappedReads = totalMappedKmers / kmersPerRead;
+
+        ofile << headerLines;
+
+        ofile << "# " <<
+            "Transcript" << '\t' <<
+            "Length" << '\t' <<
+            "TPM" << '\t' <<
+            "RPKM" << '\t' <<
+            "KPKM" << '\t' <<
+            "EstimatedNumReads";
+
+        if (haveCI) {
+            ofile << '\t' << "TPM_LOW" << '\t' << "TPM_HIGH";
+        }
+
+        ofile << '\n';
+
+        for ( auto i : boost::irange(size_t{0}, transcripts_.size()) ) {
+          auto& ts = transcripts_[i];
+          // expected # of kmers coming from transcript i
+
+          // auto ci = estimatedGroupTotal * fracNuc[i];
+          auto ci = numKmersFromTranscript[i];
+
+          // expected # of reads coming from transcript i
+          auto ri = ci / kmersPerRead;
+          double effectiveLength = ts.length - std::floor(estimatedReadLength) + 1;
+          double effectiveLengthKmer = ts.length - merLen_ + 1;
+
+          auto kpkm = (effectiveLengthKmer > 0) ?
+            (ci * billion) / (effectiveLengthKmer * totalMappedKmers) : 0.0;
+          kpkm = (kpkm < minAbundance) ? 0.0 : kpkm;
+
+          auto rpkm = (effectiveLength > 0) ?
+            (ri * billion) / (effectiveLength * totalMappedReads) : 0.0;
+          rpkm = (kpkm < minAbundance) ? 0.0 : rpkm;
+
+          // PREVIOUS
+          // auto ci = estimatedGroupTotal * fracNuc[i];
+          /*
+          auto kpkm = (effectiveLengthKmer > 0) ?
+            (billion * (fracNuc[i] / ts.effectiveLength)) : 0.0;
+          kpkm = (kpkm < minAbundance) ? 0.0 : kpkm;
+          auto rpkm = (effectiveLength > 0) ?
+          (billion * (fracNuc[i] / ts.effectiveLength)) : 0.0;
+          rpkm = (kpkm < minAbundance) ? 0.0 : rpkm;
+
+          */
+
+
+          auto tpm = fracTran[i] * million;
+          tpm = (kpkm < minAbundance) ? 0.0 : tpm;
+
+          ofile << transcriptGeneMap_.transcriptName(index) <<
+                   '\t' << ts.length << '\t' <<
+                   tpm << '\t' <<
+                   rpkm << '\t' <<
+                   kpkm << '\t' <<
+                   ri;
+          if (haveCI) {
+              ofile << '\t' <<
+                  fracTranLow[i] * million << '\t' <<
+                  fracTranHigh[i] * million;
+          }
+          ofile << '\n';
+
+          ++index;
+          ++pb;
+        }
+        ofile.close();
+    }
+
+
+    /**
+     *  Instead of using the EM (SQUAREM) algorithm, infer the posterior distribution
+     *  using Streaming, Distributed, Asynchronous (SDA) variational Bayes
+     */
+    KmerQuantity optimizeVB(const std::string& klutfname,
+                          const std::string& tlutfname,
+                          const std::string& kmerEquivClassFname,
+                          size_t numIt,
+                          double minMean,
+                          double maxDelta) {
+
+
+         kmerEquivClassFname_ = kmerEquivClassFname;
+        // Prepare the necessary structures
+        const bool discardZeroCountKmers = false;
+        initialize_(klutfname, tlutfname, kmerEquivClassFname, discardZeroCountKmers);
+
+        const size_t numTranscripts = transcripts_.size();
+        const size_t numKmers = transcriptsForKmer_.size();
+
+        // Set the appropriate, user-specified, convergence criteria
+        std::function<bool(std::vector<double>&, std::vector<double>&)> hasConverged;
+        if (std::isfinite(maxDelta)) {
+            hasConverged = [maxDelta, this] (std::vector<double>& v0, std::vector<double>& v1) -> bool {
+                double maxVal = *std::max_element(v1.begin(), v1.end());
+                std::cerr << "maxVal: " << maxVal << "\n";
+                double minVal = 1e-7;
+                auto relDiff = this->relAbsDiff_(v0, v1, minVal);
+                double maxRelativeChange = *std::max_element( relDiff.begin(), relDiff.end() );
+                std::cerr << "max relative change: " << maxRelativeChange << "\n";
+                return maxRelativeChange < maxDelta;
+            };
+        } else {
+            hasConverged = [] (std::vector<double>& v0, std::vector<double>& v1) -> bool {
+                std::cerr << "no data-driven convergence criterion specified\n";
+                return false;
+            };
+        }
+
+
+        // We'll use these to check convergence
+        std::vector<double> meansOld(transcripts_.size(), 0.0);
+        std::vector<double> meansNew(transcripts_.size(), 0.0);
+
+
+        std::atomic<size_t> uniquelyAnchoredTranscripts{0};
+        std::atomic<size_t> nonZeroTranscripts{0};
+
+        const double DirichletPriorAlpha = 0.1;
+        std::vector<double> priorAlphas(transcripts_.size(), 0.0);
+        std::vector<double> posteriorAlphas(transcripts_.size(), 0.0);
+        /*
+        std::vector<double> kmerWeights(numKmers, 0.0);
+        for (auto tid : boost::irange(size_t{0}, numTranscripts)) {
+            auto& transcriptData = transcripts_[tid];
+            for ( auto & kv : transcriptData.binMers ) {
+                kmerWeights[kv.first] += kv.second * weight_(kv.first);
+            }
+        }
+        for (auto kw : kmerWeights) {
+            if (std::abs(kw - 1.0) > 1e-3) {
+                std::cerr << "Kmer had weight " << kw << "\n";
+            }
+        }
+        */
+
+        // Compute the initial mean for each transcript
+        tbb::parallel_for(BlockedIndexRange(size_t(0), numTranscripts),
+                          [this, DirichletPriorAlpha, &uniquelyAnchoredTranscripts, &nonZeroTranscripts, &meansOld, &posteriorAlphas](const BlockedIndexRange& range) -> void {
+            for (auto tid = range.begin(); tid != range.end(); ++tid) {
+              auto& transcriptData = this->transcripts_[tid];
+              KmerQuantity total = 0.0;
+
+              bool notTotallyPromiscuous{false};
+              bool nonZero{false};
+
+              auto effLength = transcriptData.effectiveLength;
+              for ( auto & kv : transcriptData.binMers ) {
+                auto kmer = kv.first;
+
+                // count is the number of times occurrences of the k-mer in this transcript
+                auto count = kv.second;
+
+                // weight(kmer) = 1 / total # occurences
+                // count = # occurences in this transcript
+                // kmerGroupCounts(kmer) = # of observations of kmer in the read set
+                // The weight attributed to this transcript (kv.second) is:
+                // count * weight(kmer) * kmerGroupCounts(kmer) = (# occurrences in ts / total # occurrences) * total count
+                kv.second = (effLength  > 0.0) ?
+                    (count * this->kmerGroupCounts_[kmer] * this->weight_(kmer)) :
+                    0.0;
+
+                if (kv.second > 0.0 and kmerGroupPromiscuities_[kmer] == 1) { notTotallyPromiscuous = true; }
+                if (kv.second > 0.0) { nonZero = true; }
+              }
+
+              //transcriptData.mean = this->_computeMean(transcriptData);
+              auto weightedKmerSum = this->_computeSum(transcriptData);
+              // Set \alpha_t = \alpha_0 + \mu_t
+              posteriorAlphas[tid] = DirichletPriorAlpha ;//+ weightedKmerSum;
+              if (notTotallyPromiscuous) { ++uniquelyAnchoredTranscripts; }
+              if (nonZero) { ++nonZeroTranscripts; }
+            }
+        }
+        );
+
+        auto posteriorAlphaSum = psum_(posteriorAlphas);
+        std::cerr << "posteriorAlphaSum : " << posteriorAlphaSum << " [before iterations]\n";
+        for (size_t i : boost::irange({0}, numTranscripts)) {
+            meansOld[i] = posteriorAlphas[i] / posteriorAlphaSum;
+        }
+
+        std::vector<double> expctedLogThetas(transcripts_.size(), 0.0);
+        std::vector<double> logRho(transcripts_.size(), -std::numeric_limits<double>::infinity());
+        std::string clearline = "                                                                                \r\r";
+
+
+        for (size_t currIt : boost::irange({0}, numIt)) {
+            std::string jumpBack = "\x1b[A";
+            std::cerr << clearline << "iteration : " << currIt << "\n";
+            // log rho_{ntsoa} = E_{theta}[log theta_t] + log P(S_n | T_n) + [other terms sum to 0]
+            // E_{theta}[log theta_t] = digamma(alpha_t) - digamma(\sum_{t'} alpha_{t'})
+            double sumAlpha = psum_(posteriorAlphas);
+            double digammaSumAlpha = boost::math::digamma(sumAlpha);
+            for (size_t i : boost::irange({0}, numTranscripts)) {
+                auto& ts = transcripts_[i];
+                //double digammaAlphaT = boost::math::digamma(posteriorAlphas[i]);
+
+                logRho[i] = (ts.effectiveLength > 0 and posteriorAlphas[i] > 0.0) ?
+                    (boost::math::digamma(posteriorAlphas[i]) - digammaSumAlpha) + std::log(1.0 / ts.effectiveLength) :
+                    -std::numeric_limits<double>::infinity();
+            }
+
+            std::atomic<size_t> numUpdated{0};
+
+            //  E-Step : reassign the kmer group counts proportionally to each transcript
+            tbb::parallel_for(BlockedIndexRange(size_t(0), numKmers),
+                 // for each kmer group
+                 [&meansOld, &meansNew, &logRho, &posteriorAlphas, &numUpdated, DirichletPriorAlpha, this](const BlockedIndexRange& range) -> void {
+                     for (auto kid : boost::irange(range.begin(), range.end())) {
+                         auto kmer = kid;
+                         // for each transcript containing this kmer group
+                         auto& transcripts = this->transcriptsForKmer_[kmer];
+
+                         double totalMass = 0.0;
+                         for ( auto tid : transcripts ) {
+                             auto el = this->transcripts_[tid].effectiveLength;
+                             double rho = (el > 0) ? std::exp(logRho[tid]) : 0.0;
+                             totalMass += rho;
+                         }
+
+                         double norm = (totalMass > 0.0) ? (1.0 / totalMass) : 0.0;
+                         for ( auto tid : transcripts ) {
+                             auto& trans = this->transcripts_[tid];
+                             auto lastIndex = trans.binMers.size()  - 1;
+                             auto el = trans.effectiveLength;
+                             double rho = (el > 0) ? std::exp(logRho[tid]) : 0.0;
+                             trans.binMers[kmer] = rho * norm *
+                                 kmerGroupBiases_[kmer] * this->kmerGroupCounts_[kmer];
+
+                             // If we've seen all of the k-mers that appear in this transcript,
+                             // then we can compute it's new estimated abundance
+                             if (trans.updated++ == lastIndex) {
+                                 ++numUpdated;
+
+                                 // We're folding the length term into the binMer weights now
+                                 // should this computeSum be a computeMean?
+                                 //trans.mean = this->_computeSum(trans);//this->_computeMean(trans);
+                                 auto sumWeightedReadMass = this->_computeSum(trans);
+
+                                 // with filter
+                                 // posteriorAlphas[tid] = DirichletPriorAlpha + sumWeightedReadMass;
+                                 posteriorAlphas[tid] = (posteriorAlphas[tid] > 0.0) ? DirichletPriorAlpha + sumWeightedReadMass : 0.0;
+
+                                 // without filter
+                                 //posteriorAlphas[tid] = DirichletPriorAlpha + trans.mean;
+                                 if (std::isnan(posteriorAlphas[tid])) {
+                                     std::cerr << "posterior is nan at " << tid << "\n";
+                                     std::cerr << "mean: " << trans.mean << "\n";
+                                     std::cerr << "norm: " << norm << "\n";
+                                 }
+                                 trans.updated.store(0);
+                             }
+                         }
+                     } // for kid in range
+           });
+
+            std::cerr << clearline << "numUpdated = " << numUpdated << " : numTranscripts = " << numTranscripts << "\n";
+            jumpBack += "\x1b[A";
+
+            auto posteriorAlphaSum = psum_(posteriorAlphas);
+            std::cerr << clearline << "posterior alpha sum: " << posteriorAlphaSum << "\n";
+            jumpBack += "\x1b[A";
+            for (size_t i : boost::irange({0}, numTranscripts)) {
+                meansNew[i] = posteriorAlphas[i] / posteriorAlphaSum;
+                transcripts_[i].mean = meansNew[i];
+            }
+
+            // Check for data-driven convergence criteria
+            if (hasConverged(meansOld, meansNew)) {
+              std::cerr << clearline << "convergence criteria met; terminating VB\n";
+              break;
+            }
+            jumpBack += "\x1b[A";
+
+            std::swap(meansNew, meansOld);
+
+            bool doFilter{false};
+            double itFrac = currIt / static_cast<double>(numIt);
+            double cutoff = std::min(1.0, std::exp(itFrac - 0.3));
+            if (doFilter and currIt > 0 and currIt % 100 == 0) {
+                // cutoff =  e^{x^2 - 0.5}
+                filterByCoverage_(cutoff, posteriorAlphas);
+            }
+
+            std::cerr << clearline << "cutoff = " << cutoff << "\n";
+            jumpBack += "\x1b[A";
+            std::cerr << jumpBack;
+        }
+
+        // Compute the confidence intervals for the expression values
+        // The marginal for each transcript fraction takes the form of a Beta distribution
+        // we compute the confidence intervals by the appropriate quantiles of the distribution
+        double sumOfAlphas = psum_(posteriorAlphas);
+        tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcriptGeneMap_.numTranscripts())),
+             [this, &posteriorAlphas, sumOfAlphas](const BlockedIndexRange& range) -> void {
+                    for (auto tid = range.begin(); tid != range.end(); ++tid) {
+                        auto& transcriptData = this->transcripts_[tid];
+                        auto alphaT = posteriorAlphas[tid];
+                        if (alphaT > 0) {
+                            boost::math::beta_distribution<double> marginalDist(alphaT, sumOfAlphas - alphaT);
+                            transcriptData.mean = boost::math::mean(marginalDist);
+                            transcriptData.fracLow = boost::math::quantile(marginalDist, 0.025);
+                            transcriptData.fracHigh = boost::math::quantile(marginalDist, 0.975);
+                        } else {
+                            transcriptData.mean = transcriptData.fracLow = transcriptData.fracHigh = 0.0;
+                        }
+                    }
+        });
+
+    }
+
+};
+
+#endif // ITERATIVE_OPTIMIZER_HPP
diff --git a/include/CommonTypes.hpp b/include/CommonTypes.hpp
new file mode 100644
index 0000000..04eb25a
--- /dev/null
+++ b/include/CommonTypes.hpp
@@ -0,0 +1,48 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Sailfish.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+# ifndef COMMON_TYPES_HPP
+# define COMMON_TYPES_HPP
+
+#include <cstdint>
+#include <array>
+
+namespace Sailfish {
+    /*
+	struct Kmer {
+		uint64_t kmer;
+	};
+
+	struct Index {
+		uint64_t index;
+	};
+    */
+    struct TranscriptFeatures{
+        std::string name;
+        size_t length;
+        double gcContent;
+        std::array<uint64_t, 16> diNucleotides;
+    };
+}
+
+
+# endif // COMMON_TYPES_HPP
diff --git a/include/EquivalenceClassBuilder.hpp b/include/EquivalenceClassBuilder.hpp
new file mode 100644
index 0000000..f91b836
--- /dev/null
+++ b/include/EquivalenceClassBuilder.hpp
@@ -0,0 +1,143 @@
+#ifndef EQUIVALENCE_CLASS_BUILDER_HPP
+#define EQUIVALENCE_CLASS_BUILDER_HPP
+
+#include <unordered_map>
+#include <vector>
+#include <thread>
+#include <memory>
+#include <mutex>
+
+// Logger includes
+#include "spdlog/spdlog.h"
+
+#include "cuckoohash_map.hh"
+#include "concurrentqueue.h"
+#include "TranscriptGroup.hpp"
+
+
+struct TGValue {
+    TGValue(const TGValue& o) {
+        weights = o.weights;
+        count.store(o.count.load());
+    }
+
+    TGValue(std::vector<double>& weightIn, uint64_t countIn) :
+        weights(weightIn) { count.store(countIn); }
+
+    // const is a lie
+    void normalizeAux() const {
+        double sumOfAux{0.0};
+        for (size_t i = 0; i < weights.size(); ++i) {
+            sumOfAux += weights[i];
+        }
+        double norm = 1.0 / sumOfAux;
+        for (size_t i = 0; i < weights.size(); ++i) {
+            weights[i] *= norm;
+        }
+        /* LOG SPACE
+        double sumOfAux = salmon::math::LOG_0;
+        for (size_t i = 0; i < weights.size(); ++i) {
+            sumOfAux = salmon::math::logAdd(sumOfAux, weights[i]);
+        }
+        for (size_t i = 0; i < weights.size(); ++i) {
+            weights[i] = std::exp(weights[i] - sumOfAux);
+        }
+        */
+    }
+
+    // forget synchronizing this for the time being
+    mutable std::vector<double> weights;
+    std::atomic<uint64_t> count{0};
+};
+
+class EquivalenceClassBuilder {
+    public:
+        EquivalenceClassBuilder(std::shared_ptr<spdlog::logger> loggerIn) :
+		logger_(loggerIn) {
+            countMap_.reserve(1000000);
+        }
+
+        ~EquivalenceClassBuilder() {}
+
+        void start() { active_ = true; }
+
+        bool finish() {
+            active_ = false;
+            for (auto kv = countMap_.begin(); !kv.is_end(); ++kv) {
+                kv->second.normalizeAux();
+                countVec_.push_back(*kv);
+            }
+
+    	    logger_->info("Computed {} rich equivalence classes "
+			  "for further processing", countVec_.size());
+            return true;
+        }
+
+        inline void addGroup(TranscriptGroup&& g,
+                             std::vector<double>& weights) {
+
+            auto upfn = [&weights](TGValue& x) -> TGValue& {
+                // update the count
+                x.count++;
+                // update the weights
+                for (size_t i = 0; i < x.weights.size(); ++i) {
+                    // Possibly atomicized in the future
+                    weights[i] += x.weights[i];
+                    /* LOG SPACE
+                    x.weights[i] =
+                        salmon::math::logAdd(x.weights[i], weights[i]);
+                    */
+                }
+                return x;
+            };
+            TGValue v(weights, 1);
+            countMap_.upsert(g, upfn, v);
+        }
+
+        std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec() {
+            return countVec_;
+        }
+
+    private:
+        std::atomic<bool> active_;
+	    cuckoohash_map<TranscriptGroup, TGValue, TranscriptGroupHasher> countMap_;
+        std::vector<std::pair<const TranscriptGroup, TGValue>> countVec_;
+    	std::shared_ptr<spdlog::logger> logger_;
+};
+
+#endif // EQUIVALENCE_CLASS_BUILDER_HPP
+
+/** Unordered map implementation */
+//std::unordered_map<TranscriptGroup, TGValue, TranscriptGroupHasher> countMap_;
+//std::mutex mapMut_;
+/*
+bool finish() {
+    // unordered_map implementation
+    for (auto& kv : countMap_) {
+        kv.second.normalizeAux();
+        countVec_.push_back(kv);
+    }
+    return true;
+}
+*/
+
+/*
+inline void addGroup(TranscriptGroup&& g,
+        std::vector<double>& weights) {
+
+    // unordered_map implementation
+    std::lock_guard<std::mutex> lock(mapMut_);
+    auto it = countMap_.find(g);
+    if (it == countMap_.end()) {
+        TGValue v(weights, 1);
+        countMap_.emplace(g, v);
+    } else {
+        auto& x = it->second;
+        x.count++;
+        for (size_t i = 0; i < x.weights.size(); ++i) {
+            x.weights[i] =
+                salmon::math::logAdd(x.weights[i], weights[i]);
+        }
+    }
+}
+*/
diff --git a/include/ErrorModel.hpp b/include/ErrorModel.hpp
new file mode 100644
index 0000000..1ec0635
--- /dev/null
+++ b/include/ErrorModel.hpp
@@ -0,0 +1,74 @@
+#ifndef ERROR_MODEL
+#define ERROR_MODEL
+
+#include <mutex>
+#include <atomic>
+#include "tbb/concurrent_vector.h"
+#include "AtomicMatrix.hpp"
+
+extern "C" {
+#include "io_lib/scram.h"
+#include "io_lib/os.h"
+#undef max
+#undef min
+}
+
+struct UnpairedRead;
+struct ReadPair;
+class Transcript;
+
+class ErrorModel {
+public:
+    ErrorModel(double alpha, uint32_t maxExpectedLen);
+    bool burnedIn();
+    void burnedIn(bool burnedIn);
+
+    /**
+    *  For unpaired reads, update the error model to account
+    *  for errors we've observed in this read pair.
+    */
+    void update(const UnpairedRead&, Transcript& ref, double p, double mass);
+
+    /**
+      * Compute the log-likelihood of the observed unpaired alignment given the
+      * current error model.
+      */
+    double logLikelihood(const UnpairedRead&, Transcript& ref);
+
+    /**
+    *  For paired-end reads, update the error model to account
+    *  for errors we've observed in this read pair.
+    */
+    void update(const ReadPair&, Transcript& ref, double p, double mass);
+
+    /**
+     * Compute the log-likelihood of the observed paire-end alignment given the
+     * current error model.
+     */
+    double logLikelihood(const ReadPair&, Transcript& ref);
+
+private:
+    /**
+     * These functions, which work directly on bam_seq_t* types, drive the
+     * update() and logLikelihood() methods above.
+     */
+    void update(bam_seq_t* read, Transcript& ref, double p, double mass, std::vector<AtomicMatrix<double>>& mismatchProfile);
+    double logLikelihood(bam_seq_t* read, Transcript& ref, std::vector<AtomicMatrix<double>>& mismatchProfile);
+
+    // NOTE: Do these need to be concurrent_vectors as before?
+    // Store the mismatch probability tables for the left and right reads
+    std::vector<AtomicMatrix<double>> mismatchLeft_;
+    std::vector<AtomicMatrix<double>> mismatchRight_;
+    //tbb::concurrent_vector<AtomicMatrix<double>> mismatchLeft_;
+    //tbb::concurrent_vector<AtomicMatrix<double>> mismatchRight_;
+
+    bool isEnabled_;
+    size_t maxExpectedLen_;
+    size_t maxLen_;
+    std::atomic<bool> burnedIn_;
+    // Maintain a mutex in case the error model wants to talk to the
+    // console / log.
+    std::mutex outputMutex_;
+};
+
+#endif // ERROR_MODEL
diff --git a/include/FASTAParser.hpp b/include/FASTAParser.hpp
new file mode 100644
index 0000000..8626552
--- /dev/null
+++ b/include/FASTAParser.hpp
@@ -0,0 +1,16 @@
+#ifndef FASTA_PARSER
+#define FASTA_PARSER
+
+#include <vector>
+
+class Transcript;
+
+class FASTAParser {
+public:
+    FASTAParser(const std::string& fname);
+    void populateTargets(std::vector<Transcript>& transcripts);
+private:
+    std::string fname_;
+};
+
+#endif // FASTA_PARSER 
diff --git a/include/ForgettingMassCalculator.hpp b/include/ForgettingMassCalculator.hpp
new file mode 100644
index 0000000..e6a90f7
--- /dev/null
+++ b/include/ForgettingMassCalculator.hpp
@@ -0,0 +1,138 @@
+#ifndef __FORGETTING_MASS_CALCULATOR__
+#define __FORGETTING_MASS_CALCULATOR__
+
+#include "SpinLock.hpp"
+#include "SalmonMath.hpp"
+#include "spdlog/spdlog.h"
+
+class ForgettingMassCalculator {
+    public:
+    ForgettingMassCalculator(double forgettingFactor = 0.65) :
+        batchNum_(0), forgettingFactor_(forgettingFactor),
+        logForgettingMass_(salmon::math::LOG_1),
+        logForgettingMasses_({}),
+        cumulativeLogForgettingMasses_({}){}
+
+    ForgettingMassCalculator(const ForgettingMassCalculator&) = delete;
+    ForgettingMassCalculator(ForgettingMassCalculator&&) = default;
+    ForgettingMassCalculator& operator=(ForgettingMassCalculator&&) = default;
+    ForgettingMassCalculator& operator=(const ForgettingMassCalculator&) = delete;
+
+    /** Precompute the log(forgetting mass) and cumulative log(forgetting mass)
+      * for the first numMiniBatches batches / timesteps.
+      */
+    bool prefill(uint64_t numMiniBatches) {
+        logForgettingMasses_.reserve(numMiniBatches);
+        cumulativeLogForgettingMasses_.reserve(numMiniBatches);
+
+        double fm = salmon::math::LOG_1;
+        logForgettingMasses_.push_back(fm);
+        cumulativeLogForgettingMasses_.push_back(fm);
+
+        for (size_t i = 2; i < numMiniBatches-1; ++i) {
+             fm += forgettingFactor_ * std::log(static_cast<double>(i-1)) -
+                std::log(std::pow(static_cast<double>(i), forgettingFactor_) - 1);
+            logForgettingMasses_.push_back(fm);
+            // fill in cumulative mass
+            cumulativeLogForgettingMasses_.push_back(
+                    salmon::math::logAdd(cumulativeLogForgettingMasses_.back(), fm));
+        }
+        return true;
+    }
+
+    double operator()() {
+#if defined __APPLE__
+        spin_lock::scoped_lock sl(ffMutex_);
+#else
+        std::lock_guard<std::mutex> lock(ffMutex_);
+#endif
+        ++batchNum_;
+        if (batchNum_ > 1) {
+            logForgettingMass_ += forgettingFactor_ * std::log(static_cast<double>(batchNum_-1)) -
+                std::log(std::pow(static_cast<double>(batchNum_), forgettingFactor_) - 1);
+        }
+        return logForgettingMass_;
+    }
+
+
+    /**
+      *  Return the log(forgetting mass) and current timestep in the ouput
+      *  variables logForgettingMass and currentMinibatchTimestep. If we
+      *  haven't pre-computed the forgetting mass for the next timestep yet,
+      *  then do it now.
+      */
+    void getLogMassAndTimestep(double& logForgettingMass, uint64_t& currentMinibatchTimestep) {
+#if defined __APPLE__
+        spin_lock::scoped_lock sl(ffMutex_);
+#else
+        std::lock_guard<std::mutex> lock(ffMutex_);
+#endif
+        if (batchNum_ < logForgettingMasses_.size()) {
+            currentMinibatchTimestep = batchNum_;
+            logForgettingMass = logForgettingMasses_[batchNum_];
+            ++batchNum_;
+        } else {
+            double fm = logForgettingMasses_.back() + forgettingFactor_ *
+                    std::log(static_cast<double>(batchNum_-1)) -
+                    std::log(std::pow(static_cast<double>(batchNum_), forgettingFactor_) - 1);
+
+            logForgettingMasses_.push_back(fm);
+            cumulativeLogForgettingMasses_.push_back(
+                    salmon::math::logAdd(cumulativeLogForgettingMasses_.back(), fm));
+
+            currentMinibatchTimestep = batchNum_;
+            logForgettingMass = logForgettingMasses_[batchNum_];
+            ++batchNum_;
+        }
+    }
+
+    // Retrieve the log(forgetting mass) at a particular timestep.  This
+    // function assumes that the forgetting mass has already been computed
+    // for this timestep --- otherwise, this will result in a fatal error.
+    double logMassAt(uint64_t timestep) {
+        if (timestep < logForgettingMasses_.size()) {
+            return logForgettingMasses_[timestep];
+        } else {
+            spdlog::get("jointLog")->error("Requested forgetting mass for timestep {} "
+                                      "where it has not yet been computed.  This "
+                                      "likely means that the ForgettingMassCalculator "
+                                      "class was being used incorrectly!  Please "
+                                      "report this crash on GitHub!\n", timestep);
+            std::exit(1);
+            return salmon::math::LOG_0;
+        }
+    }
+
+    // Retrieve the cumulative log(forgetting mass) at a particular timestep.
+    // This function assumes that the forgetting mass has already been computed
+    // for this timestep --- otherwise, this will result in a fatal error.
+    double cumulativeLogMassAt(uint64_t timestep) {
+        if (timestep < cumulativeLogForgettingMasses_.size()) {
+            return cumulativeLogForgettingMasses_[timestep];
+        } else {
+            spdlog::get("jointLog")->error("Requested cumulative forgetting mass for timestep {} "
+                                      "where it has not yet been computed.  This "
+                                      "likely means that the ForgettingMassCalculator "
+                                      "class was being used incorrectly!  Please "
+                                      "report this crash on GitHub!\n", timestep);
+            std::exit(1);
+            return salmon::math::LOG_0;
+        }
+    }
+
+    uint64_t getCurrentTimestep() { return batchNum_; }
+
+    private:
+        uint64_t batchNum_;
+        double forgettingFactor_;
+        double logForgettingMass_;
+        std::vector<double> logForgettingMasses_;
+        std::vector<double> cumulativeLogForgettingMasses_;
+#if defined __APPLE__
+        spin_lock ffMutex_;
+#else
+        std::mutex ffMutex_;
+#endif
+};
+
+#endif //__FORGETTING_MASS_CALCULATOR__
diff --git a/include/FragmentLengthDistribution.hpp b/include/FragmentLengthDistribution.hpp
new file mode 100644
index 0000000..ee73aed
--- /dev/null
+++ b/include/FragmentLengthDistribution.hpp
@@ -0,0 +1,153 @@
+/**
+ *  lengthdistribution.h
+ *  express
+ *
+ *  Created by Adam Roberts on 1/30/13.
+ *  Copyright 2013 Adam Roberts. All rights reserved.
+ */
+// Modifications by Rob Patro; 2014
+
+#ifndef FRAGMENT_LENGTH_DISTRIBUTION
+#define FRAGMENT_LENGTH_DISTRIBUTION
+
+#include "tbb/atomic.h"
+#include <atomic>
+#include <vector>
+#include <string>
+#include <mutex>
+
+/**
+ * The LengthDistribution class keeps track of the observed length distribution.
+ * It is initialized with a Gaussian prior with parameters specified by the
+ * arguments to the constructor. An argument-specified binomial kernel is then
+ * added for each observation. All mass values and probabilities are stored and
+ * returned in log space (except in to_string).
+ */
+class FragmentLengthDistribution {
+  /**
+   * A private vector that stores the (logged) kernel values.
+   **/
+  std::vector<double> kernel_;
+  /**
+   * A private vector that stores the observed (logged) mass for each length.
+   */
+    std::vector<tbb::atomic<double>> hist_;
+
+   /**
+   * A private vector that stores the observed (logged) mass for each length.
+   */
+    std::vector<double> cachedCMF_;
+    volatile bool haveCachedCMF_;
+    std::mutex fldMut_;
+
+  /**
+   * A private double that stores the total observed (logged) mass.
+   */
+    tbb::atomic<double> totMass_;
+  /**
+   * A private double that stores the (logged) sum of the product of observed
+   * lengths and masses for quick mean calculations.
+   */
+    tbb::atomic<double> sum_;
+  /**
+   * A private int that stores the minimum observed length.
+   */
+    std::atomic<size_t> min_;
+  /**
+   * A size for internal binning of the lengths in the distribution.
+   */
+  size_t binSize_;
+
+public:
+  /**
+   * LengthDistribution Constructor.
+   * @param alpha double that sets the average pseudo-counts (logged).
+   * @param max_val an integer that sets the maximum allowable length.
+   * @param prior_mu a size_t for the mean of the prior gaussian distribution.
+            If 0, a uniform distribution is used instead.
+   * @param prior_sigma a size_t for the standard deviation of the prior
+   *        gaussian distribution.
+   * @param kernel_n a size_t specifying the number of trials in the kernel
+   *        binomial distribution. Must be odd.
+   * @param kernel_p a double specifying the success probability for the kernel
+   *        binomial distribution.
+   * @param bin_size a size_t specifying the size of bins to use internally to
+   *        reduce the number of parameters in the distribution.
+   */
+  FragmentLengthDistribution(double alpha, size_t max_val, size_t prior_mu,
+                             size_t prior_sigma, size_t kernel_n, double kernel_p,
+                             size_t bin_size = 1);
+  /**
+   * An accessor for the maximum allowed length.
+   * @return Max allowed length.
+   */
+  size_t maxVal() const;
+  /**
+   * An accessor for the minimum observed length (1 initially).
+   * @return Minimum observed length.
+   */
+  size_t minVal() const;
+  /**
+   * An accessor for the mean length in the distribution.
+   * @return Mean observed length.
+   */
+  double mean() const;
+  /**
+   * A member function that updates the distribution based on a new length
+   * observation.
+   * @param len an integer for the observed length.
+   * @param mass a double for the mass (logged) to add.
+   */
+  void addVal(size_t len, double mass);
+  /**
+   * An accessor for the (logged) probability of a given length.
+   * @param len an integer for the length to return the probability of.
+   * @return (logged) probability of observing the given length.
+   */
+  double pmf(size_t len) const;
+  /**
+   * A member function that returns a (logged) cumulative mass for a given
+   * length.
+   * @param len an integer for the length to return the cmf value of.
+   * @return (Logged) cmf value of length.
+   */
+  double cmf(size_t len) const;
+
+  /**
+    * A member function that caches the cumulative mass function into
+    * a vector.  This should be called (once), when the fld will no
+    * longer be updated.  It will make future calls to cmf(len)
+    * much faster.
+    */
+  void cacheCMF();
+
+  /**
+   * A member function that returns a vector containing the (logged) cumulative
+   * mass function *for the bins*.
+   * @return (Logged) cmf of bins.
+   */
+  std::vector<double> cmf() const;
+  /**
+   * An accessor for the (logged) observation mass (including pseudo-counts).
+   * @return Total observation mass.
+   */
+  double totMass() const;
+  /**
+   * A member function that returns a string containing the current
+   * distribution.
+   * @return Space-separated string of probabilities ordered from length 0 to
+   *         max_val (non-logged).
+   */
+   std::string toString() const;
+  //std::string to_string() const;
+  /**
+   * A member function that appends the LengthDistribution parameters to the end
+   * of the given file.
+   * @param outfile the file to append to.
+   * @param length_type a string specifying the type of length the distribution
+   *        is of (ie. "Fragment" or "Target") to be included in the header.
+   */
+  //void append_output(std::ofstream& outfile, std::string length_type) const;
+};
+
+#endif
diff --git a/include/FragmentList.hpp b/include/FragmentList.hpp
new file mode 100644
index 0000000..8452d61
--- /dev/null
+++ b/include/FragmentList.hpp
@@ -0,0 +1,42 @@
+#ifndef FRAGMENT_LIST_HPP
+#define FRAGMENT_LIST_HPP
+
+
+extern "C" {
+#include "container.h"
+}
+
+class FragmentList {
+    public:
+        FragmentList();
+
+         // Free all of the structures allocated for this fragment list
+        ~FragmentList();
+
+        void freeFragList(Container* frags);
+
+        bool computeBestChain_(Container* frags, double& maxScore, uint32_t& bestPos);
+
+        void computeBestChain();
+
+       void addFragMatch(uint32_t refStart, uint32_t queryStart, uint32_t len);
+
+        void addFragMatch(uint32_t refStart, uint32_t refEnd,
+                         uint32_t queryStart, uint32_t queryEnd);
+
+        void addFragMatchRC(uint32_t refStart, uint32_t queryStart, uint32_t len);
+
+        void addFragMatchRC(uint32_t refStart, uint32_t refEnd,
+                         uint32_t queryStart, uint32_t queryEnd);
+
+        bool isForward();
+
+        Container* fragments;
+        Container* fragmentsRC;
+        size_t numFrag;
+        double bestHitScore;
+        uint32_t bestHitPos;
+        bool isForward_{true};
+};
+
+#endif // FRAGMENT_LIST_HPP
diff --git a/include/FragmentStartPositionDistribution.hpp b/include/FragmentStartPositionDistribution.hpp
new file mode 100644
index 0000000..753c60c
--- /dev/null
+++ b/include/FragmentStartPositionDistribution.hpp
@@ -0,0 +1,120 @@
+/**
+ * Model the bias in read start positions across a set of transcripts.
+ * This class is similar to and inspired by the FragmentLengthDistribution
+ * class, which was itself modified from lengthdistribution.h ---
+ * originally written by Adam Roberts as part of the eXpress software.
+ * updated by Rob Patro; 2014, 2015
+ */
+
+#ifndef FRAGMENT_START_POSITION_DISTRIBUTION
+#define FRAGMENT_START_POSITION_DISTRIBUTION
+
+#include "tbb/atomic.h"
+#include <atomic>
+#include <vector>
+#include <string>
+#include <mutex>
+
+/**
+ * The FragmentStartPositionDistribution class keeps track of the observed fragment
+ * start position distribution. It is initialized with uniform prior with
+ * with parameters specified by the arguments to the constructor. All
+ * mass values and probabilities are stored and returned in log space (except
+ * in to_string).
+ */
+class FragmentStartPositionDistribution {
+  /**
+   * A private vector that stores the observed (logged) mass for each length.
+   */
+  std::vector<tbb::atomic<double>> pmf_;
+  std::vector<tbb::atomic<double>> cmf_;
+  /**
+   * A private double that stores the total observed (logged) mass.
+   */
+  tbb::atomic<double> totMass_;
+  /**
+   * A private double that stores the (logged) sum of the product of observed
+   * lengths and masses for quick mean calculations.
+   */
+   tbb::atomic<double> sum_;
+  /**
+   * The number of bins we consider within each transcript.
+   */
+  size_t numBins_;
+
+  // Mutex for this distribution
+  std::mutex fspdMut_;
+  bool isUpdated_;
+
+public:
+  /**
+   * FragmentStartPositionDistribution constructor:
+   * @param numBins The number of bins to consider for
+   *                each transcript.
+   *
+   */
+  FragmentStartPositionDistribution(uint32_t numBins=20);
+
+   /**
+   * A member function that updates the distribution based on a new length
+   * observation.
+   * @param len an integer for the observed length.
+   * @param mass a double for the mass (logged) to add.
+   */
+  void addVal(int32_t hitPos, uint32_t txpLen, double mass);
+  /**
+   * A member function that returns the probability that a hit
+   * starts at the specified position within the given transcript length.
+   * @param hitPos The position where the fragment begins
+   * @param txpLen The length of the transcript
+   */
+  double operator()(int32_t hitPos, uint32_t txpLen, double effLen);
+  // Evaluate the CDF between two points
+  double evalCDF(int32_t hitPos, uint32_t txpLen);
+  // Update the distribution (compute the CDF) and
+  // set isUpdated_;
+  void update();
+
+  /**
+   * An accessor for the (logged) probability of a given length.
+   * @param len an integer for the length to return the probability of.
+   * @return (logged) probability of observing the given length.
+   */
+  //double pmf(size_t len) const;
+  /**
+   * A member function that returns a (logged) cumulative mass for a given
+   * length.
+   * @param len an integer for the length to return the cmf value of.
+   * @return (Logged) cmf value of length.
+   */
+  //double cmf(size_t len) const;
+  /**
+   * A member function that returns a vector containing the (logged) cumulative
+   * mass function *for the bins*.
+   * @return (Logged) cmf of bins.
+   */
+  //std::vector<double> cmf() const;
+  /**
+   * An accessor for the (logged) observation mass (including pseudo-counts).
+   * @return Total observation mass.
+   */
+  double totMass() const;
+  /**
+   * A member function that returns a string containing the current
+   * distribution.
+   * @return Space-separated string of probabilities ordered from length 0 to
+   *         max_val (non-logged).
+   */
+   std::string toString() const;
+  //std::string to_string() const;
+  /**
+   * A member function that appends the LengthDistribution parameters to the end
+   * of the given file.
+   * @param outfile the file to append to.
+   * @param length_type a string specifying the type of length the distribution
+   *        is of (ie. "Fragment" or "Target") to be included in the header.
+   */
+  //void append_output(std::ofstream& outfile, std::string length_type) const;
+};
+
+#endif // FRAGMENT_START_POSITION_DISTRIBUTION
diff --git a/include/GenomicFeature.hpp b/include/GenomicFeature.hpp
new file mode 100644
index 0000000..5bfe844
--- /dev/null
+++ b/include/GenomicFeature.hpp
@@ -0,0 +1,215 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Sailfish.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#ifndef GENOMIC_FEATURE_HPP
+#define GENOMIC_FEATURE_HPP
+
+#include <boost/thread/thread.hpp>
+
+#include <atomic>
+#include <thread>
+#include <fstream>
+#include <boost/tokenizer.hpp>
+
+#include "tbb/concurrent_queue.h"
+
+struct TranscriptGeneID {
+  std::string transcript_id;
+  std::string gene_id;
+  // std::unordered_map<std::string, std::string> dynamic;
+  bool parseAttribute( std::string& key, std::string& val ) {
+    if ( key == "transcript_id" ) { transcript_id = val; return true; }
+    if ( key == "gene_id" ) { gene_id = val; return true; }
+    // dynamic[key] = val;
+    return false;
+  }
+
+};
+
+std::ostream& operator<< ( std::ostream& out, const TranscriptGeneID& ids );
+
+template< typename StaticAttributes >
+class GenomicFeature {
+public:
+   std::string seqID;
+   std::string source;
+   std::string type;
+   int start, end;
+   float score;
+   char strand;
+   char phase;
+   StaticAttributes sattr;
+   template <typename T>
+   friend std::ostream& operator<< ( std::ostream& out, const GenomicFeature<T>& gf );
+   template <typename T>
+   friend std::istream& operator>> ( std::istream& in, const GenomicFeature<T>& gf );
+};
+
+template< typename StaticAttributes>
+std::ostream& operator<< ( std::ostream& out, const GenomicFeature<StaticAttributes>& gf );
+
+template< typename StaticAttributes >
+std::istream& operator>> ( std::istream& in, GenomicFeature<StaticAttributes>& gf );
+
+namespace GTFParser {
+
+  template< typename CustomGenomicFeature >
+  void genomicFeatureFromLine( std::string& l, CustomGenomicFeature& gf ) {
+    
+    size_t head = 0;
+    size_t tail = l.find_first_of('\t');
+    
+    gf.seqID = l.substr(head,tail);
+    head = tail+1;
+    tail = l.find_first_of('\t', head);
+
+    gf.source = l.substr(head, tail-head);
+    head = tail+1;
+    tail = l.find_first_of('\t', head);
+  
+    gf.type = l.substr(head, tail-head);
+    head = tail+1;
+    tail = l.find_first_of('\t', head);
+  
+    gf.start = atoi(l.substr(head, tail-head).c_str());
+    head = tail+1;
+    tail = l.find_first_of('\t', head);
+
+    gf.end = atoi(l.substr(head, tail-head).c_str());
+    head = tail+1;
+    tail = l.find_first_of('\t', head);
+
+    gf.score = atoi(l.substr(head, tail-head).c_str());
+    head = tail+1;
+    tail = l.find_first_of('\t', head);
+
+    gf.strand = l.substr(tail, tail-head)[0];
+    head = tail+1;
+    tail = l.find_first_of('\t', head);
+
+    gf.phase = l.substr(tail, tail-head)[0];
+    head = tail+1;
+    tail = l.find_first_of('\n', head);
+  
+    auto line = l.substr(head, tail-head);
+
+    using tokenizer = boost::tokenizer<boost::char_separator<char>>;
+    boost::char_separator<char> sep(";");  
+    tokenizer tokens(line, sep);
+
+    for ( auto tokIt : tokens ) {       
+    // Currently, we'll handle the following separators
+    // '\s+'
+    // '\s*=\s*'
+      tokIt = tokIt.substr(tokIt.find_first_not_of(' '));
+      auto kvsepStart = tokIt.find('=');
+    
+    // If we reached the end of the key, value token, then the string must have been
+    // separated by some set of spaces, and NO '='.  If this is the case, find the 'spaces' so that
+    // we can split on it.
+      if ( kvsepStart == tokIt.npos ) {
+        kvsepStart = tokIt.find(' ');
+      } 
+    
+      auto key = tokIt.substr(0, kvsepStart);
+      key = key.substr(0, key.find(' '));
+
+      auto kvsepStop = 1 + kvsepStart + tokIt.substr(kvsepStart+1).find_first_not_of(' ');
+      auto val = (tokIt[kvsepStop] == '"') ? tokIt.substr(kvsepStop+1, (tokIt.length() - (kvsepStop+2))) : tokIt.substr(kvsepStop, (tokIt.length() - (kvsepStop+1)));
+      gf.sattr.parseAttribute( key, val );
+    }
+
+  }
+
+  template <typename StaticAttributes>
+  std::vector<GenomicFeature<StaticAttributes>> readGTFFile( const std::string& fname ) {
+
+    using StringPtr = std::string*;
+    std::vector<GenomicFeature<StaticAttributes>> feats;
+
+    std::ifstream ifile(fname);
+    bool done = false;
+    std::vector<std::thread> threads;
+
+    tbb::concurrent_queue<StringPtr> queue;
+    //boost::lockfree::queue<StringPtr> queue(5000);
+    threads.push_back(
+    std::thread([&ifile, &queue, &done]() {
+      StringPtr line = new std::string();
+      while( !std::getline(ifile, *line).eof() ) {
+        StringPtr ownedLine = line;  
+        queue.push(ownedLine);
+        // for boost lockfree
+        // while( !queue.push(ownedLine) ) {}
+        line = new std::string();
+      }
+      done = true;
+    })
+    );
+
+    size_t nreader=10;
+    std::atomic<size_t> tctr(nreader);
+    tbb::concurrent_queue<GenomicFeature<StaticAttributes>*> outQueue;
+    // boost::lockfree::queue<GenomicFeature<StaticAttributes>*> outQueue(5000);
+    
+    for( size_t i = 0; i < nreader; ++i ) {
+      threads.push_back(
+      std::thread([&queue, &outQueue, &done, &tctr]() -> void {
+
+        StringPtr l = nullptr;
+        while( !done or queue.try_pop(l) ) {
+          if ( l != nullptr ) {
+            auto gf = new GenomicFeature<StaticAttributes>();
+            genomicFeatureFromLine(*l, *gf);
+            outQueue.push(gf);
+            // for boost lockfree
+            // while( !outQueue.push(gf) ) {}
+            delete l;
+            l = nullptr;
+          }
+        }
+        --tctr;
+      }
+      ));
+    }
+    
+    threads.push_back(
+      std::thread([&outQueue, &feats, &tctr]() -> void {
+        GenomicFeature<StaticAttributes>* f = nullptr;
+        while( outQueue.try_pop(f) or tctr > 0 ) {
+          if ( f != nullptr ) {
+            feats.push_back(*f);
+          }
+        }
+      }
+      ));
+
+    // Wait for all of the threads to finish
+    for ( auto& thread : threads ){ thread.join(); }
+
+
+    ifile.close();
+    return feats;
+  }
+}
+
+#endif // GENOMIC_FEATURE_HPP
diff --git a/include/HeptamerIndex.hpp b/include/HeptamerIndex.hpp
new file mode 100644
index 0000000..c1c826b
--- /dev/null
+++ b/include/HeptamerIndex.hpp
@@ -0,0 +1,41 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Sailfish.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#ifndef __HEPTAMER_INDEX_HPP__
+#define __HEPTAMER_INDEX_HPP__
+
+#include <atomic>
+#include <vector>
+#include <array>
+
+class HeptamerIndex {
+  using AtomicCount = std::atomic<uint64_t>;
+public:
+  explicit HeptamerIndex();
+  std::size_t index(uint64_t heptamer);
+private:
+  constexpr static uint32_t PossibleHeptamers = 16384;
+  const std::array<uint32_t, 7> mult_{{1,4,16,64,256,1024,4096}};
+  std::vector<AtomicCount> heptamers_;
+};
+
+#endif // __HEPTAMER_INDEX_HPP__
diff --git a/include/IOUtils.hpp b/include/IOUtils.hpp
new file mode 100644
index 0000000..30a415a
--- /dev/null
+++ b/include/IOUtils.hpp
@@ -0,0 +1,15 @@
+#ifndef IO_UTILS
+#define IO_UTILS
+
+#include <cstdint>
+#include <cstddef>
+
+namespace ioutils{
+
+    const char RESET_COLOR[] = "\x1b[0;0m";
+    const char SET_GREEN[] = "\x1b[0;32m";
+    const char SET_RED[] = "\x1b[0;31m";
+
+}
+
+#endif // IO_UTILS
diff --git a/include/LibraryFormat.hpp b/include/LibraryFormat.hpp
new file mode 100644
index 0000000..24bc1e9
--- /dev/null
+++ b/include/LibraryFormat.hpp
@@ -0,0 +1,101 @@
+#ifndef LIBRARY_FORMAT_HPP
+#define LIBRARY_FORMAT_HPP
+
+#include <ostream>
+#include <cstdint>
+
+enum class ReadType : std::uint8_t { SINGLE_END = 0, PAIRED_END = 1 };
+enum class ReadOrientation  : std::uint8_t { SAME = 0, AWAY = 1, TOWARD = 2, NONE = 3};
+enum class ReadStrandedness  : std::uint8_t { SA = 0, AS = 1, S = 2, A = 3, U = 4 };
+
+class LibraryFormat {
+public:
+    LibraryFormat(ReadType type_in, ReadOrientation orientation_in, ReadStrandedness strandedness_in);
+
+    LibraryFormat(const LibraryFormat& o) = default;
+    LibraryFormat(LibraryFormat&& o) = default;
+    LibraryFormat& operator=(LibraryFormat& o) = default;
+    LibraryFormat& operator=(LibraryFormat&& o) = default;
+
+    ReadType type;
+    ReadOrientation orientation;
+    ReadStrandedness strandedness;
+
+    /**
+     * Returns true if the specified library format is OK, false otherwise.
+     */
+    bool check();
+    friend std::ostream& operator<<(std::ostream& os, const LibraryFormat& lf);
+    // The maximum index that could be returned for a library type
+    constexpr static uint8_t maxLibTypeID() {
+        return 0 |
+            ((static_cast<uint8_t>(ReadType::PAIRED_END)) |
+             (static_cast<uint8_t>(ReadOrientation::NONE)) << 1 |
+             (static_cast<uint8_t>(ReadStrandedness::U)) << 3);
+    }
+
+    inline static LibraryFormat formatFromID(uint8_t id) {
+        ReadType rt;
+        ReadOrientation ro;
+        ReadStrandedness rs;
+
+        switch (id & 0x01) {
+            case 0:
+                rt = ReadType::SINGLE_END;
+                break;
+            case 1:
+                rt = ReadType::PAIRED_END;
+                break;
+        }
+
+        switch ((id >> 1) & 0x3) {
+            case static_cast<uint8_t>(ReadOrientation::SAME):
+                ro = ReadOrientation::SAME;
+                break;
+            case static_cast<uint8_t>(ReadOrientation::AWAY):
+                ro = ReadOrientation::AWAY;
+                break;
+            case static_cast<uint8_t>(ReadOrientation::TOWARD):
+                ro = ReadOrientation::TOWARD;
+                break;
+            case static_cast<uint8_t>(ReadOrientation::NONE):
+                ro = ReadOrientation::NONE;
+                break;
+        }
+
+        switch ((id >> 3) & 0x7) {
+            case static_cast<uint8_t>(ReadStrandedness::SA):
+                rs = ReadStrandedness::SA;
+                break;
+            case static_cast<uint8_t>(ReadStrandedness::AS):
+                rs = ReadStrandedness::AS;
+                break;
+            case static_cast<uint8_t>(ReadStrandedness::S):
+                rs = ReadStrandedness::S;
+                break;
+            case static_cast<uint8_t>(ReadStrandedness::A):
+                rs = ReadStrandedness::A;
+                break;
+            case static_cast<uint8_t>(ReadStrandedness::U):
+                rs = ReadStrandedness::U;
+                break;
+        }
+
+        return LibraryFormat(rt, ro, rs);
+    }
+
+    // Assigns a unique ID to each potential library
+    // type.  The IDs are such that 0 <= formatID(lib) < num possible formats
+    inline uint8_t formatID() const {
+        uint8_t id = 0;
+        // mask: 00000001
+        id |= (0x01 & static_cast<uint8_t>(type));
+        // mask: 00000110
+        id |= (0x3 & static_cast<uint8_t>(orientation)) << 1;
+        // mask: 00111000
+        id |= (0x7 &static_cast<uint8_t>(strandedness)) << 3;
+        return id;
+    }
+};
+
+#endif // LIBRARY_FORMAT_HPP
diff --git a/include/MatrixTools.hpp b/include/MatrixTools.hpp
new file mode 100644
index 0000000..2c36a6e
--- /dev/null
+++ b/include/MatrixTools.hpp
@@ -0,0 +1,573 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Sailfish.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#ifndef MATRIX_TOOLS_HPP
+#define MATRIX_TOOLS_HPP
+
+#include <unordered_set>
+#include <unordered_map>
+#include <map>
+#include <memory>
+
+#include "nnls.h"
+
+// taucs.h will define min/max macros if it's not already done (e.g. by Windows.h).
+#ifndef min
+    #define CGAL_TAUCS_DEFINES_MIN
+#endif
+#ifndef max
+    #define CGAL_TAUCS_DEFINES_MAX
+#endif
+
+// TAUCS is a C library
+extern "C" {
+    #include "tsnnls.h"
+}
+
+// Undefine Taucs' min/max macros to avoid an error
+// with std::min()/std::max() calls in standard C++ headers.
+#ifdef CGAL_TAUCS_DEFINES_MIN
+    #undef min
+#endif
+#ifdef CGAL_TAUCS_DEFINES_MAX
+    #undef max
+#endif
+
+
+#include <vigra/regression.hxx>
+#include <Eigen/SparseCore>
+#include <boost/dynamic_bitset.hpp>
+#include "shotgun_lasso.hpp"
+
+namespace matrix_tools {
+
+/**
+*  This function collapses the rate matrix "A" and the corresponding vector of read
+*  counts "counts" into a set of unique categories as defined in the paper
+*  (Salzman, Jiang and Wong 2011).
+
+*  Example
+*  A = [0 1 0 0 0 1 1]
+*      [0 0 1 1 0 1 1]
+*      [0 0 0 1 1 0 0]
+*  counts = [ 0 4 4 8 5 3 2 ]
+*
+*  might (b/c output column order is undefined) be collapsed to:
+*  A = [1 0 0 0 2]
+*      [0 1 1 0 2]
+*      [0 0 1 1 0]
+*  counts = [4 4 8 5 5]
+*/
+void collapseIntoCategories(
+    std::vector<std::vector<double>> &A,
+    std::vector<double> &counts,
+    std::unique_ptr<std::vector<std::vector<double>>> &collapsedA,
+    std::unique_ptr<std::vector<double>> &collapsedCounts
+) {
+
+    using boost::dynamic_bitset;
+using Count = size_t;
+using Index = size_t;
+
+    size_t numRows = A.size();
+    size_t numCols = A[0].size();
+    std::map<dynamic_bitset<>, std::vector<Index> > categoryCounts;
+
+    for ( size_t i = 0; i < numCols; ++i ) {
+        dynamic_bitset<> category(numRows);
+
+        for ( size_t j = 0; j < numRows; ++j ) {
+            category[j] = ( A[j][i] > 0.0 ) ? 1 : 0;
+        }
+        // The trivial category contains all 0s; we simply omit it
+        bool trivial = !category.any();
+        if ( !trivial ) {
+            categoryCounts[ category ].push_back(i);
+        }
+    }
+
+    size_t numCategories = categoryCounts.size();
+    std::unique_ptr<std::vector<std::vector<double>>> captr(new std::vector<std::vector<double>>(
+                numRows, std::vector<double>( numCategories, 0.0 ) ));
+    collapsedA = std::move(captr);
+
+    std::unique_ptr<std::vector<double>> ccptr( new std::vector<double>(numCategories, 0.0) );
+    collapsedCounts = std::move(ccptr);
+
+    size_t j = 0;
+    for ( auto catIt = categoryCounts.begin(); catIt != categoryCounts.end(); ++catIt, ++j ) {
+        auto category = catIt->first;
+        auto origCols = catIt->second;
+        for ( auto origColID : origCols ) {
+            (*collapsedCounts)[j] += counts[origColID];
+        }
+        for ( size_t i = 0; i < numRows; ++i ) {
+            auto accumCount = 0.0;
+            for ( auto col : origCols ) { accumCount += A[i][col]; }
+            (*collapsedA)[i][j] = accumCount * category[i];
+        }
+    }
+
+}
+
+
+
+template< typename MatT, typename VecT >
+std::vector<double> LARSSolve( MatT &Ain, VecT& bin )
+{
+
+        int m = Ain.rows(); 
+        int n = Ain.cols();
+        vigra::linalg::Matrix<double> A(m, n);
+        vigra::linalg::Matrix<double> b(m, 1);
+        
+        // fill A and b
+        for (int k=0; k < Ain.outerSize(); ++k) {
+
+          for (typename MatT::InnerIterator it(Ain,k); it; ++it) {
+            auto val = it.value();
+            auto i = it.row();   // row index
+            auto j = it.col();   // col index (here it is equal to k)
+            A(i,j) = val;
+         }
+        }
+
+        for (auto i : boost::irange(size_t(0), bin.size())) {
+            b(i,0) = bin[i];
+        }
+
+        // normalize the input
+        vigra::linalg::Matrix<double> offset(1,n), scaling(1,n);
+        vigra::linalg::prepareColumns(A, A, offset, scaling,  vigra::linalg::DataPreparationGoals(vigra::linalg::ZeroMean|vigra::linalg::UnitVariance));
+        vigra::linalg::prepareColumns(b, b, vigra::linalg::DataPreparationGoals(vigra::linalg::ZeroMean));
+
+        // arrays to hold the output
+        vigra::ArrayVector<vigra::ArrayVector<int> > activeSets;
+        vigra::ArrayVector<vigra::linalg::Matrix<double> > solutions;
+
+        // run leastAngleRegression() in non-negative LASSO mode
+        int numSolutions = vigra::linalg::leastAngleRegression(A, b, activeSets, solutions,
+                                    vigra::linalg::LeastAngleRegressionOptions().nnlasso());
+
+        // print results
+        std::vector<double> x(n);
+        vigra::linalg::Matrix<double> denseSolution(1, n);
+        for (vigra::MultiArrayIndex k = 0; k < numSolutions; ++k) {
+            // transform the sparse solution into a dense vector
+            denseSolution.init(0.0); // ensure that inactive variables are zero
+            for (unsigned int i = 0; i < activeSets[k].size(); ++i) {
+                // set the values of the active variables;
+                // activeSets[k][i] is the true index of the i-th variable in the active set
+                denseSolution(0, activeSets[k][i]) = solutions[k](i,0);
+            }
+
+            // invert the input normalization
+            denseSolution = denseSolution * pointWise(scaling);
+            for (auto i : boost::irange(int(0), n)) {
+              x[i] = denseSolution(0,i);
+            }
+        }
+
+        return x;
+}
+
+template< typename MatT, typename VecT >
+std::vector<double> shotgunSolve( MatT &A, VecT& b ) {
+
+    // Try shotgun LASSO solver
+    shotgun_data prob;
+    //std::cerr << "trying to reserve " << A.nonZeros() << " nonzeros in prob rows/cols\n";
+    prob.A_cols.resize( A.nonZeros(), sparse_array() );
+    prob.A_rows.resize( A.nonZeros(), sparse_array() );
+
+    std::cerr << "done\n";
+
+    auto nnz = A.nonZeros();
+    size_t ctr{0};
+    for (int k=0; k < A.outerSize(); ++k) {
+      //std::cerr << "column " << k << " of " << A.outerSize() << "\n";
+      for (typename MatT::InnerIterator it(A,k); it; ++it) {
+        auto val = it.value();
+        auto i = it.row();   // row index
+        auto j = it.col();   // col index (here it is equal to k)
+        //std::cerr << "adding " << i << ", " << j << " : " << val << "\n";
+        try {
+        prob.A_cols[j].add(i, val);
+        prob.A_rows[i].add(j, val);
+        ++ctr;
+        if ( ctr > nnz ) { std::cerr << "only reserved space for " << nnz <<
+          " elements, but you're pushing on the " << ctr << "th element\n";
+        }
+        } catch ( std::exception& e ) {
+          std::cerr << "died trying to add " << i << ", " << j << " : " << val << "\n";
+          std::cerr << e.what();
+        }
+      }
+    }
+
+    prob.nx = A.cols();
+    prob.ny = A.rows();
+
+    double me = *std::max_element(b.begin(), b.end());
+    auto scale = 1.0 / me;
+
+    prob.y.reserve(b.size());
+    for ( auto e : b ) { prob.y.push_back(e * scale); };
+        
+    double lambda = 0.1;
+    int K = 200;
+    int maxiter = 5000000;
+    int verbose = 3;
+    double threshold = 1e-15;
+    assert( prob.ny == prob.y.size() );
+    std::cerr << "before solve lasso\n";
+    LassoProblem lprob(&prob, lambda, K, threshold, maxiter, verbose);
+    lprob.solve();
+    std::cerr << "after solve lasso\n";
+
+    std::vector<double> myX(prob.x);
+    for ( auto& e : myX ) { e *= me; }
+    return myX;
+}
+
+/**
+  * Given the matrix A (m x n) and the right-hand-side vector b (m x 1) solve
+  * for the solution vector x ( n x 1 ) satisfying
+  * min_{x} || Ax - b ||
+  * subject to x[i] >= 0 for all i
+  */
+template< typename MatT, typename VecT >
+std::vector<double> nnlsSolve( MatT &A, VecT &b ) {
+    //std::cerr << "in least squares problem\n";
+    
+    Eigen::MatrixXd ADense = Eigen::MatrixXd(A);//.transpose();
+
+    int    mda = ADense.rows();
+    int    m = ADense.rows(), n = ADense.cols();
+
+    std::vector<double> w(n, 0.0);
+    std::vector<double> zz(m, 0.0);
+    std::vector<double> x(n, 1.0 );
+    std::vector<int> indx(n, 0);
+    assert(b.size() == m);
+    double rnorm = 0.0;
+    int mode = 0;
+
+    nnls( ADense.data(), mda, m, n, &b[0], &x[0], &rnorm, &w[0], &zz[0], &indx[0], &mode);
+        
+    return x;
+    /*
+    size_t errCtr = 0;
+    // Create a new TAUCS matrix structure
+    taucs_ccs_matrix* mat;
+    mat = new taucs_ccs_matrix;
+
+    mat->n = A.cols();
+    mat->m = A.rows();
+    mat->flags = TAUCS_DOUBLE;
+
+    std::cerr << "inited TAUCS matrix\n";
+
+    // Compress this matrix so we can steal / share the data
+    A.makeCompressed();
+
+    std::cerr << "compressed matrix\n";
+
+    mat->colptr = A.outerIndexPtr();
+    mat->rowind = A.innerIndexPtr();
+    mat->values.d = A.valuePtr();
+
+    double residualNorm{0.0};
+
+    std::cerr << "called NNLS solver\n";
+    auto x = t_snnls_fallback( mat, &b[0], &residualNorm, 0.0, 0);
+    char *errString;
+    tsnnls_error(&errString);
+    std::cerr << "NNLS solver returned\n";
+    std::cerr << "ERROR STRING " << errString << "\n";
+
+    if( x == NULL ) {
+      std::cerr << "AHHH, x is STILL NULL\n";
+
+      std::stringstream ss;
+      ss << "errorA_" << errCtr << ".mtx";
+      std::ofstream Afile(ss.str());
+
+      Afile << "SPARSE\n";
+      Afile << A.rows() << '\t' << A.cols() << '\n' << A.nonZeros() << '\n';
+      for (int k=0; k < A.outerSize(); ++k) {
+        for (typename MatT::InnerIterator it(A,k); it; ++it) {
+          Afile << it.row()+1 << '\t' << it.col()+1 << '\t' << it.value() << '\n';
+        }
+      }
+      Afile.close();
+
+      ss.clear();
+      ss << "errorb_"<< errCtr << ".mat";
+      std::ofstream bfile("errorb.mtx");
+      bfile << "# error right hand side\n";
+      bfile << "# rows: " << b.size() << "\n";
+      bfile << "# columns: 1\n";
+      for (size_t i = 0; i < b.size(); ++i) {
+          bfile << b[i] << '\n';
+      }
+      bfile.close();
+      ++errCtr;
+
+      delete mat;
+      return std::vector<double>();
+      //std::abort();
+    } else {
+
+      std::vector<double> myX( A.cols(), 0.0 );
+
+      for( size_t i = 0; i < myX.size(); ++i ) { myX[i] = x[i]; std::cerr << "x[" << i << "]  = " << x[i] << "\n"; }
+
+      free(x);
+      delete mat;
+
+      return myX;
+    }
+    */
+
+    /*
+    // Try shotgun LASSO solver
+    shotgun_data prob;
+    std::cerr << "trying to reserve " << A.nonZeros() << " nonzeros in prob rows/cols\n";
+    prob.A_cols.resize( A.nonZeros(), sparse_array() );
+    prob.A_rows.resize( A.nonZeros(), sparse_array() );
+
+    std::cerr << "done\n";
+
+    auto nnz = A.nonZeros();
+    size_t ctr{0};
+    for (int k=0; k < A.outerSize(); ++k) {
+      std::cerr << "column " << k << " of " << A.outerSize() << "\n";
+      for (typename MatT::InnerIterator it(A,k); it; ++it) {
+        auto val = it.value();
+        auto i = it.row();   // row index
+        auto j = it.col();   // col index (here it is equal to k)
+        //std::cerr << "adding " << i << ", " << j << " : " << val << "\n";
+        try {
+        prob.A_cols[j].add(i, val);
+        prob.A_rows[i].add(j, val);
+        ++ctr;
+        if ( ctr > nnz ) { std::cerr << "only reserved space for " << nnz <<
+          " elements, but you're pushing on the " << ctr << "th element\n";
+        }
+        } catch ( std::exception& e ) {
+          std::cerr << "died trying to add " << i << ", " << j << " : " << val << "\n";
+          std::cerr << e.what();
+        }
+      }
+    }
+
+    prob.nx = A.cols();
+    prob.ny = A.rows();
+
+    prob.y.reserve(b.size());
+    for ( auto e : b ) { prob.y.push_back(e); };
+    double lambda = 0.0;
+    int K = 10;
+    int maxiter = 5000;
+    int verbose = 10;
+    double threshold = 1e-8;
+    assert( prob.ny == prob.y.size() );
+    std::cerr << "before solve lasso\n";
+    solveLasso(&prob, lambda, K, threshold, maxiter, verbose);
+    std::cerr << "after solve lasso\n";
+
+    std::vector<double> myX(prob.x);
+    return myX;
+    */
+
+}
+
+
+
+
+
+template <typename MatT>
+std::vector<size_t> markDuplicateColumns( MatT &M ) {
+
+    std::vector<size_t> removeCols;
+    for ( size_t i = 0; i < M.cols(); ++i ) {
+        Eigen::SparseVector<typename MatT::Scalar> colI(M.middleCols(i, 1));
+        auto normI = colI.dot(colI);
+        auto stillValid = true;
+
+        for ( size_t j = i + 1; j < M.cols(); ++j ) {
+            if ( stillValid ) {
+                Eigen::SparseVector<typename MatT::Scalar> colJ(M.middleCols(j, 1));
+                auto normIJ = colI.dot(colJ);
+                if ( normIJ > normI ) {
+                    removeCols.push_back(i);
+                    stillValid = false;
+                }
+            }
+        } // end j
+    } // end i
+
+    return removeCols;
+}
+
+/**
+* Project vector v onto vector u
+*/
+template <typename VecT>
+VecT projectOnto( VecT &v, VecT &u ) {
+    return (v.dot(u) / u.dot(u) ) * u;
+}
+
+template <typename ColT>
+bool classicalGramSchmidt( std::vector<ColT> &basis, typename ColT::Scalar tol = 1e-10 ) {
+
+    // Orthogonalize the basis
+    for ( size_t j = 0; j < basis.size(); ++j ) {
+        // vj = aj
+        ColT vj(basis[j]);
+        for ( size_t i = 0; i < j; ++i ) {
+            // rij = qi * aj
+            //auto rij = basis[i].dot(basis[j]);
+            // vj = vj - rii * qi
+            vj = vj - projectOnto( basis[j], basis[i] );
+            //(basis[i] * rij);
+        }
+        // rjj = ||vj||_{2}
+        auto rjj = vj.dot(vj);
+        // If this vector isn't orthogonal, then return false now
+        if ( rjj <  tol ) {
+            return false;
+        }
+        // qj = vj / rjj
+        basis[j] = vj * (1.0 /  rjj);
+    }
+
+    return true;
+}
+
+template <typename ColT>
+bool addColumnToBasis( std::vector<ColT> &basis, ColT &c, typename ColT::Scalar tol = 1e-10 ) {
+
+    ColT vj(c);//basis[j]);
+    for ( size_t i = 0; i < basis.size(); ++i ) {
+        // rij = qi * aj
+        //auto rij = basis[i].dot(c);
+        // vj = vj - rii * qi
+        vj = vj - projectOnto( c, basis[i] );
+        //vj = vj - (basis[i] * rij);
+    }
+    // rjj = ||vj||_{2}
+    auto rjj = vj.dot(vj);
+
+    if ( rjj >= tol ) {
+        // qj = vj / rjj
+        basis.push_back( vj / rjj);
+        return true;
+    } else {
+        return false;
+    }
+}
+
+template <typename MatT>
+std::vector<size_t> markDependentColumns( MatT &M ) {
+
+using MatT::Index = typename;
+using MatT::Scalar = typename;
+
+    // Reference from MathOverflow:
+    // http://mathoverflow.net/questions/109868/finding-linearly-independent-columns-of-a-large-sparse-rectangular-matrix
+
+    // Structure that holds the column index and it's height
+    struct ColHeightT {
+        IndexT col;
+        IndexT height;
+    };
+
+    // Compute the "height" of each column, where the "height" of a column is the
+    // index of the row where the first non-zero entry occurs
+    std::vector< ColHeightT > heights;
+    heights.reserve(M.cols());
+
+    for ( size_t i = 0; i < M.cols(); ++i ) {
+        typename MatT::InnerIterator it(M, i);
+        heights.push_back( {i, it.row()} );
+    }
+
+    // Sort the columns by height
+    std::sort( heights.begin(), heights.end(),
+               []( const ColHeightT & a, const ColHeightT & b) -> bool { return a.height < b.height; } );
+
+    // We'll always keep the first column
+    std::unordered_set<size_t> independentCols { heights.front().col };
+    IndexT currHeight { heights.front().height };
+
+    // Pick at least one column from each height class
+    for ( auto & ch : heights ) {
+        if ( ch.height > currHeight ) {
+            independentCols.insert(ch.col);
+            currHeight = ch.height;
+        }
+    }
+
+    std::vector< Eigen::SparseVector< ScalarT > > basis;
+    for ( auto c : independentCols ) {
+        Eigen::SparseVector<ScalarT> col(M.middleCols(c, 1));
+        assert(col.size() == M.rows());
+        basis.push_back( col );
+    }
+
+    // orthogonalize the current basis
+    auto isIndependent = classicalGramSchmidt(basis);
+    if (!isIndependent) {
+        std::cerr << "Something went horribly wrong; the original set of vectors was not linearly independent!\n";
+        std::cerr << "Doing the safe thing (start with just one col and build up\n";
+        independentCols = {0};
+        Eigen::SparseVector<ScalarT> col(M.middleCols(0, 1));
+        basis = { col };
+        classicalGramSchmidt(basis);
+        //std::abort();
+    }
+
+    std::vector<size_t> removeCols;
+    // For each column that's not already in our basis
+    for ( size_t i = 0; i < M.cols(); ++i ) {
+        if ( independentCols.find(i) == independentCols.end() ) {
+            // Try to add it to our basis
+            Eigen::SparseVector<ScalarT> c( M.middleCols(i, 1) );
+            assert(c.size() == M.rows());
+            auto independent = addColumnToBasis(basis, c);
+            // If we were able to add it, it's independent
+            if ( independent ) {
+                independentCols.insert(i);
+            } else { // Otherwise it's dependent
+                removeCols.push_back(i);
+            }
+        } // don't mess with already independent columsn
+    } // done loop over columns
+
+    return removeCols;
+}
+}
+
+
+#endif // MATRIX_TOOLS_HPP
diff --git a/include/MicroOpts.hpp b/include/MicroOpts.hpp
new file mode 100644
index 0000000..a9aadea
--- /dev/null
+++ b/include/MicroOpts.hpp
@@ -0,0 +1,16 @@
+#ifndef MICROOPTS_HPP
+#define MICROOPTS_HPP
+
+/** Based on Likely.h from FB's Folly library **/
+
+#undef LIKELY
+#undef UNLIKELY
+
+#if defined(__GUNC__) && __GNUC__ >= 4
+    #define LIKELY(X) (__builtin_expect((x), 1))
+    #define UNLIKELY(X) (__builtin_expect((x), 0))
+#elif defined(__clang__)
+
+
+
+#endif // MICROOPTS_HPP
diff --git a/include/MiniBatchInfo.hpp b/include/MiniBatchInfo.hpp
new file mode 100644
index 0000000..b62c31d
--- /dev/null
+++ b/include/MiniBatchInfo.hpp
@@ -0,0 +1,74 @@
+#ifndef  MINIBATCH_INFO
+#define  MINIBATCH_INFO
+
+#include <vector>
+#include "LibraryFormat.hpp"
+#include "ReadPair.hpp"
+#include "UnpairedRead.hpp"
+#include "AlignmentGroup.hpp"
+#include "concurrentqueue.h"
+
+template <typename AlnGroupT>
+class MiniBatchInfo {
+    public:
+        MiniBatchInfo(size_t _batchNum, std::vector<AlnGroupT*>* _alignments, double _logForgettingMass) :
+            batchNum(_batchNum), alignments(_alignments), logForgettingMass(_logForgettingMass) {}
+
+        size_t batchNum;
+        //AlignmentBatch* alignments;
+        //std::vector<size_t>* readGroupBoundaries;
+        std::vector<AlnGroupT*>* alignments;
+        double logForgettingMass;
+
+        template <typename FragT>
+        void release(tbb::concurrent_queue<FragT*>& fragmentQueue,
+                     moodycamel::ConcurrentQueue<AlnGroupT*>& alignmentGroupQueue){
+                    // tbb::concurrent_bounded_queue<AlnGroupT*>& alignmentGroupQueue){
+            size_t ng{0};
+            for (auto& alnGroup : *alignments) {
+                //fragmentQueue.enqueue_bulk(alnGroup->alignments().begin(), alnGroup->alignments().size());
+
+                for (auto aln : alnGroup->alignments()) {
+                   fragmentQueue.push(aln);
+                    aln = nullptr;
+                }
+
+                alnGroup->alignments().clear();
+                //alignmentGroupQueue.push(alnGroup);
+                //alnGroup = nullptr;
+                ++ng;
+            }
+
+            alignmentGroupQueue.enqueue_bulk(std::make_move_iterator(alignments->begin()), alignments->size());
+            delete alignments;
+            alignments = nullptr;
+        }
+
+};
+
+/*
+template <>
+void MiniBatchInfo<AlignmentGroup<ReadPair>>::release(
+        tbb::concurrent_bounded_queue<ReadPair*>& alignmentStructureQueue,
+        tbb::concurrent_bounded_queue<AlignmentGroup<ReadPair>*>& alignmentGroupQueue) {
+    size_t ng{0};
+    for (auto& alnGroup : *alignments) {
+        for (auto& aln : alnGroup->alignments()) {
+
+            alignmentStructureQueue.push(aln.read1);
+            alignmentStructureQueue.push(aln.read2);
+            aln.read1 = nullptr;
+            aln.read2 = nullptr;
+        }
+        alnGroup->alignments().clear();
+        alignmentGroupQueue.push(alnGroup);
+        //delete alnGroup;
+        alnGroup = nullptr;
+        ++ng;
+    }
+    delete alignments;
+    alignments = nullptr;
+}
+*/
+
+#endif // MINIBATCH_INFO
diff --git a/include/MultinomialSampler.hpp b/include/MultinomialSampler.hpp
new file mode 100644
index 0000000..9b3d91c
--- /dev/null
+++ b/include/MultinomialSampler.hpp
@@ -0,0 +1,53 @@
+#ifndef _MULTINOMIAL_SAMPLER_HPP_
+#define _MULTINOMIAL_SAMPLER_HPP_
+
+#include <random>
+#include <vector>
+
+class MultinomialSampler {
+ public:
+     MultinomialSampler(std::random_device& rd) :
+         gen_(rd()), u01_(0.0, 1.0) {}
+
+     void operator()(
+             std::vector<int>::iterator sampleBegin,
+             uint32_t n,
+             uint32_t k,
+             std::vector<double>::iterator probsBegin,
+	     bool clearCounts = true) {
+         int i, j;
+         double u, sum;
+         std::vector<double> z(k+1, 0.0);
+
+	 if (clearCounts) {
+		 for (uint32_t i = 0; i < k; i++) {
+		     *(sampleBegin + i) = 0;
+		 }
+	 }
+
+         z[0] = 0;
+         for (i = 1; i <= k; i++) {
+             sum = 0;
+             for (j = 0; j < i; j++) sum+= *(probsBegin + j);
+             z[i] = sum;
+         }
+
+         for (j = 0; j < n; j++) {
+             u = u01_(gen_);
+
+             for (i = 0; i < k; i++) {
+                 if ((z[i] < u) && (u <= z[i+1])) {
+                     (*(sampleBegin + i))++;
+                 }
+             }
+         }
+     }
+
+
+private:
+   std::mt19937 gen_;
+   std::uniform_real_distribution<> u01_;
+};
+
+#endif //_MULTINOMIAL_SAMPLER_HPP_
+
diff --git a/include/MultithreadedBAMParser.hpp b/include/MultithreadedBAMParser.hpp
new file mode 100644
index 0000000..6bc23b7
--- /dev/null
+++ b/include/MultithreadedBAMParser.hpp
@@ -0,0 +1,68 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Sailfish.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#ifndef __MULTITHREADED_BAM_PARSER__
+#define __MULTITHREADED_BAM_PARSER__
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+#include <thread>
+#include <atomic>
+#include <iostream>
+
+#include "api/BamReader.h"
+#include "api/BamAlignment.h"
+
+#include <boost/filesystem.hpp>
+#include <boost/range/irange.hpp>
+#include "tbb/concurrent_queue.h"
+
+namespace bfs = boost::filesystem;
+
+struct ReadSeq {
+    char* seq = nullptr;
+    size_t len = 0;
+    char* name = nullptr;
+    size_t nlen = 0;
+};
+
+class MultithreadedBAMParser {
+public:
+    MultithreadedBAMParser( std::vector<bfs::path>& files );
+    ~MultithreadedBAMParser();
+    bool start();
+    bool nextAlignment(BamTools::BamAlignment*& seq);
+    void finishedWithAlignment(BamTools::BamAlignment*& s);
+
+private:
+    std::vector<bfs::path>& inputStreams_;
+    bool parsing_;
+    std::thread* parsingThread_;
+    tbb::concurrent_bounded_queue<BamTools::BamAlignment*> alnQueue_, seqContainerQueue_;
+    BamTools::BamAlignment* alnStructs_;
+    const size_t queueCapacity_ = 5000000;
+};
+
+//#include "Parser.cpp"
+
+#endif // __MULTITHREADED_BAM_PARSER__
diff --git a/include/NullFragmentFilter.hpp b/include/NullFragmentFilter.hpp
new file mode 100644
index 0000000..26cfa9f
--- /dev/null
+++ b/include/NullFragmentFilter.hpp
@@ -0,0 +1,10 @@
+#ifndef __NULL_FRAGMENT_FILTER_HPP__
+#define __NULL_FRAGMENT_FILTER_HPP__
+
+template <typename FragT>
+class NullFragmentFilter {
+    public:
+    inline void processFrag(FragT* f) { }
+};
+
+#endif // __NULL_FRAGMENT_FILTER_HPP__
diff --git a/include/OutputUnmappedFilter.hpp b/include/OutputUnmappedFilter.hpp
new file mode 100644
index 0000000..34d6c7d
--- /dev/null
+++ b/include/OutputUnmappedFilter.hpp
@@ -0,0 +1,46 @@
+#ifndef __OUTPUT_UNMAPPED_FILTER_HPP__
+#define __OUTPUT_UNMAPPED_FILTER_HPP__
+
+#include <iostream>
+#include <tbb/concurrent_queue.h>
+#include "UnpairedRead.hpp"
+#include "ReadPair.hpp"
+
+template <typename FragT>
+class OutputUnmappedFilter {
+    public:
+        OutputUnmappedFilter(tbb::concurrent_bounded_queue<FragT*>* outQueue) :
+        outQueue_(outQueue), qlen_(0) {
+            memset(&qname_[0], 0, 255);
+        }
+
+        inline void processFrag(FragT* f) {
+            bool distinctRead{true};
+
+            // Check if the new read name is distinct from
+            // the previous one.
+            auto newLen = f->getNameLength();
+            if (qlen_ == newLen) {
+                distinctRead = (memcmp(&qname_[0], f->getName(), qlen_) != 0);
+            }
+
+            // If this doesn't have the same name as the last read we saw,
+            // than pass it to the output queue and update the name of the
+            // most recently observed read.
+            if (distinctRead) {
+                auto* alnCpy = f->clone();
+                outQueue_->push(alnCpy);
+                qlen_ = newLen;
+                memcpy(&qname_[0], f->getName(), qlen_);
+            }
+            // If it does have the same name as the last read, no need
+            // to update.
+        }
+
+    private:
+        tbb::concurrent_bounded_queue<FragT*>* outQueue_ = nullptr;
+        char qname_[255];
+        uint32_t qlen_;
+};
+
+#endif // __OUTPUT_UNMAPPED_FILTER_HPP__
diff --git a/include/PCA.hpp b/include/PCA.hpp
new file mode 100644
index 0000000..7d7f42d
--- /dev/null
+++ b/include/PCA.hpp
@@ -0,0 +1,108 @@
+#ifndef __PCA__HPP__
+#define __PCA__HPP__
+
+#include <iostream>
+#include <unordered_set>
+#include "Eigen/Dense"
+
+class PCA {
+    public:
+    PCA(Eigen::MatrixXd& m) { //Eigen::Map<Eigen::MatrixXd>& m) {
+        dat_ = m;
+        /*
+        dat_ = Eigen::MatrixXd(m.rows(), m.cols());
+        for(size_t i = 0; i < m.rows(); ++i) {
+            for (size_t j = 0; j < m.cols(); ++j) {
+                dat_(i,j) = m(i,j);
+            }
+        }
+        */
+    }
+
+    void performDecomposition(bool standardize=true) {
+        if (standardize) {
+            std::unordered_set<uint32_t> droppedCols;
+
+            size_t nrows = dat_.rows();
+            Eigen::VectorXd means(dat_.cols());
+            means = dat_.colwise().mean();
+            Eigen::VectorXd stdDev(dat_.cols());
+            for (size_t i = 0; i < dat_.cols(); ++i) {
+                for (size_t j = 0; j < dat_.rows(); ++j) {
+                    dat_(j,i) -= means(i);
+                }
+                auto x = dat_.col(i).dot(dat_.col(i));
+                auto sd = std::sqrt(x / nrows);
+                if (sd < 1e-20) {
+                    droppedCols.insert(i);
+                } else {
+                    dat_.col(i) /= sd;
+                }
+            }
+
+            // If we dropped any columns for having a "0" standard deviation,
+            // then create a new normalized data matrix without them.
+            if (droppedCols.size() > 0) {
+                Eigen::MatrixXd tmpDat(dat_.rows(), dat_.cols() - droppedCols.size());
+                size_t curCol{0};
+                for (size_t i = 0; i < dat_.cols(); ++i) {
+                    if (droppedCols.find(i) == droppedCols.end()) {
+                        tmpDat.col(curCol) = dat_.col(i);
+                        ++curCol;
+                    }
+                }
+                std::swap(dat_, tmpDat);
+            }
+        }
+
+        //std::cerr << "DATA = " << dat_ << "\n";
+        Eigen::JacobiSVD<Eigen::MatrixXd> svd(dat_,
+                                              Eigen::ComputeThinU |
+                                              Eigen::ComputeThinV);
+        proj_ = svd.matrixV();
+        variances_ = svd.singularValues();
+        //std::cerr << "V = " << proj_ << "\n";
+        //std::cerr << "S = " << variances_;
+    }
+
+    uint32_t varFrac(double frac, bool verbose=false) {
+        uint32_t cutoff{0};
+        bool haveCutoff{false};
+
+        if (verbose) {
+            std::cerr << "variances: " << variances_ << "\n";
+        }
+        Eigen::VectorXd fracs(variances_.rows());
+        double totalVar{0.0};
+        for (uint32_t i = 0; i < variances_.rows(); ++i) {
+            variances_(i) = variances_(i) * variances_(i);
+            totalVar += variances_(i);
+        }
+        double partialSum{0.0};
+        for (uint32_t i = 0; i < variances_.rows(); ++i) {
+            partialSum += variances_(i);
+            fracs(i) = partialSum / totalVar;
+            if (!haveCutoff and fracs(i) >= frac) {
+                cutoff = i;
+                haveCutoff = true;
+            }
+        }
+        if (verbose) {
+            std::cerr << "fracs = " << fracs << "\n";
+            std::cerr << "include the first " << cutoff + 1 << " components to reach cutoff of " << frac << "\n";
+        }
+        return cutoff + 1;
+    }
+
+    Eigen::MatrixXd projectedData(double frac, bool verbose=false) {
+        uint32_t numVec = varFrac(frac, verbose);
+        return dat_ * proj_.block(0, 0, proj_.rows(), numVec);
+    }
+
+    private:
+        Eigen::MatrixXd dat_;
+        Eigen::MatrixXd proj_;
+        Eigen::VectorXd variances_;
+};
+
+#endif //__PCA__HPP__
diff --git a/include/PairSequenceParser.hpp b/include/PairSequenceParser.hpp
new file mode 100644
index 0000000..e0d1f23
--- /dev/null
+++ b/include/PairSequenceParser.hpp
@@ -0,0 +1,193 @@
+#ifndef __PAIR_SEQUENCE_PARSER_HPP__
+#define __PAIR_SEQUENCE_PARSER_HPP__
+
+#include <string>
+#include <memory>
+#include <utility>
+#include <vector>
+#include <thread>
+#include <mutex>
+#include <fstream>
+
+#include <jellyfish/err.hpp>
+#include <jellyfish/cooperative_pool2.hpp>
+#include <jellyfish/cpp_array.hpp>
+
+struct header_sequence_qual {
+  std::string header;
+  std::string seq;
+  std::string qual;
+};
+struct sequence_list {
+  size_t nb_filled;
+  std::vector<std::pair<header_sequence_qual, header_sequence_qual> > data;
+};
+
+template<typename PathIterator>
+class pair_sequence_parser : public jellyfish::cooperative_pool2<pair_sequence_parser<PathIterator>, sequence_list> {
+  typedef jellyfish::cooperative_pool2<pair_sequence_parser<PathIterator>, sequence_list> super;
+  typedef std::unique_ptr<std::istream> stream_type;
+  enum file_type { DONE_TYPE, FASTA_TYPE, FASTQ_TYPE, ERROR_TYPE };
+
+  struct stream_status {
+    file_type   type;
+    std::string buffer;
+    stream_type stream1;
+    stream_type stream2;
+
+    stream_status() : type(DONE_TYPE) { }
+  };
+  jellyfish::cpp_array<stream_status> streams_;
+  PathIterator                        path_begin_, path_end_;
+  std::mutex                          path_mutex_;
+
+public:
+  /// Size is the number of buffers to keep around. It should be
+  /// larger than the number of thread expected to read from this
+  /// class. nb_sequences is the number of sequences to read into a
+  /// buffer. 'begin' and 'end' are iterators to a range of istream.
+  pair_sequence_parser(uint32_t size, uint32_t nb_sequences,
+                       uint32_t max_producers,
+                       PathIterator path_begin, PathIterator path_end) :
+    super(max_producers, size),
+    streams_(max_producers),
+    path_begin_(path_begin), path_end_(path_end)
+  {
+    for(auto it = super::element_begin(); it != super::element_end(); ++it) {
+      it->nb_filled = 0;
+      it->data.resize(nb_sequences);
+    }
+    for(uint32_t i = 0; i < max_producers; ++i) {
+      streams_.init(i);
+      open_next_files(streams_[i]);
+    }
+  }
+
+  inline bool produce(uint32_t i, sequence_list& buff) {
+    stream_status& st = streams_[i];
+
+    switch(st.type) {
+    case FASTA_TYPE:
+      read_fasta(st, buff);
+      break;
+    case FASTQ_TYPE:
+      read_fastq(st, buff);
+      break;
+    case DONE_TYPE:
+    case ERROR_TYPE:
+      return true;
+    }
+
+    if(st.stream1->good() && st.stream2->good())
+      return false;
+
+    // Reach the end of file, close current and try to open the next one
+    open_next_files(st);
+    return false;
+  }
+
+protected:
+  file_type peek_file_type(std::istream& is) {
+    switch(is.peek()) {
+    case EOF: return DONE_TYPE;
+    case '>': return FASTA_TYPE;
+    case '@': return FASTQ_TYPE;
+    default: return ERROR_TYPE;
+    }
+  }
+
+  void open_next_files(stream_status& st) {
+    st.stream1.reset();
+    st.stream2.reset();
+    const char *p1 = 0, *p2 = 0;
+    {
+      std::lock_guard<std::mutex> lck(path_mutex_);
+      if(path_begin_ < path_end_) {
+        p1 = *path_begin_;
+        ++path_begin_;
+      }
+      if(path_begin_ < path_end_) {
+        p2 = *path_begin_;
+        ++path_begin_;
+      }
+    }
+
+    if(!p1 || !p2) {
+      st.type = DONE_TYPE;
+      return;
+    }
+    st.stream1.reset(new std::ifstream(p1));
+    st.stream2.reset(new std::ifstream(p2));
+    if(!*st.stream1 || !*st.stream2) {
+      st.type = DONE_TYPE;
+      return;
+    }
+
+    // Update the type of the current file and move past first header
+    // to beginning of sequence.
+    file_type type1 = peek_file_type(*st.stream1);
+    file_type type2 = peek_file_type(*st.stream1);
+    if(type1 == DONE_TYPE || type2 == DONE_TYPE)
+      return open_next_files(st);
+    if(type1 != type2)
+      eraise(std::runtime_error) << "Paired files are of different format";
+    if(type1 == ERROR_TYPE || type2 == ERROR_TYPE)
+      eraise(std::runtime_error) << "Unsupported format";
+    st.type = type1;
+  }
+
+  void read_fasta_one_sequence(std::istream& is, std::string& tmp, header_sequence_qual& hsq) {
+    is.get(); // Skip '>'
+    std::getline(is, hsq.header);
+    hsq.seq.clear();
+    while(is.peek() != '>' && is.peek() != EOF) {
+      std::getline(is, tmp); // Wish there was an easy way to combine the
+      hsq.seq.append(tmp);             // two lines avoiding copying
+    }
+  }
+
+  void read_fasta(stream_status& st, sequence_list& buff) {
+    size_t&      nb_filled = buff.nb_filled;
+    const size_t data_size = buff.data.size();
+
+    for(nb_filled = 0; nb_filled < data_size && st.stream1->peek() != EOF && st.stream2->peek() != EOF; ++nb_filled) {
+      read_fasta_one_sequence(*st.stream1, st.buffer, buff.data[nb_filled].first);
+      read_fasta_one_sequence(*st.stream2, st.buffer, buff.data[nb_filled].second);
+    }
+  }
+
+  void read_fastq_one_sequence(std::istream& is, std::string& tmp, header_sequence_qual& hsq) {
+    is.get(); // Skip '@'
+    std::getline(is, hsq.header);
+    hsq.seq.clear();
+    while(is.peek() != '+' && is.peek() != EOF) {
+      std::getline(is, tmp); // Wish there was an easy way to combine the
+      hsq.seq.append(tmp);             // two lines avoiding copying
+    }
+    if(!is.good())
+      eraise(std::runtime_error) << "Truncated fastq file";
+    is.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+    hsq.qual.clear();
+    while(hsq.qual.size() < hsq.seq.size() && is.good()) {
+      std::getline(is, tmp);
+      hsq.qual.append(tmp);
+    }
+    if(hsq.qual.size() != hsq.seq.size())
+      eraise(std::runtime_error) << "Invalid fastq file: wrong number of quals";
+    if(is.peek() != EOF && is.peek() != '@')
+      eraise(std::runtime_error) << "Invalid fastq file: header missing";
+
+  }
+
+  void read_fastq(stream_status& st, sequence_list& buff) {
+    size_t&      nb_filled = buff.nb_filled;
+    const size_t data_size = buff.data.size();
+
+    for(nb_filled = 0; nb_filled < data_size && st.stream1->peek() != EOF && st.stream2->peek() != EOF; ++nb_filled) {
+      read_fastq_one_sequence(*st.stream1, st.buffer, buff.data[nb_filled].first);
+      read_fastq_one_sequence(*st.stream2, st.buffer, buff.data[nb_filled].second);
+    }
+  }
+};
+
+#endif /* __PAIR_SEQUENCE_PARSER_HPP__ */
diff --git a/include/PartitionRefiner.hpp b/include/PartitionRefiner.hpp
new file mode 100644
index 0000000..eb349a2
--- /dev/null
+++ b/include/PartitionRefiner.hpp
@@ -0,0 +1,42 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Sailfish.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#ifndef __PARTITION_REFINER_HPP__
+#define __PARTITION_REFINER_HPP__
+
+#include <vector>
+#include "LookUpTableUtils.hpp"
+
+class PartitionRefiner {
+public:
+        PartitionRefiner(LUTTools::KmerID numElem);
+        void splitWith(std::vector<LUTTools::KmerID> splitter);
+        void relabel();
+        const std::vector<LUTTools::KmerID>& partitionMembership();
+
+private:
+        LUTTools::KmerID numElem_;
+        std::vector<LUTTools::KmerID> membership_;
+        LUTTools::KmerID maxSetIdx_;
+};
+
+#endif // __PARTITION_REFINER_HPP__
diff --git a/include/PerfectHashIndex.hpp b/include/PerfectHashIndex.hpp
new file mode 100644
index 0000000..3217858
--- /dev/null
+++ b/include/PerfectHashIndex.hpp
@@ -0,0 +1,174 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Sailfish.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#ifndef __PERFECT_HASH_INDEX_HPP__
+#define __PERFECT_HASH_INDEX_HPP__
+
+#include <vector>
+#include <chrono>
+#include <iostream>
+#include <cstdio>
+#include <memory>
+#include <functional>
+
+#include <sys/mman.h>
+
+#include "boost/timer/timer.hpp"
+#include "cmph.h"
+
+//template <typename Deleter>
+class PerfectHashIndex {
+  using Kmer = uint64_t;
+  using Count = uint32_t;
+  using AtomicKmer = std::atomic<Kmer>;
+  using AtomicCount = std::atomic<Count>;
+  using Deleter = std::function<void(cmph_t*)>;
+
+  public:
+   // We'll return this invalid id if a kmer is not found in our DB
+   size_t INVALID = std::numeric_limits<size_t>::max();
+
+   PerfectHashIndex( std::vector<Kmer>& kmers, std::unique_ptr<cmph_t, Deleter>& hash, 
+                     uint32_t merSize, bool canonical ) : kmers_(std::move(kmers)), 
+                                                          hash_(std::move(hash)), 
+                                                          hashRaw_(hash_.get()),
+                                                          merSize_(merSize),
+                                                          canonical_(canonical) {}
+
+   PerfectHashIndex( PerfectHashIndex&& ph ) {
+   	merSize_ = ph.merSize_;
+   	hash_ = std::move(ph.hash_);
+    hashRaw_ = hash_.get();
+   	kmers_ = std::move(ph.kmers_);
+    canonical_ = ph.canonical_;
+   }
+
+   void dumpToFile(const std::string& fname) {
+   	FILE* out = fopen(fname.c_str(), "w");
+
+   	// read the key set
+    fwrite( reinterpret_cast<char*>(&merSize_), sizeof(merSize_), 1, out );
+    fwrite( reinterpret_cast<char*>(&canonical_), sizeof(canonical_), 1, out);
+    size_t numCounts = kmers_.size();
+    fwrite( reinterpret_cast<char*>(&numCounts), sizeof(size_t), 1, out );
+    fwrite( reinterpret_cast<char*>(&kmers_[0]), sizeof(Kmer), numCounts, out );
+
+    cmph_dump(hash_.get(), out); 
+    fclose(out);
+
+   }
+
+   static PerfectHashIndex fromFile( const std::string& fname ) {
+   	FILE* in = fopen(fname.c_str(),"r");
+
+   	// read the key set
+    uint32_t merSize;
+    fread( reinterpret_cast<char*>(&merSize), sizeof(merSize), 1, in );
+    bool canonical;
+    fread( reinterpret_cast<char*>(&canonical), sizeof(canonical), 1, in );
+    size_t numCounts;
+    fread( reinterpret_cast<char*>(&numCounts), sizeof(size_t), 1, in );
+    std::vector<Kmer> kmers(numCounts, Kmer(0));
+    fread( reinterpret_cast<char*>(&kmers[0]), sizeof(Kmer), numCounts, in );
+
+    // read the hash
+    std::unique_ptr<cmph_t, Deleter> hash( cmph_load(in), cmph_destroy );
+    PerfectHashIndex index(kmers, hash, merSize, canonical);
+
+    fclose(in);
+
+    return index;
+   }
+
+   inline size_t getKmerIndex( uint64_t kmer ) {
+    return kmer % kmers_.size();
+   }
+
+   inline size_t index( uint64_t kmer ) {
+   	char *key = reinterpret_cast<char*>(&kmer);
+    unsigned int id{cmph_search(hashRaw_, key, sizeof(uint64_t))};
+    return (kmers_[id] == kmer) ? id : INVALID;
+   }
+
+   inline size_t numKeys() { return kmers_.size(); }
+
+   bool verify() {
+   	auto start = std::chrono::steady_clock::now();
+   	for ( auto k : kmers_ ) { 
+   		if( kmers_[index(k)] != k ) { return false; }
+   	}
+   	auto end = std::chrono::steady_clock::now();
+   	auto ms = std::chrono::duration_cast<std::chrono::microseconds>(end-start);
+   	std::cerr << "verified: " << static_cast<double>(ms.count()) / kmers_.size() << " us / key\n";
+   	return true;
+   }
+
+   void will_need(uint32_t threadIdx, uint32_t numThreads) {
+     auto pageSize = sysconf(_SC_PAGESIZE);
+     size_t numPages{0};
+     
+     auto entriesPerPage = pageSize / sizeof(char);
+     auto size = cmph_size(hashRaw_);
+     numPages = (sizeof(char) * size) / entriesPerPage;
+     // number of pages that each thread should touch
+     auto numPagesPerThread = numPages / numThreads;
+     auto entriesPerThread = entriesPerPage * numPagesPerThread;
+     // the page this thread starts touching
+     auto start = entriesPerPage * threadIdx;
+     // the last page this thread touches
+     auto end = start + entriesPerThread;
+
+     for (size_t i = start; i < size; i += numThreads*entriesPerPage) {      
+      *(reinterpret_cast<char*>(hashRaw_)+i) = *(reinterpret_cast<char*>(hashRaw_)+i);
+     }
+
+     // entries per page
+     entriesPerPage = pageSize / sizeof(Kmer);
+     // total number of pages
+     size = kmers_.size();
+     numPages = (sizeof(Kmer) * kmers_.size()) / entriesPerPage;
+     // number of pages that each thread should touch
+     numPagesPerThread = numPages / numThreads;
+     entriesPerThread = entriesPerPage * numPagesPerThread;
+     // the page this thread starts touching
+     start = entriesPerPage * threadIdx;
+     // the last page this thread touches
+     end = start + entriesPerThread;
+     for (size_t i = start; i < size; i += numThreads*entriesPerPage) {
+      //std::cerr << "thread " << threadIdx << " is touching page " << i / entriesPerPage << "\n";
+      kmers_[i] = kmers_[i];
+     }
+   }
+
+   inline bool canonical() { return canonical_; }
+   inline uint32_t kmerLength() { return merSize_; }
+   const std::vector<Kmer>& kmers() { return kmers_; }
+
+   private:
+   	std::vector<Kmer> kmers_;
+   	std::unique_ptr<cmph_t, Deleter> hash_;
+    cmph_t* hashRaw_;
+   	uint32_t merSize_;
+    bool canonical_;
+};
+
+#endif // __PERFECT_HASH_INDEX_HPP__
\ No newline at end of file
diff --git a/include/ReadExperiment.hpp b/include/ReadExperiment.hpp
new file mode 100644
index 0000000..43d5f28
--- /dev/null
+++ b/include/ReadExperiment.hpp
@@ -0,0 +1,458 @@
+#ifndef EXPERIMENT_HPP
+#define EXPERIMENT_HPP
+
+extern "C" {
+#include "bwa.h"
+#include "bwamem.h"
+#include "kvec.h"
+#include "utils.h"
+}
+
+// Our includes
+#include "ClusterForest.hpp"
+#include "Transcript.hpp"
+#include "ReadLibrary.hpp"
+#include "FragmentLengthDistribution.hpp"
+#include "FragmentStartPositionDistribution.hpp"
+#include "SequenceBiasModel.hpp"
+#include "SalmonOpts.hpp"
+#include "EquivalenceClassBuilder.hpp"
+
+// Logger includes
+#include "spdlog/spdlog.h"
+
+// Boost includes
+#include <boost/filesystem.hpp>
+#include <boost/range/irange.hpp>
+
+// Standard includes
+#include <vector>
+#include <memory>
+#include <fstream>
+
+/**
+  *  This class represents a library of alignments used to quantify
+  *  a set of target transcripts.  The AlignmentLibrary contains info
+  *  about both the alignment file and the target sequence (transcripts).
+  *  It is used to group them together and track information about them
+  *  during the quantification procedure.
+  */
+class ReadExperiment {
+
+    public:
+
+    ReadExperiment(std::vector<ReadLibrary>& readLibraries,
+                   //const boost::filesystem::path& transcriptFile,
+                   const boost::filesystem::path& indexDirectory,
+		   SalmonOpts& sopt) :
+        readLibraries_(readLibraries),
+        //transcriptFile_(transcriptFile),
+        transcripts_(std::vector<Transcript>()),
+        totalAssignedFragments_(0),
+        fragStartDists_(5),
+        seqBiasModel_(1.0),
+	eqBuilder_(sopt.jointLog) {
+            namespace bfs = boost::filesystem;
+
+            // Make sure the read libraries are valid.
+            for (auto& rl : readLibraries_) { rl.checkValid(); }
+
+            size_t maxFragLen = sopt.fragLenDistMax;
+            size_t meanFragLen = sopt.fragLenDistPriorMean;
+            size_t fragLenStd = sopt.fragLenDistPriorSD;
+            size_t fragLenKernelN = 4;
+            double fragLenKernelP = 0.5;
+            fragLengthDist_.reset(new FragmentLengthDistribution(1.0, maxFragLen,
+                    meanFragLen, fragLenStd,
+                    fragLenKernelN,
+                    fragLenKernelP, 1));
+
+            // Make sure the transcript file exists.
+            /*
+            if (!bfs::exists(transcriptFile_)) {
+                std::stringstream ss;
+                ss << "The provided transcript file: " << transcriptFile_ <<
+                    " does not exist!\n";
+                throw std::invalid_argument(ss.str());
+            }
+            */
+
+            // ====== Load the transcripts from file
+            { // mem-based
+                bfs::path indexPath = indexDirectory / "bwaidx";
+                if ((idx_ = bwa_idx_load(indexPath.string().c_str(), BWA_IDX_BWT|BWA_IDX_BNS|BWA_IDX_PAC)) == 0) {
+                    fmt::print(stderr, "Couldn't open index [{}] --- ", indexPath);
+                    fmt::print(stderr, "Please make sure that 'salmon index' has been run successfully\n");
+                    std::exit(1);
+                }
+            }
+
+            size_t numRecords = idx_->bns->n_seqs;
+            std::vector<Transcript> transcripts_tmp;
+
+            fmt::print(stderr, "Index contained {} targets\n", numRecords);
+            //transcripts_.resize(numRecords);
+            for (auto i : boost::irange(size_t(0), numRecords)) {
+                uint32_t id = i;
+                char* name = idx_->bns->anns[i].name;
+                uint32_t len = idx_->bns->anns[i].len;
+                // copy over the length, then we're done.
+                transcripts_tmp.emplace_back(id, name, len);
+            }
+
+            std::sort(transcripts_tmp.begin(), transcripts_tmp.end(),
+                    [](const Transcript& t1, const Transcript& t2) -> bool {
+                    return t1.id < t2.id;
+                    });
+            double alpha = 0.005;
+            char nucTab[256];
+            nucTab[0] = 'A'; nucTab[1] = 'C'; nucTab[2] = 'G'; nucTab[3] = 'T';
+            for (size_t i = 4; i < 256; ++i) { nucTab[i] = 'N'; }
+
+            // Load the transcript sequence from file
+            for (auto& t : transcripts_tmp) {
+                transcripts_.emplace_back(t.id, t.RefName.c_str(), t.RefLength, alpha);
+                /* from BWA */
+                uint8_t* rseq = nullptr;
+                int64_t tstart, tend, compLen, l_pac = idx_->bns->l_pac;
+                tstart  = idx_->bns->anns[t.id].offset;
+                tend = tstart + t.RefLength;
+                rseq = bns_get_seq(l_pac, idx_->pac, tstart, tend, &compLen);
+                if (compLen != t.RefLength) {
+                    fmt::print(stderr,
+                               "For transcript {}, stored length ({}) != computed length ({}) --- index may be corrupt. exiting\n",
+                               t.RefName, compLen, t.RefLength);
+                    std::exit(1);
+                }
+                std::string seq(t.RefLength, ' ');
+                if (rseq != 0) {
+                    for (size_t i = 0; i < compLen; ++i) { seq[i] = nucTab[rseq[i]]; }
+                }
+                auto& txp = transcripts_.back();
+                txp.Sequence = salmon::stringtools::encodeSequenceInSAM(seq.c_str(), t.RefLength);
+                // Length classes taken from
+                // ======
+                // Roberts, Adam, et al.
+                // "Improving RNA-Seq expression estimates by correcting for fragment bias."
+                // Genome Biol 12.3 (2011): R22.
+                // ======
+                // perhaps, define these in a more data-driven way
+                if (t.RefLength <= 1334) {
+                    txp.lengthClassIndex(0);
+                } else if (t.RefLength <= 2104) {
+                    txp.lengthClassIndex(0);
+                } else if (t.RefLength <= 2988) {
+                    txp.lengthClassIndex(0);
+                } else if (t.RefLength <= 4389) {
+                    txp.lengthClassIndex(0);
+                } else {
+                    txp.lengthClassIndex(0);
+                }
+                /*
+                std::cerr << "TS = " << t.RefName << " : \n";
+                std::cerr << seq << "\n VS \n";
+                for (size_t i = 0; i < t.RefLength; ++i) {
+                    std::cerr << transcripts_.back().charBaseAt(i);
+                }
+                std::cerr << "\n\n";
+                */
+                free(rseq);
+                /* end BWA code */
+            }
+            // Since we have the de-coded reference sequences, we no longer need
+            // the encoded sequences, so free them.
+            free(idx_->pac); idx_->pac = nullptr;
+            transcripts_tmp.clear();
+            // ====== Done loading the transcripts from file
+
+            // Create the cluster forest for this set of transcripts
+            clusters_.reset(new ClusterForest(transcripts_.size(), transcripts_));
+        }
+
+    EquivalenceClassBuilder& equivalenceClassBuilder() {
+        return eqBuilder_;
+    }
+
+    std::vector<Transcript>& transcripts() { return transcripts_; }
+
+    uint64_t numAssignedFragments() { return numAssignedFragments_; }
+    uint64_t numMappedReads() { return numAssignedFragments_; }
+
+    uint64_t upperBoundHits() { return upperBoundHits_; }
+    void setUpperBoundHits(uint64_t ubh) { upperBoundHits_ = ubh; }
+
+    std::atomic<uint64_t>& numAssignedFragmentsAtomic() { return numAssignedFragments_; }
+
+    void setNumObservedFragments(uint64_t numObserved) { numObservedFragments_ = numObserved; }
+
+    double mappingRate() {
+        if (quantificationPasses_ > 0) {
+            return static_cast<double>(numAssignedFragsInFirstPass_) / numObservedFragsInFirstPass_;
+        } else {
+            return static_cast<double>(numAssignedFragments_) / numObservedFragments_;
+        }
+    }
+
+    template <typename CallbackT>
+    bool processReads(const uint32_t& numThreads, CallbackT& processReadLibrary) {
+        bool burnedIn = (totalAssignedFragments_ + numAssignedFragments_ > 5000000);
+        for (auto& rl : readLibraries_) {
+            processReadLibrary(rl, idx_, transcripts_, clusterForest(),
+                               *(fragLengthDist_.get()), numAssignedFragments_,
+                               numThreads, burnedIn);
+        }
+        return true;
+    }
+
+    ~ReadExperiment() {
+        // ---- Get rid of things we no longer need --------
+        bwa_idx_destroy(idx_);
+    }
+
+    ClusterForest& clusterForest() { return *clusters_.get(); }
+
+    std::string readFilesAsString() {
+        std::stringstream sstr;
+        size_t ln{0};
+        size_t numReadLibraries{readLibraries_.size()};
+
+        for (auto &rl : readLibraries_) {
+            sstr << rl.readFilesAsString();
+            if (ln++ < numReadLibraries) { sstr << "; "; }
+        }
+        return sstr.str();
+    }
+
+    uint64_t numAssignedFragsInFirstPass() {
+        return numAssignedFragsInFirstPass_;
+    }
+
+    uint64_t numObservedFragsInFirstPass() {
+        return numObservedFragsInFirstPass_;
+    }
+
+    double effectiveMappingRate() {
+        return effectiveMappingRate_;
+    }
+
+    void setEffetiveMappingRate(double emr) {
+        effectiveMappingRate_ = emr;
+    }
+
+    std::vector<FragmentStartPositionDistribution>& fragmentStartPositionDistributions() {
+        return fragStartDists_;
+    }
+
+    SequenceBiasModel& sequenceBiasModel() {
+        return seqBiasModel_;
+    }
+
+    bool softReset() {
+        if (quantificationPasses_ == 0) {
+            numAssignedFragsInFirstPass_ = numAssignedFragments_;
+            numObservedFragsInFirstPass_ = numObservedFragments_;
+        }
+        numObservedFragments_ = 0;
+        totalAssignedFragments_ += numAssignedFragments_;
+        numAssignedFragments_ = 0;
+        quantificationPasses_++;
+        return true;
+    }
+
+    bool reset() {
+        namespace bfs = boost::filesystem;
+        for (auto& rl : readLibraries_) {
+            if (!rl.isRegularFile()) { return false; }
+        }
+
+        if (quantificationPasses_ == 0) {
+            numAssignedFragsInFirstPass_ = numAssignedFragments_;
+            numObservedFragsInFirstPass_ = numObservedFragments_;
+        }
+
+        numObservedFragments_ = 0;
+        totalAssignedFragments_ += numAssignedFragments_;
+        numAssignedFragments_ = 0;
+        quantificationPasses_++;
+        return true;
+    }
+
+    void summarizeLibraryTypeCounts(boost::filesystem::path& opath){
+        LibraryFormat fmt1(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::U);
+        LibraryFormat fmt2(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::U);
+
+        std::ofstream ofile(opath.string());
+
+        fmt::MemoryWriter errstr;
+
+        auto log = spdlog::get("jointLog");
+
+        uint64_t numFmt1{0};
+        uint64_t numFmt2{0};
+        uint64_t numAgree{0};
+        uint64_t numDisagree{0};
+
+        for (auto& rl : readLibraries_) {
+            auto fmt = rl.format();
+            auto& counts = rl.libTypeCounts();
+
+            // If the format is un-stranded, check that
+            // we have a similar number of mappings in both
+            // directions and then aggregate the forward and
+            // reverse counts.
+            if (fmt.strandedness == ReadStrandedness::U) {
+                std::vector<ReadStrandedness> strands;
+                switch (fmt.orientation) {
+                    case ReadOrientation::SAME:
+                    case ReadOrientation::NONE:
+                        strands.push_back(ReadStrandedness::S);
+                        strands.push_back(ReadStrandedness::A);
+                        break;
+                    case ReadOrientation::AWAY:
+                    case ReadOrientation::TOWARD:
+                        strands.push_back(ReadStrandedness::AS);
+                        strands.push_back(ReadStrandedness::SA);
+                        break;
+                }
+
+                fmt1.type = fmt.type; fmt1.orientation = fmt.orientation;
+                fmt1.strandedness = strands[0];
+                fmt2.type = fmt.type; fmt2.orientation = fmt.orientation;
+                fmt2.strandedness = strands[1];
+
+                numFmt1 = 0;
+                numFmt2 = 0;
+                numAgree = 0;
+                numDisagree = 0;
+
+                for (size_t i = 0; i < counts.size(); ++i) {
+                    if (i == fmt1.formatID()) {
+                        numFmt1 = counts[i];
+                    } else if (i == fmt2.formatID()) {
+                        numFmt2 = counts[i];
+                    } else {
+                        numDisagree += counts[i];
+                    }
+                }
+                numAgree = numFmt1 + numFmt2;
+                double ratio = static_cast<double>(numFmt1) / (numFmt1 + numFmt2);
+
+                if ( std::abs(ratio - 0.5) > 0.01) {
+                    errstr << "NOTE: Read Lib [" << rl.readFilesAsString() << "] :\n";
+                    errstr << "\nDetected a strand bias > 1\% in an unstranded protocol "
+                           << "check the file: " << opath.string() << " for details\n";
+
+                    log->warn() << errstr.str();
+                    errstr.clear();
+                }
+
+                ofile << "========\n"
+                      << "Read library consisting of files: "
+                      << rl.readFilesAsString()
+                      << "\n\n"
+                      << "Expected format: " << rl.format()
+                      << "\n\n"
+                      << "# of consistent alignments: " << numAgree << "\n"
+                      << "# of inconsistent alignments: " << numDisagree << "\n"
+                      << "strand bias = " << ratio << " (0.5 is unbiased)\n"
+                      << "# alignments with format " << fmt1 << ": " << numFmt1 << "\n"
+                      << "# alignments with format " << fmt2 << ": " << numFmt2 << "\n"
+                      << "\n========\n";
+            } else {
+                numAgree = 0;
+                numDisagree = 0;
+
+                for (size_t i = 0; i < counts.size(); ++i) {
+                    if (i == fmt.formatID()) {
+                        numAgree = counts[i];
+                    } else {
+                        numDisagree += counts[i];
+                    }
+                } // end for
+
+                ofile << "========\n"
+                      << "Read library consisting of files: "
+                      << rl.readFilesAsString()
+                      << "\n\n"
+                      << "Expected format: " << rl.format()
+                      << "\n\n"
+                      << "# of consistent alignments: " << numAgree << "\n"
+                      << "# of inconsistent alignments: " << numDisagree << "\n"
+                      << "\n========\n";
+
+            } //end else
+
+            double disagreeRatio = static_cast<double>(numDisagree) / (numAgree + numDisagree);
+            if (disagreeRatio > 0.05) {
+                errstr << "NOTE: Read Lib [" << rl.readFilesAsString() << "] :\n";
+                errstr << "\nGreater than 5\% of the alignments (but not, necessarily reads) "
+                       << "disagreed with the provided library type; "
+                       << "check the file: " << opath.string() << " for details\n";
+
+                log->warn() << errstr.str();
+                errstr.clear();
+            }
+
+            ofile << "---- counts for each format type ---\n";
+            for (size_t i = 0; i < counts.size(); ++i) {
+                ofile << LibraryFormat::formatFromID(i) << " : " << counts[i] << "\n";
+            }
+            ofile << "------------------------------------\n\n";
+        }
+        ofile.close();
+    }
+
+    std::vector<ReadLibrary>& readLibraries() { return readLibraries_; }
+    FragmentLengthDistribution* fragmentLengthDistribution() { return fragLengthDist_.get(); }
+
+    private:
+    /**
+     * The file from which the alignments will be read.
+     * This can be a SAM or BAM file, and can be a regular
+     * file or a fifo.
+     */
+    std::vector<ReadLibrary> readLibraries_;
+    /**
+     * The file from which the transcripts are read.
+     * This is expected to be a FASTA format file.
+     */
+    //boost::filesystem::path transcriptFile_;
+    /**
+     * The targets (transcripts) to be quantified.
+     */
+    std::vector<Transcript> transcripts_;
+    /**
+     * The index we've built on the set of transcripts.
+     */
+    bwaidx_t *idx_{nullptr};
+    /**
+     * The cluster forest maintains the dynamic relationship
+     * defined by transcripts and reads --- if two transcripts
+     * share an ambiguously mapped read, then they are placed
+     * in the same cluster.
+     */
+    std::unique_ptr<ClusterForest> clusters_;
+    /**
+      *
+      *
+      */
+    std::vector<FragmentStartPositionDistribution> fragStartDists_;
+
+    SequenceBiasModel seqBiasModel_;
+
+    /** Keeps track of the number of passes that have been
+     *  made through the alignment file.
+     */
+    std::atomic<uint64_t> numObservedFragments_{0};
+    std::atomic<uint64_t> numAssignedFragments_{0};
+    uint64_t totalAssignedFragments_{0};
+    size_t quantificationPasses_{0};
+    uint64_t numAssignedFragsInFirstPass_{0};
+    uint64_t numObservedFragsInFirstPass_{0};
+    uint64_t upperBoundHits_{0};
+    double effectiveMappingRate_{0.0};
+    std::unique_ptr<FragmentLengthDistribution> fragLengthDist_;
+    EquivalenceClassBuilder eqBuilder_;
+};
+
+#endif // EXPERIMENT_HPP
diff --git a/include/ReadLibrary.hpp b/include/ReadLibrary.hpp
new file mode 100644
index 0000000..b3a0070
--- /dev/null
+++ b/include/ReadLibrary.hpp
@@ -0,0 +1,238 @@
+#ifndef READ_LIBRARY_HPP
+#define READ_LIBRARY_HPP
+
+#include <vector>
+#include <exception>
+#include <set>
+
+#include <boost/filesystem.hpp>
+
+#include "LibraryFormat.hpp"
+
+/**
+ * This class represents the basic information about a library of reads, like
+ * its paired-end status, the reads that should appear on the forward and reverse strand,
+ * and the relative orientation of the reads.
+ */
+class ReadLibrary {
+public:
+    /**
+     * Construct a new ReadLibrary of the given format
+     */
+    ReadLibrary(LibraryFormat& fmt) : fmt_(fmt),
+        libTypeCounts_(std::vector<std::atomic<uint64_t>>(LibraryFormat::maxLibTypeID() + 1)){}
+
+    /**
+     * Copy constructor
+     */
+    ReadLibrary(const ReadLibrary& rl) :
+        fmt_(rl.fmt_),
+        unmatedFilenames_(rl.unmatedFilenames_),
+        mateOneFilenames_(rl.mateOneFilenames_),
+        mateTwoFilenames_(rl.mateTwoFilenames_),
+        libTypeCounts_(std::vector<std::atomic<uint64_t>>(LibraryFormat::maxLibTypeID() + 1)) {
+            auto mc = LibraryFormat::maxLibTypeID() + 1;
+            for (size_t i = 0; i < mc; ++i) { libTypeCounts_[i].store(rl.libTypeCounts_[i].load()); }
+        }
+
+    /**
+     * Move constructor
+     */
+    ReadLibrary(ReadLibrary&& rl) :
+        fmt_(rl.fmt_),
+        unmatedFilenames_(std::move(rl.unmatedFilenames_)),
+        mateOneFilenames_(std::move(rl.mateOneFilenames_)),
+        mateTwoFilenames_(std::move(rl.mateTwoFilenames_)),
+        libTypeCounts_(std::vector<std::atomic<uint64_t>>(LibraryFormat::maxLibTypeID() + 1)) {
+            auto mc = LibraryFormat::maxLibTypeID() + 1;
+            for (size_t i = 0; i < mc; ++i) { libTypeCounts_[i].store(rl.libTypeCounts_[i].load()); }
+        }
+
+    /**
+     * Add files containing mated reads (from pair 1 of the mates) to this library.
+     */
+    void addMates1(const std::vector<std::string>& mateOneFilenames) {
+        mateOneFilenames_ = mateOneFilenames;
+    }
+
+    /**
+     * Add files containing mated reads (from pair 2 of the mates) to this library.
+     */
+    void addMates2(const std::vector<std::string>& mateTwoFilenames) {
+        mateTwoFilenames_ = mateTwoFilenames;
+    }
+
+    /**
+     * Add files containing unmated reads.
+     */
+    void addUnmated(const std::vector<std::string>& unmatedFilenames) {
+        unmatedFilenames_ = unmatedFilenames;
+    }
+
+    /**
+     * Return true if this read library is for paired-end reads and false otherwise.
+     */
+    bool isPairedEnd() {
+        return (fmt_.type == ReadType::PAIRED_END);
+    }
+
+
+    bool checkFileExtensions_(std::vector<std::string>& filenames, std::stringstream& errorStream) {
+        namespace bfs = boost::filesystem;
+
+        std::set<std::string> acceptableExensions = {".FASTA", ".FASTQ", ".FA", ".FQ",
+                                                     ".fasta", ".fastq", ".fa", ".fq"};
+        bool extensionsOK{true};
+        for (auto& fn : filenames) {
+            auto fpath = bfs::path(fn);
+            auto ext = fpath.extension().string();
+            if (bfs::is_regular_file(fpath)) {
+                if (acceptableExensions.find(ext) == acceptableExensions.end()) {
+                    errorStream << "ERROR: file [" << fn << "] has extension " << ext << ", "
+                        << "which suggests it is neither a fasta nor a fastq file.\n"
+                        << "Is this a compressed file?  If so, consider replacing: \n\n"
+                        << fn << "\n\nwith\n\n"
+                        << "<(decompressor " << fn << ")\n\n"
+                        << "which will decompress the reads \"on-the-fly\"\n\n";
+                    extensionsOK = false;
+                } else if (bfs::is_empty(fpath)) {
+                    errorStream << "ERROR: file [" << fn << "] appears to be empty "
+                        "(i.e. it has size 0).  This is likely an error. "
+                        "Please re-run salmon with a corrected input file.\n\n";
+                        extensionsOK = false;
+                }
+            }
+        }
+        return extensionsOK;
+    }
+
+    bool isRegularFile() {
+        if (isPairedEnd()) {
+            for (auto& m1 : mateOneFilenames_) {
+                if (!boost::filesystem::is_regular_file(m1)) { return false; }
+            }
+            for (auto& m2: mateTwoFilenames_) {
+                if (!boost::filesystem::is_regular_file(m2)) { return false; }
+            }
+        } else {
+            for (auto& um : unmatedFilenames_) {
+                if (!boost::filesystem::is_regular_file(um)) { return false; }
+            }
+        }
+        return true;
+    }
+
+    std::string readFilesAsString() {
+        std::stringstream sstr;
+        if (isPairedEnd()) {
+            size_t n1 = mateOneFilenames_.size();
+            size_t n2 = mateTwoFilenames_.size();
+            if (n1 == 0 or n2 == 0 or n1 != n2) {
+                sstr << "LIBRARY INVALID --- You must provide #1 and #2 mated read files with a paired-end library type";
+            } else {
+                for (size_t i = 0; i < n1; ++i) {
+                    sstr << "( " << mateOneFilenames_[i] << ", " <<
+                            mateTwoFilenames_[i] << " )";
+                    if (i != n1 - 1) { sstr << ", "; }
+                }
+            }
+        } else { // single end
+            size_t n = unmatedFilenames_.size();
+            if (n == 0) {
+                sstr << "LIBRARY INVALID --- You must provide unmated read files with a single-end library type";
+            } else {
+                for (size_t i = 0; i < n; ++i) {
+                    sstr << unmatedFilenames_[i];
+                    if (i != n - 1) { sstr << ", "; }
+                }
+            }
+        } // end else
+        return sstr.str();
+    }
+
+    /**
+     * Checks if this read library is valid --- if it's paired-end, it should have mate1/2 reads and the same
+     * number of files for each; if it's unpaired it should have only unpaired files.
+     * NOTE: This function throws an exception if this is not a valid read library!
+     */
+    void checkValid() {
+
+        bool readsOK{true};
+
+        std::stringstream errorStream;
+        errorStream << "\nThe following errors were detected with the read files\n";
+        errorStream << "======================================================\n";
+
+        if (isPairedEnd()) {
+            size_t n1 = mateOneFilenames_.size();
+            size_t n2 = mateTwoFilenames_.size();
+            if (n1 == 0 or n2 == 0 or n1 != n2) {
+                errorStream << "You must provide #1 and #2 mated read files with a paired-end library type\n";
+                readsOK = false;
+            }
+        } else {
+            size_t n = unmatedFilenames_.size();
+            if (n == 0) {
+                errorStream << "You must provide unmated read files with a single-end library type\n";
+                readsOK = false;
+            }
+
+        }
+
+        // Check if the user tried to pass in non-fast{a,q} files.  If so,
+        // throw an exception with the appropriate error messages.
+        // NOTE: This check currently does nothing useful for non-regular files
+        // (i.e. named-pipes).  If the user passed in a non-regular file, we should
+        // have some other mechanism to check if it's of an expected format and provide
+        // a reasonable error message otherwise.
+        readsOK = readsOK && checkFileExtensions_(mateOneFilenames_, errorStream);
+        readsOK = readsOK && checkFileExtensions_(mateTwoFilenames_, errorStream);
+        readsOK = readsOK && checkFileExtensions_(unmatedFilenames_, errorStream);
+
+        if (!readsOK) {
+            throw std::invalid_argument(errorStream.str());
+        }
+    }
+
+    /**
+     * Return the vector of files containing the mate1 reads for this library.
+     */
+    const std::vector<std::string>& mates1() const { return mateOneFilenames_; }
+
+    /**
+     * Return the vector of files containing the mate2 reads for this library.
+     */
+    const std::vector<std::string>& mates2() const { return mateTwoFilenames_; }
+
+    /**
+     * Return the vector of files containing the unmated reads for the library.
+     */
+    const std::vector<std::string>& unmated() const { return unmatedFilenames_; }
+
+    /**
+     * Return the LibraryFormat object describing the format of this read library.
+     */
+    const LibraryFormat& format() const { return fmt_; }
+
+    /**
+    * Update the library type counts for this read library given the counts
+    * in the vector `counts` which has been passed in.
+    */
+    inline void updateLibTypeCounts(const std::vector<uint64_t>& counts) {
+        size_t lc{counts.size()};
+        for (size_t i = 0; i < lc; ++i) { libTypeCounts_[i] += counts[i]; }
+    }
+
+    std::vector<std::atomic<uint64_t>>& libTypeCounts() {
+        return libTypeCounts_;
+    }
+
+private:
+    LibraryFormat fmt_;
+    std::vector<std::string> unmatedFilenames_;
+    std::vector<std::string> mateOneFilenames_;
+    std::vector<std::string> mateTwoFilenames_;
+    std::vector<std::atomic<uint64_t>> libTypeCounts_;
+};
+
+#endif // READ_LIBRARY_HPP
diff --git a/include/ReadPair.hpp b/include/ReadPair.hpp
new file mode 100644
index 0000000..cf68db9
--- /dev/null
+++ b/include/ReadPair.hpp
@@ -0,0 +1,137 @@
+#ifndef READ_PAIR
+#define READ_PAIR
+
+#include "StadenUtils.hpp"
+#include "SalmonMath.hpp"
+#include "LibraryFormat.hpp"
+#include "SalmonUtils.hpp"
+
+#include "format.h"
+
+struct ReadPair {
+    bam_seq_t* read1 = nullptr;
+    bam_seq_t* read2 = nullptr;
+    salmon::utils::OrphanStatus orphanStatus;
+    double logProb;
+    LibraryFormat libFmt{ReadType::PAIRED_END, ReadOrientation::NONE, ReadStrandedness::U};
+
+    ReadPair():
+        read1(staden::utils::bam_init()),
+        read2(staden::utils::bam_init()),
+        orphanStatus(salmon::utils::OrphanStatus::Paired),
+        logProb(salmon::math::LOG_0) {}
+
+    ReadPair(bam_seq_t* r1, bam_seq_t* r2, salmon::utils::OrphanStatus os, double lp,
+             LibraryFormat lf) :
+        read1(r1), read2(r2), orphanStatus(os), logProb(lp),
+        libFmt(lf){}
+
+    ReadPair(ReadPair&& other) {
+        orphanStatus = other.orphanStatus;
+        logProb = other.logProb;
+        std::swap(read1, other.read1);
+        std::swap(read2, other.read2);
+        libFmt = other.libFmt;
+    }
+
+    ReadPair& operator=(ReadPair&& other) {
+        orphanStatus = other.orphanStatus;
+        logProb = other.logProb;
+        std::swap(read1, other.read1);
+        std::swap(read2, other.read2);
+        libFmt = other.libFmt;
+        return *this;
+    }
+
+
+   ReadPair(ReadPair& other) = default;
+
+   ReadPair& operator=(ReadPair& other) = default;
+
+   ReadPair* clone() {
+       return new ReadPair(bam_dup(read1), bam_dup(read2), orphanStatus, logProb, libFmt);
+   }
+
+    ~ReadPair() {
+        staden::utils::bam_destroy(read1);
+        staden::utils::bam_destroy(read2);
+    }
+
+    inline LibraryFormat& libFormat() { return libFmt; }
+    inline bool isPaired() const { return (orphanStatus == salmon::utils::OrphanStatus::Paired); }
+    inline bool isLeftOrphan() const { return (orphanStatus == salmon::utils::OrphanStatus::LeftOrphan); }
+    inline bool isRightOrphan() const { return (orphanStatus == salmon::utils::OrphanStatus::RightOrphan); }
+
+    /**
+      * returns 0 on success, -1 on failure.
+      */
+    int writeToFile(scram_fd* fp) {
+        int r1 = scram_put_seq(fp, read1);
+        if (r1 == 0 and isPaired()) {
+            return scram_put_seq(fp, read2);
+        } else {
+            return r1;
+        }
+    }
+
+    inline char* getName() const {
+        return bam_name(read1);
+    }
+
+    inline uint32_t getNameLength() {
+        uint32_t l = bam_name_len(read1);
+        char* r = getName();
+        if ( l > 2  and r[l-2] == '/') {
+            return l-2;
+        }
+        return l;//bam_name_len(read1);
+    }
+
+    inline uint32_t fragLen() const {
+        if (!isPaired()) { return 0; }
+        auto leftmost1 = bam_pos(read1);
+        auto leftmost2 = bam_pos(read2);
+
+        // The length of the mapped read that is "rightmost" w.r.t. the forward strand.
+        auto rightmostLen = (leftmost1 < leftmost2) ? bam_seq_len(read2) : bam_seq_len(read1);
+        return std::abs(leftmost1 - leftmost2) + rightmostLen;
+
+        //return std::abs(read1->core.isize) + std::abs(read1->core.l_qseq) + std::abs(read2->core.l_qseq);
+    }
+
+     inline bool isRight() const { return isPaired() ? false : (isRightOrphan() ? true : false) ; }
+     inline bool isLeft() const { return isPaired() ? false : (isLeftOrphan() ? true : false); }
+
+     inline int32_t left() const {
+         if (isPaired()) {
+             return std::min(bam_pos(read1), bam_pos(read2));
+         } else {
+             return bam_pos(read1);
+         }
+     }
+
+    inline int32_t right() const {
+        if (isPaired()) {
+            return std::max(bam_pos(read1) + bam_seq_len(read1),
+                    bam_pos(read2) + bam_seq_len(read2));
+        } else {
+            return bam_pos(read1) + bam_seq_len(read1);
+        }
+    }
+
+    inline ReadType fragType() const { return ReadType::PAIRED_END; }
+    inline int32_t transcriptID() const { return bam_ref(read1); }
+
+    inline double logQualProb() {
+        return salmon::math::LOG_1;
+        /*
+        double q1 = bam_map_qual(read1);
+        double q2 = bam_map_qual(read2);
+        double logP1 = (q1 == 255) ? salmon::math::LOG_1 : std::log(std::pow(10.0, -q1 * 0.1));
+        double logP2 = (q2 == 255) ? salmon::math::LOG_1 : std::log(std::pow(10.0, -q2 * 0.1));
+        return logP1 + logP2;
+        */
+    }
+};
+
+#endif //READ_PAIR
diff --git a/include/ReadProducer.hpp b/include/ReadProducer.hpp
new file mode 100644
index 0000000..2e13120
--- /dev/null
+++ b/include/ReadProducer.hpp
@@ -0,0 +1,92 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Sailfish.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#ifndef __READPRODUCER_HPP__
+#define __READPRODUCER_HPP__
+
+#include "jellyfish/parse_dna.hpp"
+#include "jellyfish/mapped_file.hpp"
+#include "jellyfish/parse_read.hpp"
+#include "jellyfish/sequence_parser.hpp"
+#include "jellyfish/dna_codes.hpp"
+#include "jellyfish/compacted_hash.hpp"
+#include "jellyfish/mer_counting.hpp"
+#include "jellyfish/misc.hpp"
+#include "StreamingSequenceParser.hpp"
+
+template <typename Parser>
+class ReadProducer {
+};
+
+template <>
+class ReadProducer<jellyfish::parse_read> {
+public:
+  ReadProducer(jellyfish::parse_read& parser) : s_(new ReadSeq), stream_(parser.new_thread()) {}
+  ~ReadProducer() { delete s_; }
+
+  bool nextRead(ReadSeq*& s) {
+    if ((read_ = stream_.next_read())) {
+        // we iterate over the entire read
+        const char*         start = read_->seq_s;
+        const char*  const  end   = read_->seq_e;
+        uint32_t readLen = std::distance(start, end);
+
+        s_->seq = const_cast<char*>(read_->seq_s);
+        s_->len = readLen;
+        s_->name = const_cast<char*>(read_->header);
+        s_->nlen = read_->hlen;
+        s = s_;
+        return true;
+    } else {
+      return false;
+    }
+  }
+
+  void finishedWithRead(ReadSeq*& s) { s = nullptr; }
+
+private:
+  ReadSeq* s_;
+  jellyfish::parse_read::read_t* read_;//{parser.new_thread()};
+  jellyfish::parse_read::thread stream_;//{parser.new_thread()};
+};
+
+template <>
+class ReadProducer<StreamingReadParser> {
+public:
+  ReadProducer(StreamingReadParser& parser) : parser_(parser) {}
+
+  bool nextRead(ReadSeq*& s) {
+    if (parser_.nextRead(s)) {
+      return true;
+    } else {
+      s = nullptr;
+      return false;
+    }
+  }
+
+  void finishedWithRead(ReadSeq*& s) { parser_.finishedWithRead(s); }
+
+private:
+  StreamingReadParser& parser_;
+};
+
+#endif // __READPRODUCER_HPP__
diff --git a/include/SailfishUtils.hpp b/include/SailfishUtils.hpp
new file mode 100644
index 0000000..749d73a
--- /dev/null
+++ b/include/SailfishUtils.hpp
@@ -0,0 +1,153 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Sailfish.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#ifndef SAILFISH_UTILS_HPP
+#define SAILFISH_UTILS_HPP
+
+#include <future>
+#include <algorithm>
+#include <iostream>
+#include <tuple>
+#include <unordered_set>
+#include <unordered_map>
+#include <vector>
+#include <boost/filesystem.hpp>
+#include <boost/program_options.hpp>
+
+#include "LibraryFormat.hpp"
+#include "ReadLibrary.hpp"
+#include "TranscriptGeneMap.hpp"
+#include "GenomicFeature.hpp"
+
+namespace sailfish{
+namespace utils {
+using std::string;
+using NameVector = std::vector<string>;
+using IndexVector = std::vector<size_t>;
+using KmerVector = std::vector<uint64_t>;
+
+// Returns a uint64_t where the upper 32-bits
+// contain tid and the lower 32-bits contain offset
+uint64_t encode(uint64_t tid, uint64_t offset);
+
+// Given a uin64_t generated by encode(), return the
+// transcript id --- upper 32-bits
+uint32_t transcript(uint64_t enc);
+
+// Given a uin64_t generated by encode(), return the
+// offset --- lower 32-bits
+uint32_t offset(uint64_t enc);
+
+
+LibraryFormat parseLibraryFormatStringNew(std::string& fmt);
+
+std::vector<ReadLibrary> extractReadLibraries(boost::program_options::parsed_options& orderedOptions);
+
+LibraryFormat parseLibraryFormatString(std::string& fmt);
+
+size_t numberOfReadsInFastaFile(const std::string& fname);
+
+bool readKmerOrder( const std::string& fname, std::vector<uint64_t>& kmers );
+
+template <template<typename> class S, typename T>
+bool overlap( const S<T> &a, const S<T> &b );
+
+template< typename T >
+TranscriptGeneMap transcriptToGeneMapFromFeatures( std::vector<GenomicFeature<T>> &feats ) {
+    using std::unordered_set;
+    using std::unordered_map;
+    using std::vector;
+    using std::tuple;
+    using std::string;
+    using std::get;
+
+    using NameID = tuple<string, size_t>;
+
+    IndexVector t2g;
+    NameVector transcriptNames;
+    NameVector geneNames;
+
+    // holds the mapping from transcript ID to gene ID
+    IndexVector t2gUnordered;
+    // holds the set of gene IDs
+    unordered_map<string, size_t> geneNameToID;
+
+    // To read the input and assign ids
+    size_t geneCounter = 0;
+    string transcript;
+    string gene;
+
+    std::sort( feats.begin(), feats.end(),
+    []( const GenomicFeature<T> & a, const GenomicFeature<T> & b) -> bool {
+        return a.sattr.transcript_id < b.sattr.transcript_id;
+    } );
+
+    std::string currentTranscript = "";
+    for ( auto & feat : feats ) {
+
+        auto &gene = feat.sattr.gene_id;
+        auto &transcript = feat.sattr.transcript_id;
+
+        if ( transcript != currentTranscript ) {
+            auto geneIt = geneNameToID.find(gene);
+            size_t geneID = 0;
+
+            if ( geneIt == geneNameToID.end() ) {
+                // If we haven't seen this gene yet, give it a new ID
+                geneNameToID[gene] = geneCounter;
+                geneID = geneCounter;
+                geneNames.push_back(gene);
+                ++geneCounter;
+            } else {
+                // Otherwise lookup the ID
+                geneID = geneIt->second;
+            }
+
+            transcriptNames.push_back(transcript);
+            t2g.push_back(geneID);
+
+            //++transcriptID;
+            currentTranscript = transcript;
+        }
+
+    }
+
+    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
+}
+
+TranscriptGeneMap transcriptGeneMapFromGTF(const std::string& fname, std::string key="gene_id");
+
+TranscriptGeneMap readTranscriptToGeneMap( std::ifstream &ifile );
+
+TranscriptGeneMap transcriptToGeneMapFromFasta( const std::string& transcriptsFile );
+
+void aggregateEstimatesToGeneLevel(TranscriptGeneMap& tgm, boost::filesystem::path& inputPath);
+
+// NOTE: Throws an invalid_argument exception of the quant or quant_bias_corrected files do
+// not exist!
+void generateGeneLevelEstimates(boost::filesystem::path& geneMapPath,
+                                boost::filesystem::path& estDir,
+                                bool haveBiasCorrectedFile = false);
+
+}
+}
+#endif // UTILS_HPP
diff --git a/include/SalmonConfig.hpp b/include/SalmonConfig.hpp
new file mode 100644
index 0000000..6fb94fc
--- /dev/null
+++ b/include/SalmonConfig.hpp
@@ -0,0 +1,35 @@
+/**
+>HEADER
+    Copyright (c) 2014, 2015 Rob Patro robp at cs.stonybrook.edu
+
+    This file is part of Salmon.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#ifndef SALMON_CONFIG_HPP
+#define SALMON_CONFIG_HPP
+
+#include <string>
+
+namespace salmon {
+	constexpr char majorVersion[] = "0";
+	constexpr char minorVersion[] = "4";
+	constexpr char patchVersion[] = "2";
+	constexpr char version[] = "0.4.2";
+}
+
+#endif // SALMON_CONFIG_HPP
diff --git a/include/SalmonMath.hpp b/include/SalmonMath.hpp
new file mode 100644
index 0000000..e439a69
--- /dev/null
+++ b/include/SalmonMath.hpp
@@ -0,0 +1,48 @@
+#ifndef SALMON_MATH_HPP
+#define SALMON_MATH_HPP
+
+#include <cmath>
+#include <cassert>
+
+namespace salmon {
+
+    namespace math {
+        constexpr double LOG_0 = HUGE_VAL;
+        constexpr double LOG_1 = 0;
+        constexpr double LOG_ONEHALF = -0.69314718055994530941;
+        constexpr double LOG_ORPHAN_PROB = -2.30258509299404568401;
+        constexpr double EPSILON = 0.375e-10;
+        const double LOG_EPSILON = log(EPSILON);
+
+        // Taken from https://github.com/adarob/eXpress/blob/master/src/main.h
+        inline bool approxEqual(double a, double b, double eps=EPSILON) {
+            return std::abs(a-b) <= eps;
+        }
+
+        // Taken from https://github.com/adarob/eXpress/blob/master/src/main.h
+        inline double logAdd(double x, double y) {
+            if (std::abs(x) == LOG_0) { return y; }
+            if (std::abs(y) == LOG_0) { return x; }
+            if (y > x) { std::swap(x,y); }
+            double sum = x + std::log(1 + std::exp(y-x));
+            return sum;
+        }
+
+        // Taken from https://github.com/adarob/eXpress/blob/master/src/main.h
+        inline double logSub(double x, double y) {
+            if (std::abs(y) == LOG_0) { return x; }
+            if (x <= y) {
+                assert(std::fabs(x-y) < 1e-5);
+                return LOG_0;
+            }
+            double diff = x + std::log(1-std::exp(y-x));
+            return diff;
+        }
+
+
+    }
+
+}
+
+
+#endif //SALMON_MATH_HPP
diff --git a/include/SalmonOpts.hpp b/include/SalmonOpts.hpp
new file mode 100644
index 0000000..fcf0a00
--- /dev/null
+++ b/include/SalmonOpts.hpp
@@ -0,0 +1,110 @@
+#ifndef SALMON_OPTS_HPP
+#define SALMON_OPTS_HPP
+
+#include <boost/filesystem.hpp>
+
+// Logger includes
+#include "spdlog/spdlog.h"
+
+#include <memory> // for shared_ptr
+
+
+/**
+  * A structure to hold some common options used
+  * by Salmon so that we don't have to pass them
+  * all around as separate arguments.
+  */
+struct SalmonOpts {
+    // The options below are adopted from the mem_opt_t structure of BWA
+    /*
+    int maxOccurences; // maximum number of allowable occurences of (S)MEM
+    int minSeedOccurences; // try to split a seed into smaller seeds if it occurs
+                           // fewer than this many times.
+    int minSeedLen; // A seed must be at least this long.
+    float splitFactor; // Split a seed if it's longer than splitFactor * minSeedLen.
+    int flag; // Used by bwa
+    bool maxMEMIntervals; // If true, don't split (S)MEMs into MEMs
+    */
+
+    SalmonOpts() : splitSpanningSeeds(false), noFragLengthDist(false),
+                   noEffectiveLengthCorrection(false), useReadCompat(false),
+                   maxReadOccs(200), extraSeedPass(false),
+                   mappingCacheMemoryLimit(5000000) {}
+    bool splitSpanningSeeds; // Attempt to split seeds that span multiple transcripts.
+
+    bool noFragLengthDist; // Do not give a fragment assignment a likelihood based on an emperically
+                           // observed fragment length distribution.
+
+    bool noEffectiveLengthCorrection; // Don't take the fragment length distribution into
+                                      // account when computing the probability that a
+                                     // fragment was generated from a transcript.
+
+    bool noFragStartPosDist; // Don't learn a non-uniform start distribution
+
+    bool noSeqBiasModel; // Don't learn and use a sequence-specific bias model.
+
+    bool noRichEqClasses; // Don't use rich equivalence classes --- forget the
+                          // aggregate weights for each transcript to each
+                          // equivalence class of fragments.
+
+    bool noFragLenFactor; // Don't account for the fragment length term in the
+                          // likelihood.
+
+    bool useReadCompat; // Give a fragment assignment a likelihood based on the compatibility
+                        // between the manner in which it mapped and the expected read
+                        // library format.
+
+    double incompatPrior; // The prior probability that an alignment that disagrees with
+                          // the provided library type could correspond to the true
+                          // fragment origin.
+
+    bool useErrorModel; // Learn and apply the error model when computing the likelihood
+                        // of a given alignment.
+
+    uint32_t numErrorBins; // Number of bins into which each read is divided
+                           // when learning and applying the error model.
+
+    double forgettingFactor; // The forgetting factor used to determine the
+                             // learning schedule in the online inference algorithm.
+
+    uint32_t numBurninFrags; // Number of mapped fragments required for burn-in
+
+    uint32_t numPreBurninFrags; // Number of mapped fragments that are evaluated before
+    				// applying the auxiliary models.
+
+    uint32_t maxReadOccs; // Discard reads  mapping to more than this many places.
+
+    uint32_t maxExpectedReadLen; // Maximum expected length of an observed read.
+
+    bool extraSeedPass; // Perform extra pass trying to find seeds to cover the read
+
+    bool disableMappingCache; // Don't write mapping results to temporary mapping cache file
+
+    boost::filesystem::path outputDirectory; // Quant output directory
+
+    boost::filesystem::path indexDirectory; // Index directory
+
+    bool useMassBanking; // Bank unique mass in subsequent epochs of inference
+
+    bool useVBOpt; // Use Variational Bayesian EM instead of "regular" EM in the batch passes
+    bool useGSOpt; // Do Gibbs Sampling after optimization
+
+    uint32_t numGibbsSamples; // Number of rounds of Gibbs sampling to perform 
+
+    // Related to the fragment length distribution
+    size_t fragLenDistMax;
+    size_t fragLenDistPriorMean;
+    size_t fragLenDistPriorSD;
+
+    // Related to the logger
+    std::shared_ptr<spdlog::logger> jointLog{nullptr};
+    std::shared_ptr<spdlog::logger> fileLog{nullptr};
+
+    // Related to caching and threading
+    uint32_t mappingCacheMemoryLimit;
+    uint32_t numThreads;
+    uint32_t numQuantThreads;
+    uint32_t numParseThreads;
+};
+
+#endif // SALMON_OPTS_HPP
diff --git a/include/SalmonStringUtils.hpp b/include/SalmonStringUtils.hpp
new file mode 100644
index 0000000..c75dff6
--- /dev/null
+++ b/include/SalmonStringUtils.hpp
@@ -0,0 +1,137 @@
+#ifndef SALMON_STRING_UTILS
+#define SALMON_STRING_UTILS
+
+#include <cstdint>
+#include <cstddef>
+
+namespace salmon {
+
+    namespace stringtools{
+
+        constexpr double phredToLogProb[] =  { -1000.0, -1.58147375341, -0.996843044008, -0.695524471332, -0.507675873697,
+                                            -0.380130408066, -0.289268187202, -0.222551515973, -0.172556572914,
+                                            -0.134551960289, -0.105360515658, -0.0827653026692, -0.0651741731994,
+                                            -0.051418274158, -0.0406248442216, -0.0321335740231, -0.0254397275342,
+                                            -0.0201543647612, -0.0159758692467, -0.0126691702086, -0.0100503358535,
+                                            -0.00797499827851, -0.00632956293111, -0.00502447389099, -0.00398901726641,
+                                            -0.00316728822616, -0.00251504651118, -0.00199725550255, -0.00158615046428,
+                                            -0.00125971852411, -0.00100050033358, -0.000794643880558, -0.000631156481835,
+                                            -0.000501312869929, -0.000398186436251, -0.00031627777656, -0.000251220196302,
+                                            -0.000199546139504, -0.000158501880005, -0.000125900466311, -0.000100005000333,
+                                            -7.94359784262e-05, -6.30977250677e-05, -5.01199793479e-05, -3.9811509523e-05,
+                                            -3.16232766123e-05, -2.51191797991e-05, -1.99528222059e-05, -1.58490575203e-05,
+                                            -1.25893333633e-05, -1.00000500003e-05, -7.94331389525e-06, -6.30959335027e-06,
+                                            -5.01188489573e-06, -3.98107963e-06, -3.16228266018e-06, -2.51188958631e-06,
+                                            -1.99526430546e-06, -1.58489444841e-06, -1.25892620425e-06, -1.00000050003e-06,
+                                            -7.94328550222e-07, -6.30957543537e-07, -5.01187359198e-07, -3.98107249807e-07,
+                                            -3.16227815973e-07, -2.51188674656e-07, -1.99526251442e-07, -1.58489331784e-07,
+                                            -1.25892549094e-07, -1.00000004947e-07, -7.94328265847e-08, -6.30957364055e-08,
+                                            -5.01187246496e-08, -3.98107178487e-08, -3.16227771195e-08, -2.51188646374e-08,
+                                            -1.99526233083e-08, -1.58489320702e-08, -1.25892541629e-08, -1.00000001002e-08,
+                                            -7.94328239674e-09, -6.3095734392e-09, -5.01187237413e-09, -3.98107170245e-09,
+                                            -3.16227766695e-09, -2.51188647975e-09, -1.99526229071e-09, -1.58489321792e-09,
+                                            -1.25892540916e-09, -9.99999972218e-10, -7.94328270142e-10, -6.3095739764e-10,
+                                            -5.01187202976e-10 };
+        constexpr uint8_t samToTwoBit[] = {0, /*A*/0, /*C*/1, 0, /*G*/2, 0, 0, 0, /*T*/3,
+                                           0, 0, 0, 0, 0, 0, 0};
+
+        constexpr char twoBitToChar[] = {'A','C','G','T'};
+
+        constexpr uint8_t charToSamEncode[] = {
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+            15, 0, 15, 15, 15, 1, 14, 2, 13, 15, 15, 4, 11, 15, 15, 12, 15, 3, 15, 15, 15,
+            15, 5, 6, 8, 15, 7, 9, 15, 10, 15, 15, 15, 15, 15, 15, 15, 1, 15, 2, 15, 15,
+            15, 4, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 8, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15};
+
+
+        /*
+        constexpr uint8_t charToSamEncode[] = {
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 0, 15, 15, 15, 1, 14, 2, 13,
+            15, 15, 4, 11, 15, 15, 12, 15, 3, 15, 15, 15, 15, 5, 6, 8, 15, 7, 9,
+            15, 10, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+            15, 15, 15, 15, 15, 15};
+            */
+
+        uint8_t* encodeSequenceInSAM(const char* src, size_t len);
+
+        /**
+           Incomplete: currently only rev for 'ATCG'
+         */
+        constexpr uint8_t encodedRevComp[] = {
+            15, 8, 4, 15, 2, 15, 15, 15,
+            1, 15, 15, 15, 15, 15, 15, 15};
+
+        constexpr char samCodeToChar[] = {
+            '=', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K',
+            'D', 'B', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
+
+        constexpr char charCanon[] = {
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'A', 'N', 'C', 'N', 'N', 'N', 'G', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'A', 'N', 'C', 'N', 'N', 'N', 'G', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N'};
+
+
+
+        constexpr char charRC[] = {
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'T', 'N', 'G', 'N', 'N', 'N', 'C', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'A', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'T', 'N', 'G', 'N', 'N', 'N', 'C', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'A', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
+            'N'};
+
+        enum class strand {forward, reverse};
+    } // stringutils
+} // salmon
+
+#endif // SALMON_STRING_UTILS
diff --git a/include/SalmonUtils.hpp b/include/SalmonUtils.hpp
new file mode 100644
index 0000000..7a6de01
--- /dev/null
+++ b/include/SalmonUtils.hpp
@@ -0,0 +1,162 @@
+#ifndef __SALMON_UTILS_HPP__
+#define __SALMON_UTILS_HPP__
+
+extern "C" {
+#include "io_lib/scram.h"
+#include "io_lib/os.h"
+#undef min
+#undef max
+}
+
+#include <algorithm>
+#include <iostream>
+#include <tuple>
+#include <unordered_set>
+#include <unordered_map>
+#include <memory>
+#include <vector>
+#include <boost/filesystem.hpp>
+#include <boost/program_options.hpp>
+
+#include "format.h"
+
+#include "SalmonOpts.hpp"
+#include "SalmonMath.hpp"
+
+#include "LibraryFormat.hpp"
+#include "ReadLibrary.hpp"
+#include "TranscriptGeneMap.hpp"
+#include "GenomicFeature.hpp"
+
+class ReadExperiment;
+class LibraryFormat;
+
+namespace salmon{
+namespace utils {
+
+using std::string;
+using NameVector = std::vector<string>;
+using IndexVector = std::vector<size_t>;
+using KmerVector = std::vector<uint64_t>;
+
+// Returns a uint64_t where the upper 32-bits
+// contain tid and the lower 32-bits contain offset
+uint64_t encode(uint64_t tid, uint64_t offset);
+
+// Given a uin64_t generated by encode(), return the
+// transcript id --- upper 32-bits
+uint32_t transcript(uint64_t enc);
+
+// Given a uin64_t generated by encode(), return the
+// offset --- lower 32-bits
+uint32_t offset(uint64_t enc);
+
+
+LibraryFormat parseLibraryFormatStringNew(std::string& fmt);
+
+std::vector<ReadLibrary> extractReadLibraries(boost::program_options::parsed_options& orderedOptions);
+
+LibraryFormat parseLibraryFormatString(std::string& fmt);
+
+size_t numberOfReadsInFastaFile(const std::string& fname);
+
+bool readKmerOrder( const std::string& fname, std::vector<uint64_t>& kmers );
+
+template <template<typename> class S, typename T>
+bool overlap( const S<T> &a, const S<T> &b );
+
+template< typename T >
+TranscriptGeneMap transcriptToGeneMapFromFeatures( std::vector<GenomicFeature<T>> &feats );
+
+TranscriptGeneMap transcriptGeneMapFromGTF(const std::string& fname, std::string key="gene_id");
+
+TranscriptGeneMap readTranscriptToGeneMap( std::ifstream &ifile );
+
+TranscriptGeneMap transcriptToGeneMapFromFasta( const std::string& transcriptsFile );
+
+/*
+ * Use atomic compare-and-swap to update val to
+ * val + inc (*in log-space*).  Update occurs in a loop in case other
+ * threads update in the meantime.
+ */
+inline void incLoopLog(tbb::atomic<double>& val, double inc) {
+	double oldMass = val.load();
+	double returnedMass = oldMass;
+	double newMass{salmon::math::LOG_0};
+	do {
+	  oldMass = returnedMass;
+ 	  newMass = salmon::math::logAdd(oldMass, inc);
+	  returnedMass = val.compare_and_swap(newMass, oldMass);
+	} while (returnedMass != oldMass);
+}
+
+/*
+ * Use atomic compare-and-swap to update val to
+ * val + inc.  Update occurs in a loop in case other
+ * threads update in the meantime.
+ */
+inline void incLoop(tbb::atomic<double>& val, double inc) {
+        double oldMass = val.load();
+        double returnedMass = oldMass;
+        double newMass{oldMass + inc};
+        do {
+            oldMass = returnedMass;
+            newMass = oldMass + inc;
+            returnedMass = val.compare_and_swap(newMass, oldMass);
+        } while (returnedMass != oldMass);
+}
+
+
+void aggregateEstimatesToGeneLevel(TranscriptGeneMap& tgm, boost::filesystem::path& inputPath);
+
+// NOTE: Throws an invalid_argument exception of the quant or quant_bias_corrected files do
+// not exist!
+void generateGeneLevelEstimates(boost::filesystem::path& geneMapPath,
+                                boost::filesystem::path& estDir,
+                                bool haveBiasCorrectedFile = false);
+
+    enum class OrphanStatus: uint8_t { LeftOrphan = 0, RightOrphan = 1, Paired = 2 };
+
+    bool headersAreConsistent(SAM_hdr* h1, SAM_hdr* h2);
+
+    bool headersAreConsistent(std::vector<SAM_hdr*>&& headers);
+
+    template <typename AlnLibT>
+    void writeAbundances(const SalmonOpts& sopt,
+                         AlnLibT& alnLib,
+                         boost::filesystem::path& fname,
+                         std::string headerComments="");
+
+    template <typename AlnLibT>
+    void writeAbundancesFromCollapsed(const SalmonOpts& sopt,
+                         AlnLibT& alnLib,
+                         boost::filesystem::path& fname,
+                         std::string headerComments="");
+
+    template <typename AlnLibT>
+    void normalizeAlphas(const SalmonOpts& sopt,
+                         AlnLibT& alnLib);
+
+
+    double logAlignFormatProb(const LibraryFormat observed, const LibraryFormat expected, double incompatPrior);
+
+    std::ostream& operator<<(std::ostream& os, OrphanStatus s);
+    /**
+    *  Given the information about the position and strand from which a paired-end
+    *  read originated, return the library format with which it is compatible.
+    */
+    LibraryFormat hitType(int32_t end1Start, bool end1Fwd,
+                          int32_t end2Start, bool end2Fwd);
+    LibraryFormat hitType(int32_t end1Start, bool end1Fwd, uint32_t len1,
+                          int32_t end2Start, bool end2Fwd, uint32_t len2, bool canDovetail);
+    /**
+    *  Given the information about the position and strand from which the
+    *  single-end read originated, return the library format with which it
+    *  is compatible.
+    */
+    LibraryFormat hitType(int32_t readStart, bool isForward);
+
+}
+}
+
+#endif // __SALMON_UTILS_HPP__
diff --git a/include/Sampler.hpp b/include/Sampler.hpp
new file mode 100644
index 0000000..d083f55
--- /dev/null
+++ b/include/Sampler.hpp
@@ -0,0 +1,489 @@
+#ifndef __SAMPLER__HPP__
+#define __SAMPLER__HPP__
+
+extern "C" {
+#include "io_lib/scram.h"
+#include "io_lib/os.h"
+#undef max
+#undef min
+}
+
+// for cpp-format
+#include "format.h"
+
+#include "spdlog/spdlog.h"
+
+#include <tbb/atomic.h>
+#include <iostream>
+#include <fstream>
+#include <atomic>
+#include <vector>
+#include <random>
+#include <memory>
+#include <exception>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <mutex>
+#include <thread>
+#include <condition_variable>
+
+#include <tbb/concurrent_queue.h>
+
+#include <boost/config.hpp>
+#include <boost/timer/timer.hpp>
+#include <boost/filesystem.hpp>
+#include <boost/math/special_functions/digamma.hpp>
+#include <boost/program_options.hpp>
+#include <boost/pending/disjoint_sets.hpp>
+#include <boost/math/distributions/normal.hpp>
+
+#include "ClusterForest.hpp"
+#include "AlignmentLibrary.hpp"
+#include "MiniBatchInfo.hpp"
+#include "BAMQueue.hpp"
+#include "SalmonMath.hpp"
+#include "FASTAParser.hpp"
+#include "LibraryFormat.hpp"
+#include "Transcript.hpp"
+#include "ReadPair.hpp"
+#include "ErrorModel.hpp"
+#include "AlignmentModel.hpp"
+#include "FragmentLengthDistribution.hpp"
+#include "TranscriptCluster.hpp"
+#include "SailfishUtils.hpp"
+#include "SalmonUtils.hpp"
+#include "SalmonConfig.hpp"
+#include "SalmonOpts.hpp"
+#include "OutputUnmappedFilter.hpp"
+
+namespace salmon {
+    namespace sampler {
+
+        namespace bfs = boost::filesystem;
+        using salmon::math::LOG_0;
+        using salmon::math::LOG_1;
+        using salmon::math::logAdd;
+        using salmon::math::logSub;
+
+        template <typename FragT>
+            using AlignmentBatch = std::vector<FragT>;
+
+        template <typename FragT>
+            using MiniBatchQueue = tbb::concurrent_queue<MiniBatchInfo<FragT>*>;
+
+        template <typename FragT>
+            using OutputQueue = tbb::concurrent_bounded_queue<FragT*>;
+
+        template <typename FragT>
+        void sampleMiniBatch(AlignmentLibrary<FragT>& alnLib,
+                    MiniBatchQueue<AlignmentGroup<FragT*>>& workQueue,
+                    std::condition_variable& workAvailable,
+                    std::mutex& cvmutex,
+                    volatile bool& doneParsing,
+                    std::atomic<size_t>& activeBatches,
+                    const SalmonOpts& salmonOpts,
+                    bool& burnedIn,
+                    std::atomic<size_t>& processedReads,
+                    OutputQueue<FragT>& outputQueue) {
+
+                // Seed with a real random value, if available
+                std::random_device rd;
+                auto log = spdlog::get("jointLog");
+
+                // Create a random uniform distribution
+                std::default_random_engine eng(rd());
+                std::uniform_real_distribution<> uni(0.0, 1.0 + std::numeric_limits<double>::min());
+
+                using salmon::math::LOG_0;
+                using salmon::math::logAdd;
+                using salmon::math::logSub;
+
+                bool useFSPD{!salmonOpts.noFragStartPosDist};
+                auto& refs = alnLib.transcripts();
+                auto& clusterForest = alnLib.clusterForest();
+                auto& fragmentQueue = alnLib.fragmentQueue();
+                auto& alignmentGroupQueue = alnLib.alignmentGroupQueue();
+                auto& fragLengthDist = alnLib.fragmentLengthDistribution();
+                auto& alnMod = alnLib.alignmentModel();
+
+                std::vector<FragmentStartPositionDistribution>& fragStartDists =
+                    alnLib.fragmentStartPositionDistributions();
+
+                const auto expectedLibraryFormat = alnLib.format();
+
+                std::chrono::microseconds sleepTime(1);
+                MiniBatchInfo<AlignmentGroup<FragT*>>* miniBatch = nullptr;
+                size_t numTranscripts = refs.size();
+
+                while (!doneParsing or !workQueue.empty()) {
+                    bool foundWork = workQueue.try_pop(miniBatch);
+                    // If work wasn't immediately available, then wait for it.
+                    if (!foundWork) {
+                        std::unique_lock<std::mutex> l(cvmutex);
+                        workAvailable.wait(l, [&miniBatch, &workQueue, &doneParsing]() { return workQueue.try_pop(miniBatch) or doneParsing; });
+                    }
+                    if (miniBatch != nullptr) {
+                       ++activeBatches;
+                        size_t batchReads{0};
+                        std::vector<AlignmentGroup<FragT*>*>& alignmentGroups = *(miniBatch->alignments);
+
+                        using TranscriptID = size_t;
+                        using HitIDVector = std::vector<size_t>;
+                        using HitProbVector = std::vector<double>;
+
+                        std::unordered_map<TranscriptID, std::vector<FragT*>> hitList;
+                        // Each alignment group corresponds to all of the potential
+                        // mapping locations of a multi-mapping read
+                        for (auto alnGroup : alignmentGroups) {
+                            // Score all of these alignments and sample according to
+                            // their probabilities
+                            for (auto a : alnGroup->alignments()) {
+                                auto transcriptID = a->transcriptID();
+                                if (transcriptID < 0 or transcriptID >= refs.size()) {
+                                    log->warn("Invalid Transcript ID: {}\n", transcriptID);
+                                }
+                                hitList[transcriptID].emplace_back(a);
+                            }
+                        }
+
+                        {
+                            // Iterate over each group of alignments (a group consists of all alignments reported
+                            // for a single read).
+                            for (auto alnGroup : alignmentGroups) {
+                                double sumOfAlignProbs{LOG_0};
+                                // update the cluster-level properties
+                                bool transcriptUnique{true};
+                                auto firstTranscriptID = alnGroup->alignments().front()->transcriptID();
+                                for (auto& aln : alnGroup->alignments()) {
+                                    auto transcriptID = aln->transcriptID();
+                                    auto& transcript = refs[transcriptID];
+                                    transcriptUnique = transcriptUnique and (transcriptID == firstTranscriptID);
+
+                                    double refLength = transcript.RefLength > 0 ? transcript.RefLength : 1.0;
+
+                                    double logFragProb = salmon::math::LOG_1;
+
+                                    if (!salmonOpts.noFragLengthDist) {
+                                        if(aln->fragLen() == 0) {
+                                            if (aln->isLeft() and transcript.RefLength - aln->left() < fragLengthDist.maxVal()) {
+                                                logFragProb = fragLengthDist.cmf(transcript.RefLength - aln->left());
+                                            } else if (aln->isRight() and aln->right() < fragLengthDist.maxVal()) {
+                                                logFragProb = fragLengthDist.cmf(aln->right());
+                                            }
+                                        } else {
+                                            logFragProb = fragLengthDist.pmf(static_cast<size_t>(aln->fragLen()));
+                                        }
+                                    }
+
+                                    // The alignment probability is the product of a transcript-level term (based on abundance and) an alignment-level
+                                    // term below which is P(Q_1) * P(Q_2) * P(F | T)
+                                    double logRefLength = std::log(refLength);
+
+                                    double logAlignCompatProb = (salmonOpts.useReadCompat) ?
+                                        (salmon::utils::logAlignFormatProb(aln->libFormat(), expectedLibraryFormat, salmonOpts.incompatPrior)) :
+                                        LOG_1;
+
+                                    // Adjustment to the likelihood due to the
+                                    // error model
+                                    double errLike = salmon::math::LOG_1;
+                                    if (burnedIn and salmonOpts.useErrorModel) {
+                                        errLike = alnMod.logLikelihood(*aln, transcript);
+                                    }
+
+                                    // Allow for a non-uniform fragment start position distribution
+                                    double startPosProb = -logRefLength;
+                                    auto hitPos = aln->left();
+                                    if (useFSPD and burnedIn and hitPos < refLength) {
+                                        auto& fragStartDist =
+                                            fragStartDists[transcript.lengthClassIndex()];
+                                        startPosProb = fragStartDist(hitPos, refLength, logRefLength);
+                                    }
+
+                                    double auxProb = startPosProb + logFragProb +
+                                        aln->logQualProb() +
+                                        errLike + logAlignCompatProb;
+
+                                    double transcriptLogCount = transcript.mass(false);
+
+                                    if ( transcriptLogCount != LOG_0 and
+                                         auxProb != LOG_0 ) {
+
+                                        aln->logProb = transcriptLogCount + auxProb;
+                                        sumOfAlignProbs = logAdd(sumOfAlignProbs, aln->logProb);
+
+                                    } else {
+                                        aln->logProb = LOG_0;
+                                    }
+                                }
+                                // normalize the hits
+                                if (sumOfAlignProbs == LOG_0) {
+                                    auto aln = alnGroup->alignments().front();
+                                    log->warn("0 probability fragment [{}] "
+                                              "encountered\n", aln->getName());
+                                    continue;
+                                }
+
+
+                                if (transcriptUnique) {
+                                    // avoid r-value ref until we figure out what's
+                                    // up with TBB 4.3
+                                    auto* alnPtr = alnGroup->alignments().front()->clone();
+                                    outputQueue.push(alnPtr);
+                                } else { // read maps to multiple transcripts
+                                    double r = uni(eng);
+                                    double currentMass{0.0};
+                                    double massInc{0.0};
+                                    bool choseAlignment{false};
+                                    for (auto& aln : alnGroup->alignments()) {
+                                        if (aln->logProb == LOG_0) { continue; }
+                                        aln->logProb -= sumOfAlignProbs;
+
+                                        massInc = std::exp(aln->logProb);
+                                        if (currentMass <= r and currentMass + massInc > r) {
+                                            // Write out this read
+                                            // avoid r-value ref until we figure out what's
+                                            // up with TBB 4.3
+                                           auto* alnPtr = aln->clone();
+                                            outputQueue.push(alnPtr);
+                                            currentMass += massInc;
+                                            choseAlignment = true;
+                                            break;
+                                        }
+                                        currentMass += massInc;
+                                    } // end alignment group
+                                    if (BOOST_UNLIKELY(!choseAlignment)) {
+                                        log->warn("[Sampler.hpp]: Failed to sample an alignment for this read; "
+                                                  "this shouldn't happen\n"
+                                                  "currentMass = {}, r = {}\n", currentMass, r);
+                                    }
+                                } // non-unique read
+                                ++batchReads;
+                            } // end read group
+                        }// end timer
+
+                        miniBatch->release(fragmentQueue, alignmentGroupQueue);
+                        delete miniBatch;
+                        --activeBatches;
+                        processedReads += batchReads;
+                    }
+                    miniBatch = nullptr;
+                } // nothing left to process
+            }
+
+        /**
+         *  Sample the alignments in the provided library given in current
+         *  estimates of transcript abundance.
+         */
+        template <typename FragT>
+        bool sampleLibrary(
+                    AlignmentLibrary<FragT>& alnLib,
+                    const SalmonOpts& salmonOpts,
+                    bool burnedIn,
+                    bfs::path& sampleFilePath,
+                    bool sampleUnaligned) {
+
+                fmt::MemoryWriter msgStr;
+                auto log = spdlog::get("jointLog");
+
+                msgStr << "Sampling alignments; outputting results to "
+                       << sampleFilePath.string() << "\n";
+
+                log->info() << msgStr.str();
+
+                auto& refs = alnLib.transcripts();
+                size_t numTranscripts = refs.size();
+                size_t miniBatchSize{1000};
+                size_t numObservedFragments{0};
+
+                MiniBatchQueue<AlignmentGroup<FragT*>> workQueue;
+                double logForgettingMass{std::log(1.0)};
+                double forgettingFactor{0.60};
+                size_t batchNum{0};
+
+                /**
+                * Output queue
+                */
+                volatile bool consumedAllInput{false};
+                size_t defaultCapacity = 2000000;
+                OutputQueue<FragT> outQueue;
+                outQueue.set_capacity(defaultCapacity);
+
+                std::unique_ptr<OutputUnmappedFilter<FragT>> outFilt = nullptr;
+
+                if (sampleUnaligned) {
+                    outFilt.reset(new OutputUnmappedFilter<FragT>(&outQueue));
+                }
+
+                // Reset our reader to the beginning
+                if (!alnLib.reset(false, outFilt.get())) {
+                    fmt::print(stderr,
+                            "\n\n======== WARNING ========\n"
+                            "A provided alignment file "
+                            "is not a regular file and therefore can't be read from "
+                            "more than once.\n\n"
+                            "Therefore, we cannot provide sample output alignments "
+                            "from this file.  No sampled BAM file will be generated. "
+                            "Please consider re-running Salmon with these alignments "
+                            "as a regular file!\n"
+                            "==========================\n\n");
+
+                    return false;
+                }
+
+                volatile bool doneParsing{false};
+                std::condition_variable workAvailable;
+                std::mutex cvmutex;
+                std::vector<std::thread> workers;
+                std::atomic<size_t> activeBatches{0};
+                std::atomic<size_t> processedReads{0};
+
+                size_t numProc{0};
+                for (uint32_t i = 0; i < salmonOpts.numQuantThreads; ++i) {
+                    workers.emplace_back(sampleMiniBatch<FragT>,
+                            std::ref(alnLib),
+                            std::ref(workQueue),
+                            std::ref(workAvailable), std::ref(cvmutex),
+                            std::ref(doneParsing), std::ref(activeBatches),
+                            std::ref(salmonOpts),
+                            std::ref(burnedIn),
+                            std::ref(processedReads),
+                            std::ref(outQueue));
+                }
+
+                std::thread outputThread(
+                        [&consumedAllInput, &alnLib, &outQueue, &log, sampleFilePath] () -> void {
+
+                            scram_fd* bf = scram_open(sampleFilePath.c_str(), "wb");
+                            scram_set_option(bf, CRAM_OPT_NTHREADS, 3);
+                            scram_set_header(bf, alnLib.header());
+                            scram_write_header(bf);
+                            if (bf == nullptr) {
+                                fmt::MemoryWriter errstr;
+                                errstr << ioutils::SET_RED << "ERROR: "
+                                       << ioutils::RESET_COLOR
+                                       << "Couldn't open output bam file "
+                                       << sampleFilePath.string() << ". Exiting\n";
+                                log->warn() << errstr.str();
+                                std::exit(-1);
+                            }
+
+                            FragT* aln{nullptr};
+                            while (!outQueue.empty() or !consumedAllInput) {
+                                while (outQueue.try_pop(aln)) {
+                                    if (aln != nullptr) {
+                                        int ret = aln->writeToFile(bf);
+                                        if (ret != 0) {
+                                            std::cerr << "ret = " << ret << "\n";
+                                            fmt::MemoryWriter errstr;
+                                            errstr << ioutils::SET_RED << "ERROR:"
+                                                << ioutils::RESET_COLOR << "Could not write "
+                                                << "a sampled alignment to the output BAM "
+                                                << "file. Please check that the file can "
+                                                << "be created properly and that the disk "
+                                                << "is not full.  Exiting.\n";
+                                            log->warn() << errstr.str();
+                                            std::exit(-1);
+                                        }
+                                        // Eventually, as we do in BAMQueue, we should
+                                        // have queue of bam1_t structures that can be
+                                        // re-used rather than continually calling
+                                        // new and delete.
+                                        delete aln;
+                                        aln = nullptr;
+                                    }
+                                }
+                            }
+
+                            scram_close(bf); // will delete the header itself
+                        });
+
+
+
+                BAMQueue<FragT>& bq = alnLib.getAlignmentGroupQueue();
+                std::vector<AlignmentGroup<FragT*>*>* alignments = new std::vector<AlignmentGroup<FragT*>*>;
+                alignments->reserve(miniBatchSize);
+                AlignmentGroup<FragT*>* ag;
+
+                bool alignmentGroupsRemain = bq.getAlignmentGroup(ag);
+                while (alignmentGroupsRemain or alignments->size() > 0) {
+                    if (alignmentGroupsRemain) { alignments->push_back(ag); }
+                    // If this minibatch has reached the size limit, or we have nothing
+                    // left to fill it up with
+                    if (alignments->size() >= miniBatchSize or !alignmentGroupsRemain) {
+                        // Don't need to update the batch number or log forgetting mass in this phase
+                        MiniBatchInfo<AlignmentGroup<FragT*>>* mbi =
+                            new MiniBatchInfo<AlignmentGroup<FragT*>>(batchNum, alignments, logForgettingMass);
+                        workQueue.push(mbi);
+                        {
+                            std::unique_lock<std::mutex> l(cvmutex);
+                            workAvailable.notify_one();
+                        }
+                        alignments = new std::vector<AlignmentGroup<FragT*>*>;
+                        alignments->reserve(miniBatchSize);
+                    }
+                    if (numProc % 1000000 == 0) {
+                        const char RESET_COLOR[] = "\x1b[0m";
+                        char green[] = "\x1b[30m";
+                        green[3] = '0' + static_cast<char>(fmt::GREEN);
+                        char red[] = "\x1b[30m";
+                        red[3] = '0' + static_cast<char>(fmt::RED);
+                        fmt::print(stderr, "\r\r{}processed{} {} {}reads{}", green, red, numProc, green, RESET_COLOR);
+                    }
+                    ++numProc;
+                    alignmentGroupsRemain = bq.getAlignmentGroup(ag);
+
+                }
+                std::cerr << "\n";
+
+                // Free the alignments and the vector holding them
+                for (auto& aln : *alignments) {
+                    aln->alignments().clear();
+                    delete aln; aln = nullptr;
+                }
+                delete alignments;
+
+                doneParsing = true;
+
+                /**
+                 * This could be a problem for small sets of alignments --- make sure the
+                 * work queue is empty!!
+                 * --- Thanks for finding a dataset that exposes this bug, Richard (Smith-Unna)!
+                 */
+                size_t t = 0;
+                while (!workQueue.empty()) {
+                    std::unique_lock<std::mutex> l(cvmutex);
+                    workAvailable.notify_one();
+                }
+
+                size_t tnum{0};
+                for (auto& t : workers) {
+                    fmt::print(stderr, "killing thread {} . . . ", tnum++);
+                    {
+                        std::unique_lock<std::mutex> l(cvmutex);
+                        workAvailable.notify_all();
+                    }
+                    t.join();
+                    fmt::print(stderr, "done\r\r");
+                }
+                fmt::print(stderr, "\n");
+                consumedAllInput = true;
+
+                numObservedFragments += alnLib.numMappedReads();
+                fmt::print(stderr, "# observed = {} mapped fragments.\033[F\033[F\033[F\033[F",
+                        numObservedFragments);
+
+                fmt::print(stderr, "Waiting on output thread\n");
+                outputThread.join();
+                fmt::print(stderr, "done\n");
+
+                fmt::print(stderr, "\n\n\n\n");
+                return true;
+            }
+
+
+
+    } // namespace sampler
+} // namespace salmon
+
+#endif // __SAMPLER__HPP__
diff --git a/include/SequenceBiasModel.hpp b/include/SequenceBiasModel.hpp
new file mode 100644
index 0000000..73ecc64
--- /dev/null
+++ b/include/SequenceBiasModel.hpp
@@ -0,0 +1,97 @@
+#ifndef SEQUENCE_BIAS_MODEL
+#define SEQUENCE_BIAS_MODEL
+
+#include <mutex>
+#include <atomic>
+#include <memory>
+
+// logger includes
+#include "spdlog/spdlog.h"
+
+#include "tbb/concurrent_vector.h"
+#include "AtomicMatrix.hpp"
+
+class Transcript;
+class LibraryFormat;
+
+class SequenceBiasModel {
+public:
+    SequenceBiasModel(double alpha,
+                      uint32_t windowSize=15 // must be odd
+            );
+
+    bool burnedIn();
+    void burnedIn(bool burnedIn);
+
+    /**
+    *  For unpaired reads, update the bias model to account
+    *  for this read
+    */
+    bool update(Transcript& ref, int32_t pos,
+                LibraryFormat libFormat,
+                double mass, double prob);
+
+    /**
+    *  For unpaired reads, update the bias model to account
+    *  for this read
+    */
+    bool update(Transcript& ref, int32_t pos, int32_t pos2,
+                LibraryFormat libFormat,
+                double mass, double prob);
+
+    /**
+     *
+     *
+     */
+    double biasFactor(Transcript& ref, int32_t pos, LibraryFormat libFmt);
+
+    /**
+     *
+     *
+     */
+    double biasFactor(Transcript& ref, int32_t pos1, int32_t pos2,
+                      LibraryFormat libFormat);
+
+    void setLogger(std::shared_ptr<spdlog::logger> logger);
+    bool hasLogger();
+    std::string toString();
+    //void normalize();
+
+private:
+
+    double seqProb(Transcript& ref, int32_t pos,
+                   bool isFwd, AtomicMatrix<double>& profile);
+    /**
+     * These functions, which work directly on bam_seq_t* types, drive the
+     * update() and logLikelihood() methods above.
+     */
+    bool update(Transcript& ref, int32_t pos,
+                bool fwd,
+                double mass, double prob, AtomicMatrix<double>& sequenceModel);
+
+    double biasFactor(Transcript& ref,
+                      int32_t pos,
+                      LibraryFormat libFormat,
+                      AtomicMatrix<double>& foregroundModel,
+                      AtomicMatrix<double>& backgroundModel);
+
+    // A, C, G, T
+    constexpr static uint32_t numBases() { return 4; }
+
+    // NOTE: Do these need to be concurrent_vectors as before?
+    // Store the mismatch probability tables for the left and right reads
+    AtomicMatrix<double> biasLeftForeground_;
+    AtomicMatrix<double> biasLeftBackground_;
+    AtomicMatrix<double> biasRightForeground_;
+    AtomicMatrix<double> biasRightBackground_;
+
+    uint32_t windowSize_;
+    bool isEnabled_;
+    std::atomic<bool> burnedIn_;
+    std::shared_ptr<spdlog::logger> logger_;
+    // Maintain a mutex in case we want to talk to the
+    // console / log.
+    std::mutex outputMutex_;
+};
+
+#endif // SEQUENCE_BIAS_MODEL
diff --git a/include/SpinLock.hpp b/include/SpinLock.hpp
new file mode 100644
index 0000000..e2cc9c8
--- /dev/null
+++ b/include/SpinLock.hpp
@@ -0,0 +1,38 @@
+#ifndef __SPIN_LOCK_HPP__
+#define __SPIN_LOCK_HPP__
+
+#include <atomic>
+
+/**
+ * since std::mutex is *VERY SLOW* on OSX, we use
+ * this there instead.
+ * Taken from (http://stackoverflow.com/questions/22899053/why-is-stdmutex-so-slow-on-osx)
+ */
+class spin_lock {
+    std::atomic<bool> _lock;
+public:
+  spin_lock(const spin_lock&) = delete;
+  spin_lock& operator=(const spin_lock&) = delete;
+
+  spin_lock() : _lock(false) {}
+
+  class scoped_lock {
+    spin_lock &_lock;
+
+  public:
+    scoped_lock(const scoped_lock&) = delete;
+    scoped_lock& operator=(const scoped_lock&) = delete;
+
+    scoped_lock(spin_lock &lock) : _lock(lock) {
+      bool expect = false;
+      while (!_lock._lock.compare_exchange_weak(expect, true)) {
+        expect = false;
+      }
+    }
+    ~scoped_lock() {
+      _lock._lock = false;
+    }
+  };
+};
+
+#endif // __SPIN_LOCK_HPP__
diff --git a/include/StadenUtils.hpp b/include/StadenUtils.hpp
new file mode 100644
index 0000000..83f640c
--- /dev/null
+++ b/include/StadenUtils.hpp
@@ -0,0 +1,20 @@
+#ifndef STADEN_UTILS
+#define STADEN_UTILS
+
+extern "C" {
+#include "io_lib/scram.h"
+#include "io_lib/os.h"
+#undef max
+#undef min
+}
+
+#include <cstdlib>
+
+namespace staden {
+    namespace utils {
+        bam_seq_t* bam_init();
+        void bam_destroy(bam_seq_t* b);
+    }
+}
+
+#endif // STADEN_UTILS
diff --git a/include/SufficientStatisticsQueue.hpp b/include/SufficientStatisticsQueue.hpp
new file mode 100644
index 0000000..7405df4
--- /dev/null
+++ b/include/SufficientStatisticsQueue.hpp
@@ -0,0 +1,103 @@
+#ifndef __SUFFICIENT_STATISTICS_QUEUE_HPP__
+#define __SUFFICIENT_STATISTICS_QUEUE_HPP__
+
+#include <queue>
+#include <cmath>
+#include <mutex>
+#include <tuple>
+
+#include "btree_map.h"
+
+#include "SalmonMath.hpp"
+
+class SufficientStatisticsQueue {
+
+    using TranscriptID = uint32_t;
+    public:
+        SufficientStatisticsQueue(uint32_t qlen=10) : ct_(0), qlen_(qlen) {
+            if (qlen_ > 0) {
+                logL_ = std::log(static_cast<double>(qlen_));
+            } else {
+                std::cerr << "ERROR: cannot instantiate sufficient statistic queue of length 0";
+            }
+        }
+
+        bool approxEq(double a, double b) {
+            return std::abs(a - b) < 1e-3;
+        }
+
+        btree::btree_map<TranscriptID, std::tuple<double,size_t>> getSmoothedStats(
+                btree::btree_map<TranscriptID, TranscriptAlignments>& Si ) {
+            using salmon::math::logAdd;
+            using salmon::math::logSub;
+
+            std::lock_guard<std::mutex> lg(ssmut_);
+
+            ++ct_;
+            if (ct_ % 200 == 0) {
+                std::cerr << "\n\n sstatQueue_.size() = " << sstatQueue_.size() << ", suf stat map size = " << smoothedSufficientStats_.size() << "\n\n\n";
+            }
+
+            sstatQueue_.emplace();
+            auto& currentSStat = sstatQueue_.back();
+
+            // S_i^L = S_{i-1}^{L} + (S_{i} / L)
+            for (auto kv = Si.begin(); kv != Si.end(); ++kv) {
+                auto transcriptID = kv->first;
+                auto& hits = kv->second;
+                double contrib = hits.totalProb - logL_;
+                currentSStat[transcriptID] = contrib;
+                auto it = smoothedSufficientStats_.find(transcriptID);
+                if (it == smoothedSufficientStats_.end()) {
+                    smoothedSufficientStats_[transcriptID] = std::make_tuple(contrib,1);
+                } else {
+                    std::get<0>(it->second) = logAdd(std::get<0>(it->second), contrib);
+                    std::get<1>(it->second) += 1;
+                }
+
+            }
+
+            if (sstatQueue_.size() > qlen_) {
+                // S_i^L -= (S_{i-L} / L)
+                auto& Sil = sstatQueue_.front();
+                for (auto kv = Sil.begin(); kv != Sil.end(); ++kv) {
+                    auto transcriptID = kv->first;
+                    auto& lprob = kv->second;
+                    auto it = smoothedSufficientStats_.find(transcriptID);
+                    if (it == smoothedSufficientStats_.end()) {
+                        //std::cerr << "\n\nWHAT\n\n\n";
+                    } else if (std::get<0>(it->second) <= lprob or
+                               approxEq(lprob, std::get<0>(it->second)) ) {//std::get<1>(it->second) == 1) {
+                        if (!approxEq(std::get<0>(it->second), lprob) and std::get<1>(it->second) > 1) {
+                            std::cerr << "hrmmmm .... remaining mass = " << std::get<0>(it->second) << " < lprob = " << lprob << "\n";
+                        }
+                        smoothedSufficientStats_.erase(it);
+                    } else {
+                        double before = std::get<0>(it->second);
+                        double update = logSub(std::get<0>(it->second), lprob);
+                        std::get<0>(it->second) = update;
+                        std::get<1>(it->second) -= 1;
+                        if (std::get<0>(it->second) == salmon::math::LOG_0) {
+                            smoothedSufficientStats_.erase(it);
+                        }
+                    }
+                }
+                sstatQueue_.pop();
+            }
+
+            // make sure this is a copy!
+            return smoothedSufficientStats_;
+            // lock is freed here
+        }
+
+    private:
+        size_t ct_;
+        std::mutex ssmut_;
+        std::queue<btree::btree_map<TranscriptID, double>> sstatQueue_;
+        btree::btree_map<TranscriptID, std::tuple<double, size_t>> smoothedSufficientStats_;
+        uint32_t qlen_;
+        double logL_;
+};
+
+#endif // __SUFFICIENT_STATISTICS_QUEUE_HPP__
+
diff --git a/include/Transcript.hpp b/include/Transcript.hpp
new file mode 100644
index 0000000..0ee8207
--- /dev/null
+++ b/include/Transcript.hpp
@@ -0,0 +1,235 @@
+#ifndef TRANSCRIPT
+#define TRANSCRIPT
+
+#include <atomic>
+#include <cmath>
+#include <limits>
+#include "SalmonStringUtils.hpp"
+#include "SalmonUtils.hpp"
+#include "SalmonMath.hpp"
+#include "SequenceBiasModel.hpp"
+#include "FragmentLengthDistribution.hpp"
+#include "tbb/atomic.h"
+
+class Transcript {
+public:
+    Transcript(size_t idIn, const char* name, uint32_t len, double alpha = 0.05) :
+        RefName(name), RefLength(len), id(idIn), Sequence(nullptr),
+        logPerBasePrior_(std::log(alpha)),
+        priorMass_(std::log(alpha*len)),
+        mass_(salmon::math::LOG_0), sharedCount_(0.0),
+        avgMassBias_(salmon::math::LOG_0) {
+            uniqueCount_.store(0);
+            lastUpdate_.store(0);
+            lastTimestepUpdated_.store(0);
+            cachedEffectiveLength_.store(std::log(static_cast<double>(RefLength)));
+        }
+
+    Transcript(Transcript&& other) {
+        id = other.id;
+        //std::swap(RefName, other.RefName);
+        RefName = std::move(other.RefName);
+        RefLength = other.RefLength;
+        Sequence = other.Sequence;
+        uniqueCount_.store(other.uniqueCount_);
+        totalCount_.store(other.totalCount_.load());
+        lastTimestepUpdated_.store(other.lastTimestepUpdated_.load());
+        sharedCount_.store(other.sharedCount_.load());
+        mass_.store(other.mass_.load());
+        lastUpdate_.store(other.lastUpdate_.load());
+        cachedEffectiveLength_.store(other.cachedEffectiveLength_.load());
+        lengthClassIndex_ = other.lengthClassIndex_;
+        logPerBasePrior_ = other.logPerBasePrior_;
+        priorMass_ = other.priorMass_;
+        avgMassBias_.store(other.avgMassBias_.load());
+    }
+
+    Transcript& operator=(Transcript&& other) {
+        id = other.id;
+        //std::swap(RefName, other.RefName);
+        RefName = std::move(other.RefName);
+        RefLength = other.RefLength;
+        Sequence = other.Sequence;
+        uniqueCount_.store(other.uniqueCount_);
+        totalCount_.store(other.totalCount_.load());
+        lastTimestepUpdated_.store(other.lastTimestepUpdated_.load());
+        sharedCount_.store(other.sharedCount_.load());
+        mass_.store(other.mass_.load());
+        lastUpdate_.store(other.lastUpdate_.load());
+        cachedEffectiveLength_.store(other.cachedEffectiveLength_.load());
+        lengthClassIndex_ = other.lengthClassIndex_;
+        logPerBasePrior_ = other.logPerBasePrior_;
+        priorMass_ = other.priorMass_;
+        avgMassBias_.store(other.avgMassBias_.load());
+        return *this;
+    }
+
+
+    inline double sharedCount() { return sharedCount_.load(); }
+    inline size_t uniqueCount() { return uniqueCount_.load(); }
+    inline size_t totalCount() { return totalCount_.load(); }
+
+    inline void addUniqueCount(size_t newCount) { uniqueCount_ += newCount; }
+    inline void addTotalCount(size_t newCount) { totalCount_ += newCount; }
+
+    inline double uniqueUpdateFraction() {
+        double ambigCount = static_cast<double>(totalCount_ - uniqueCount_);
+        return uniqueCount_ / ambigCount;
+    }
+
+    inline char charBaseAt(size_t idx,
+                              salmon::stringtools::strand dir = salmon::stringtools::strand::forward) {
+        return salmon::stringtools::samCodeToChar[baseAt(idx, dir)];
+    }
+
+    inline uint8_t baseAt(size_t idx,
+                          salmon::stringtools::strand dir = salmon::stringtools::strand::forward) {
+        using salmon::stringtools::strand;
+        using salmon::stringtools::encodedRevComp;
+        size_t byte = idx >> 1;
+        size_t nibble = idx & 0x1;
+
+        switch(dir) {
+        case strand::forward:
+            if (nibble) {
+                return Sequence[byte] & 0x0F;
+            } else {
+                return ((Sequence[byte] & 0xF0) >> 4) & 0x0F;
+            }
+            break;
+        case strand::reverse:
+            if (nibble) {
+                return encodedRevComp[Sequence[byte] & 0x0F];
+            } else {
+                return encodedRevComp[((Sequence[byte] & 0xF0) >> 4) & 0x0F];
+            }
+            break;
+        }
+
+        return std::numeric_limits<uint8_t>::max();
+    }
+
+    inline void addSharedCount(double sc) {
+	salmon::utils::incLoop(sharedCount_, sc);
+    }
+
+    inline void setLastTimestepUpdated(uint64_t currentTimestep) {
+        uint64_t oldTimestep = lastTimestepUpdated_;
+        if (currentTimestep > oldTimestep) {
+            lastTimestepUpdated_ = currentTimestep;
+        }
+    }
+
+    inline void addBias(double bias) {
+	salmon::utils::incLoopLog(avgMassBias_, bias);
+    }
+
+    inline void addMass(double mass) {
+	salmon::utils::incLoopLog(mass_, mass);
+    }
+
+    inline void setMass(double mass) {
+        mass_.store(mass);
+    }
+
+    inline double mass(bool withPrior=true) {
+        return (withPrior) ? salmon::math::logAdd(priorMass_, mass_.load()) : mass_.load();
+    }
+
+    inline double bias() {
+        return (totalCount_.load() > 0) ?
+                    avgMassBias_ - std::log(totalCount_.load()) :
+                    salmon::math::LOG_1;
+    }
+
+    /*
+    double getAverageSequenceBias(SequenceBiasModel& m) {
+        double bias = salmon::math::LOG_0;
+        for (int32_t i = 0; i < RefLength; ++i) {
+            bias = salmon::math::logAdd(bias, m.biasFactor(*this, i));
+        }
+        return bias - std::log(RefLength);
+    }
+    */
+
+    /**
+      *  NOTE: Adopted from "est_effective_length" at (https://github.com/adarob/eXpress/blob/master/src/targets.cpp)
+      *  originally written by Adam Roberts.
+      *
+      *
+      */
+    double updateEffectiveLength(const FragmentLengthDistribution& fragLengthDist) {
+
+        double effectiveLength = salmon::math::LOG_0;
+        double refLen = static_cast<double>(RefLength);
+        double logLength = std::log(refLen);
+
+        if (logLength < fragLengthDist.mean()) {
+            effectiveLength = logLength;
+        } else {
+            uint32_t mval = fragLengthDist.maxVal();
+            for (size_t l = fragLengthDist.minVal(); l <= std::min(RefLength, mval); ++l) {
+                effectiveLength = salmon::math::logAdd(
+                        effectiveLength,
+                        fragLengthDist.pmf(l) + std::log(refLen - l + 1));
+            }
+        }
+
+        return effectiveLength;
+    }
+
+    double getCachedEffectiveLength() {
+        return cachedEffectiveLength_.load();
+    }
+
+    double getEffectiveLength(const FragmentLengthDistribution& fragLengthDist,
+                              size_t currObs,
+                              size_t burnInObs) {
+        if (lastUpdate_ == 0 or
+            (currObs - lastUpdate_ >= 250000) or
+            (lastUpdate_ < burnInObs and currObs > burnInObs)) {
+            // compute new number
+            double cel = updateEffectiveLength(fragLengthDist);
+            cachedEffectiveLength_.store(cel);
+            lastUpdate_.store(currObs);
+            //priorMass_ = cel + logPerBasePrior_;
+            return cachedEffectiveLength_.load();
+        } else {
+            // return cached number
+            return cachedEffectiveLength_.load();
+        }
+    }
+
+    double perBasePrior() { return std::exp(logPerBasePrior_); }
+    inline size_t lastTimestepUpdated() { return lastTimestepUpdated_.load(); }
+
+    void lengthClassIndex(uint32_t ind) { lengthClassIndex_ = ind; }
+    uint32_t lengthClassIndex() { return lengthClassIndex_; }
+
+    std::string RefName;
+    uint32_t RefLength;
+    uint32_t id;
+
+    double uniqueCounts{0.0};
+    double totalCounts{0.0};
+    double projectedCounts{0.0};
+    double sharedCounts{0.0};
+
+    uint8_t* Sequence;
+
+private:
+    std::atomic<size_t> uniqueCount_;
+    std::atomic<size_t> totalCount_;
+    // The most recent timestep at which this transcript's mass was updated.
+    std::atomic<size_t> lastTimestepUpdated_;
+    double priorMass_;
+    tbb::atomic<double> mass_;
+    tbb::atomic<double> sharedCount_;
+    tbb::atomic<double> cachedEffectiveLength_;
+    tbb::atomic<size_t> lastUpdate_;
+    tbb::atomic<double> avgMassBias_;
+    uint32_t lengthClassIndex_;
+    double logPerBasePrior_;
+};
+
+#endif //TRANSCRIPT
diff --git a/include/TranscriptAlignments.hpp b/include/TranscriptAlignments.hpp
new file mode 100644
index 0000000..93cd682
--- /dev/null
+++ b/include/TranscriptAlignments.hpp
@@ -0,0 +1,20 @@
+#ifndef __TRANSCRIPT_ALIGNMENTS_HPP__
+#define __TRANSCRIPT_ALIGNMENTS_HPP__
+
+#include <vector>
+#include "SalmonMath.hpp"
+
+class SMEMAlignment;
+
+class TranscriptAlignments {
+    public:
+        TranscriptAlignments() : alignments(std::vector<SMEMAlignment*>()),
+                                 totalProb(salmon::math::LOG_0) {}
+
+        std::vector<SMEMAlignment*> alignments;
+        double totalProb;
+        double logMassPrior;
+        double logMassPosterior;
+};
+
+#endif //__TRANSCRIPT_ALIGNMENTS_HPP__
diff --git a/include/TranscriptCluster.hpp b/include/TranscriptCluster.hpp
new file mode 100644
index 0000000..27c50c7
--- /dev/null
+++ b/include/TranscriptCluster.hpp
@@ -0,0 +1,114 @@
+#ifndef __TRANSCRIPT_CLUSTER_HPP__
+#define __TRANSCRIPT_CLUSTER_HPP__
+
+#include <iostream>
+#include <atomic>
+#include <vector>
+#include <list>
+
+#include <boost/pending/disjoint_sets.hpp>
+#include <boost/dynamic_bitset.hpp>
+
+#include "SalmonMath.hpp"
+
+class ClusterForest;
+
+class TranscriptCluster {
+    friend class ClusterForest;
+public:
+    TranscriptCluster() : members_(std::list<size_t>()), count_(0), logMass_(salmon::math::LOG_0), active_(true) {}
+    TranscriptCluster(size_t initialMember) : members_(std::list<size_t>(1,initialMember)), count_(0),
+                                              logMass_(salmon::math::LOG_0), active_(true) {
+    }
+
+    //void incrementCount(size_t num) { count_ += num; }
+    void incrementCount(double num) { count_ += num; }
+    void addMass(double logNewMass) { logMass_ = salmon::math::logAdd(logMass_, logNewMass); }
+    void merge(TranscriptCluster& other) {
+        members_.splice(members_.begin(), other.members_);
+        logMass_ = salmon::math::logAdd(logMass_, other.logMass_);
+        count_ += other.count_;
+    }
+
+    std::list<size_t>& members() { return members_; }
+    //size_t numHits() { return count_.load(); }
+    double numHits() { return count_; }
+    bool isActive() { return active_; }
+    void deactivate() { active_ = false; }
+    double logMass() { return logMass_; }
+
+    // Adapted from https://github.com/adarob/eXpress/blob/master/src/targets.cpp
+    void projectToPolytope(std::vector<Transcript>& allTranscripts) {
+        using salmon::math::approxEqual;
+
+        // The transcript belonging to this cluster
+        double clusterCounts{static_cast<double>(count_)};
+        std::vector<Transcript*> transcripts;
+        for (auto tid : members_) {
+            transcripts.push_back(&allTranscripts[tid]);
+        }
+        // The cluster size
+        size_t clusterSize = transcripts.size();
+        size_t round{0};
+        boost::dynamic_bitset<> polytopeBound(clusterSize);
+        while(true) {
+            double unboundCounts{0.0};
+            double boundCounts{0.0};
+            for (size_t i = 0; i < clusterSize; ++i) {
+                Transcript* transcript = transcripts[i];
+                if (transcript->projectedCounts > transcript->totalCounts) {
+                    transcript->projectedCounts = transcript->totalCounts;
+                    polytopeBound[i] = true;
+                } else if (transcript->projectedCounts < transcript->uniqueCounts) {
+                    transcript->projectedCounts = transcript->uniqueCounts;
+                    polytopeBound[i] = true;
+                }
+
+                if (polytopeBound[i]) {
+                    boundCounts += transcript->projectedCounts;
+                } else {
+                    unboundCounts += transcript->projectedCounts;
+                }
+            }
+
+
+            if (approxEqual(unboundCounts + boundCounts, clusterCounts)) {
+                return;
+            }
+
+            if (unboundCounts == 0) {
+                polytopeBound = boost::dynamic_bitset<>(clusterSize);
+                unboundCounts = boundCounts;
+                boundCounts = 0;
+            }
+
+            double normalizer = (clusterCounts - boundCounts) / unboundCounts;
+            for (size_t i = 0; i < clusterSize; ++i) {
+                if (!polytopeBound[i]) {
+                    Transcript* transcript = transcripts[i];
+                    transcript->projectedCounts *= normalizer;
+                }
+            }
+            ++round;
+            if (round % 100 == 0) {
+                std::cerr << "\r\rproject to polytope: " << round;
+            }
+            if (round > 50000) {
+                return;
+            }
+
+        } // end while
+
+    }
+
+private:
+    std::list<size_t> members_;
+    //std::atomic<size_t> count_;
+    double count_;
+    double logMass_;
+    bool active_;
+};
+
+#endif // __TRANSCRIPT_CLUSTER_HPP__
+
+
diff --git a/include/TranscriptGeneMap.hpp b/include/TranscriptGeneMap.hpp
new file mode 100644
index 0000000..68499a0
--- /dev/null
+++ b/include/TranscriptGeneMap.hpp
@@ -0,0 +1,148 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Sailfish.
+
+    Sailfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Sailfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#ifndef TRANSCRIPT_GENE_MAP_HPP
+#define TRANSCRIPT_GENE_MAP_HPP
+
+// Allows for the serialization of this class
+#include <cereal/archives/binary.hpp>
+#include <cereal/types/vector.hpp>
+#include <cereal/types/string.hpp>
+
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+
+class TranscriptGeneMap {
+using Index = size_t;
+using Size = size_t;
+using NameVector = std::vector<std::string>;
+using IndexVector = std::vector<size_t>;
+using IndexVectorList = std::vector<std::vector<size_t>>;
+
+private:
+    NameVector _transcriptNames;
+    NameVector _geneNames;
+    IndexVector _transcriptsToGenes;
+    IndexVectorList _genesToTranscripts;
+    bool _haveReverseMap;
+
+    void _computeReverseMap() {
+
+        _genesToTranscripts.resize( _geneNames.size(), {});
+
+        Index transcriptID = 0;
+        size_t maxNumTrans = 0;
+        Index maxGene;
+        for ( transcriptID = 0; transcriptID < _transcriptsToGenes.size(); ++transcriptID ) {
+            _genesToTranscripts[ _transcriptsToGenes[transcriptID] ].push_back( transcriptID );
+            if ( maxNumTrans < _genesToTranscripts[ _transcriptsToGenes[transcriptID] ].size() ) {
+                maxNumTrans = _genesToTranscripts[ _transcriptsToGenes[transcriptID] ].size();
+                maxGene = _transcriptsToGenes[transcriptID];
+            }
+        }
+        std::cerr << "max # of transcripts in a gene was " << maxNumTrans << " in gene " << _geneNames[maxGene] << "\n";
+    }
+
+    friend class cereal::access;
+
+    template<class Archive>
+    void serialize(Archive & ar, const unsigned int version)
+    {
+        ar(_transcriptNames, _geneNames, _transcriptsToGenes,
+           _genesToTranscripts, _haveReverseMap);
+    }
+
+public:
+    TranscriptGeneMap() :
+        _transcriptNames(NameVector()), _geneNames(NameVector()),
+        _transcriptsToGenes(IndexVector()), _haveReverseMap(false) {}
+
+
+    TranscriptGeneMap( const NameVector &transcriptNames,
+                       const NameVector &geneNames,
+                       const IndexVector &transcriptsToGenes ) :
+        _transcriptNames(transcriptNames), _geneNames(geneNames),
+        _transcriptsToGenes(transcriptsToGenes), _haveReverseMap(false) {}
+
+
+    TranscriptGeneMap(const TranscriptGeneMap& other) = default;
+    TranscriptGeneMap& operator=(const TranscriptGeneMap& other) = default;
+
+
+    Index INVALID { std::numeric_limits<Index>::max() };
+
+    Index findTranscriptID( const std::string &tname ) {
+        using std::distance;
+        using std::lower_bound;
+        auto it = lower_bound( _transcriptNames.begin(), _transcriptNames.end(), tname );
+        return ( it == _transcriptNames.end() ) ? INVALID : ( distance(_transcriptNames.begin(), it) );
+    }
+
+    Size numTranscripts() {
+        return _transcriptNames.size();
+    }
+    Size numGenes() {
+        return _geneNames.size();
+    }
+
+    bool needReverse() {
+        if ( _haveReverseMap ) {
+            return false;
+        } else {
+            _computeReverseMap();
+            return true;
+        }
+    }
+
+    const IndexVector &transcriptsForGene( Index geneID ) {
+        return _genesToTranscripts[geneID];
+    }
+
+    inline std::string nameFromGeneID( Index geneID ) {
+        return _geneNames[geneID];
+    }
+    inline Index gene( Index transcriptID ) {
+        return _transcriptsToGenes[transcriptID];
+    }
+    inline std::string geneName( Index transcriptID ) {
+        return _geneNames[_transcriptsToGenes[transcriptID]];
+    }
+    inline std::string geneName (const std::string& transcriptName,
+                                 bool complain=true) {
+        auto tid = findTranscriptID(transcriptName);
+        if (tid != INVALID) {
+            return geneName(tid);
+        } else {
+            std::cerr << "WARNING: couldn't find transcript named ["
+                      << transcriptName << "]; returning transcript "
+                      << " as it's own gene\n";
+            return transcriptName;
+        }
+    }
+
+    inline std::string transcriptName( Index transcriptID ) {
+        return _transcriptNames[transcriptID];
+    }
+};
+
+#endif // TRANSCRIPT_GENE_MAP_HPP
diff --git a/include/TranscriptGroup.hpp b/include/TranscriptGroup.hpp
new file mode 100644
index 0000000..5c1d163
--- /dev/null
+++ b/include/TranscriptGroup.hpp
@@ -0,0 +1,51 @@
+#ifndef TRANSCRIPT_GROUP_HPP
+#define TRANSCRIPT_GROUP_HPP
+
+#include <boost/functional/hash.hpp>
+
+#include <vector>
+#include <cstdint>
+
+class TranscriptGroup {
+    public:
+        TranscriptGroup();
+        TranscriptGroup(std::vector<uint32_t> txpsIn);
+
+	TranscriptGroup(
+        	std::vector<uint32_t> txpsIn,
+		    size_t hashIn);
+
+        TranscriptGroup(TranscriptGroup&& other);
+        TranscriptGroup(const TranscriptGroup& other);
+
+        TranscriptGroup& operator=(const TranscriptGroup& other);
+        TranscriptGroup& operator=(TranscriptGroup&& other);
+
+        friend bool operator==(const TranscriptGroup& lhs,
+                               const TranscriptGroup& rhs);
+
+        void setValid(bool v) const;
+
+        std::vector<uint32_t> txps;
+    	size_t hash;
+        double totalMass;
+        mutable bool valid;
+};
+
+bool operator==(const TranscriptGroup& lhs, const TranscriptGroup& rhs);
+
+struct TranscriptGroupHasher {
+    std::size_t operator()(const TranscriptGroup & k) const
+  {
+	return k.hash;
+    /*
+    std::size_t seed{0};
+    for (auto e : k.txps) {
+        boost::hash_combine(seed, e);
+    }
+    return seed;
+    */
+  }
+};
+
+#endif // TRANSCRIPT_GROUP_HPP
diff --git a/include/UnpairedRead.hpp b/include/UnpairedRead.hpp
new file mode 100644
index 0000000..9dd7ff7
--- /dev/null
+++ b/include/UnpairedRead.hpp
@@ -0,0 +1,90 @@
+#ifndef UNPAIRED_READ
+#define UNPAIRED_READ
+
+extern "C" {
+#ifdef HAVE_CONFIG_H
+#undef HAVE_CONFIG_H
+#endif
+
+#include "io_lib/scram.h"
+#include "io_lib/os.h"
+
+//#define HAVE_CONFIG_H
+}
+
+#include "StadenUtils.hpp"
+#include "SalmonMath.hpp"
+#include "LibraryFormat.hpp"
+
+struct UnpairedRead {
+   bam_seq_t* read = nullptr;
+   double logProb;
+   LibraryFormat libFmt{ReadType::PAIRED_END, ReadOrientation::NONE, ReadStrandedness::U};
+
+   UnpairedRead() : read(staden::utils::bam_init()), logProb(salmon::math::LOG_0) {}
+
+   UnpairedRead(bam_seq_t* r, double lp, LibraryFormat lf) :
+       read(r), logProb(lp), libFmt(lf) {}
+
+   UnpairedRead(UnpairedRead&& other) {
+       logProb = other.logProb;
+       std::swap(read, other.read);
+       libFmt = other.libFmt;
+   }
+
+   UnpairedRead& operator=(UnpairedRead&& other) {
+       logProb = other.logProb;
+       std::swap(read, other.read);
+       libFmt = other.libFmt;
+       return *this;
+   }
+
+   UnpairedRead(UnpairedRead& other) = default;
+
+   UnpairedRead& operator=(UnpairedRead& other) = default;
+
+   UnpairedRead* clone() {
+       return new UnpairedRead(bam_dup(read), logProb, libFmt);
+   }
+
+   ~UnpairedRead() { staden::utils::bam_destroy(read); }
+
+   inline LibraryFormat& libFormat() { return libFmt; }
+   inline bool isPaired() const { return false; }
+   inline bool isLeftOrphan() const { return false; }
+   inline bool isRightOrphan() const { return false; }
+
+    // return 0 on success, -1 on failure
+    int writeToFile(scram_fd* fp) {
+        return scram_put_seq(fp, read);
+    }
+
+    inline char* getName() {
+        return  bam_name(read);
+    }
+
+    inline uint32_t getNameLength() {
+        return bam_name_len(read);
+    }
+
+   inline bool isRight() const { return bam_flag(read) & BAM_FREVERSE; }
+   inline bool isLeft()  const { return !isRight(); }
+   inline int32_t left() const { return bam_pos(read); }
+   inline int32_t right() const { return left() + bam_seq_len(read); }
+   inline uint32_t fragLen() const { return 0; }
+   inline ReadType fragType() const { return ReadType::SINGLE_END; }
+   inline int32_t transcriptID() const { return bam_ref(read); }
+
+    inline double logQualProb() const {
+        return salmon::math::LOG_1;
+        /*
+        int q = bam_map_qual(read);
+        //double logP = (q == 255) ? calcQuality(read) : std::log(std::pow(10.0, -q * 0.1));
+        double logP = (q == 255) ? salmon::math::LOG_1 : std::log(std::pow(10.0, -q * 0.1));
+        return logP;
+        */
+    }
+
+};
+
+#endif //UNPAIRED_READ
diff --git a/include/VersionChecker.hpp b/include/VersionChecker.hpp
new file mode 100644
index 0000000..a9cb2f8
--- /dev/null
+++ b/include/VersionChecker.hpp
@@ -0,0 +1,52 @@
+// based off of 
+// async_client.cpp
+// ~~~~~~~~~~~~~~~~
+//
+// Copyright (c) 2003-2012 Christopher M. Kohlhoff (chris at kohlhoff dot com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+
+#ifndef VERSION_CHECKER_HPP
+#define VERSION_CHECKER_HPP
+
+#include <iostream>
+#include <istream>
+#include <ostream>
+#include <string>
+#include <sstream>
+#include <boost/asio.hpp>
+#include <boost/bind.hpp>
+#include <boost/date_time/posix_time/posix_time.hpp>
+
+using boost::asio::ip::tcp;
+
+class VersionChecker
+{
+public:
+  VersionChecker(boost::asio::io_service& io_service,
+      const std::string& server, const std::string& path);
+  std::string message();
+
+private:
+  void cancel_upgrade_check(const boost::system::error_code& err);
+  void handle_resolve(const boost::system::error_code& err,
+      tcp::resolver::iterator endpoint_iterator);
+  void handle_connect(const boost::system::error_code& err);
+  void handle_write_request(const boost::system::error_code& err);
+  void handle_read_status_line(const boost::system::error_code& err);
+  void handle_read_headers(const boost::system::error_code& err);
+  void handle_read_content(const boost::system::error_code& err);
+
+  tcp::resolver resolver_;
+  tcp::socket socket_;
+  boost::asio::streambuf request_;
+  boost::asio::streambuf response_;
+  boost::asio::deadline_timer deadline_;
+  std::stringstream messageStream_;
+};
+
+std::string getVersionMessage();
+
+#endif //VERSION_CHECKER_HPP
\ No newline at end of file
diff --git a/include/atomicops.h b/include/atomicops.h
new file mode 100644
index 0000000..aadd472
--- /dev/null
+++ b/include/atomicops.h
@@ -0,0 +1,276 @@
+// ©2013 Cameron Desrochers.
+// Distributed under the simplified BSD license (see the license file that
+// should have come with this header).
+
+#pragma once
+
+// Provides portable (VC++2010+, Intel ICC 13, GCC 4.7+, and anything C++11 compliant) implementation
+// of low-level memory barriers, plus a few semi-portable utility macros (for inlining and alignment).
+// Also has a basic atomic type (limited to hardware-supported atomics with no memory ordering guarantees).
+// Uses the AE_* prefix for macros (historical reasons), and the "moodycamel" namespace for symbols.
+
+#include <cassert>
+
+
+// Platform detection
+#if defined(__INTEL_COMPILER)
+#define AE_ICC
+#elif defined(_MSC_VER)
+#define AE_VCPP
+#elif defined(__GNUC__)
+#define AE_GCC
+#endif
+
+#if defined(_M_IA64) || defined(__ia64__)
+#define AE_ARCH_IA64
+#elif defined(_WIN64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64__)
+#define AE_ARCH_X64
+#elif defined(_M_IX86) || defined(__i386__)
+#define AE_ARCH_X86
+#elif defined(_M_PPC) || defined(__powerpc__)
+#define AE_ARCH_PPC
+#else
+#define AE_ARCH_UNKNOWN
+#endif
+
+
+// AE_UNUSED
+#define AE_UNUSED(x) ((void)x)
+
+
+// AE_FORCEINLINE
+#if defined(AE_VCPP) || defined(AE_ICC)
+#define AE_FORCEINLINE __forceinline
+#elif defined(AE_GCC)
+//#define AE_FORCEINLINE __attribute__((always_inline)) 
+#define AE_FORCEINLINE inline
+#else
+#define AE_FORCEINLINE inline
+#endif
+
+
+// AE_ALIGN
+#if defined(AE_VCPP) || defined(AE_ICC)
+#define AE_ALIGN(x) __declspec(align(x))
+#elif defined(AE_GCC)
+#define AE_ALIGN(x) __attribute__((aligned(x)))
+#else
+// Assume GCC compliant syntax...
+#define AE_ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+
+// Portable atomic fences implemented below:
+
+namespace moodycamel {
+
+enum memory_order {
+	memory_order_relaxed,
+	memory_order_acquire,
+	memory_order_release,
+	memory_order_acq_rel,
+	memory_order_seq_cst,
+
+	// memory_order_sync: Forces a full sync:
+	// #LoadLoad, #LoadStore, #StoreStore, and most significantly, #StoreLoad
+	memory_order_sync = memory_order_seq_cst
+};
+
+}    // end namespace moodycamel
+
+#if defined(AE_VCPP) || defined(AE_ICC)
+// VS2010 and ICC13 don't support std::atomic_*_fence, implement our own fences
+
+#include <intrin.h>
+
+#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
+#define AeFullSync _mm_mfence
+#define AeLiteSync _mm_mfence
+#elif defined(AE_ARCH_IA64)
+#define AeFullSync __mf
+#define AeLiteSync __mf
+#elif defined(AE_ARCH_PPC)
+#include <ppcintrinsics.h>
+#define AeFullSync __sync
+#define AeLiteSync __lwsync
+#endif
+
+
+#ifdef AE_VCPP
+#pragma warning(push)
+#pragma warning(disable: 4365)		// Disable erroneous 'conversion from long to unsigned int, signed/unsigned mismatch' error when using `assert`
+#endif
+
+namespace moodycamel {
+
+AE_FORCEINLINE void compiler_fence(memory_order order)
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: _ReadBarrier(); break;
+		case memory_order_release: _WriteBarrier(); break;
+		case memory_order_acq_rel: _ReadWriteBarrier(); break;
+		case memory_order_seq_cst: _ReadWriteBarrier(); break;
+		default: assert(false);
+	}
+}
+
+// x86/x64 have a strong memory model -- all loads and stores have
+// acquire and release semantics automatically (so only need compiler
+// barriers for those).
+#if defined(AE_ARCH_X86) || defined(AE_ARCH_X64)
+AE_FORCEINLINE void fence(memory_order order)
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: _ReadBarrier(); break;
+		case memory_order_release: _WriteBarrier(); break;
+		case memory_order_acq_rel: _ReadWriteBarrier(); break;
+		case memory_order_seq_cst:
+			_ReadWriteBarrier();
+			AeFullSync();
+			_ReadWriteBarrier();
+			break;
+		default: assert(false);
+	}
+}
+#else
+AE_FORCEINLINE void fence(memory_order order)
+{
+	// Non-specialized arch, use heavier memory barriers everywhere just in case :-(
+	switch (order) {
+		case memory_order_relaxed:
+			break;
+		case memory_order_acquire:
+			_ReadBarrier();
+			AeLiteSync();
+			_ReadBarrier();
+			break;
+		case memory_order_release:
+			_WriteBarrier();
+			AeLiteSync();
+			_WriteBarrier();
+			break;
+		case memory_order_acq_rel:
+			_ReadWriteBarrier();
+			AeLiteSync();
+			_ReadWriteBarrier();
+			break;
+		case memory_order_seq_cst:
+			_ReadWriteBarrier();
+			AeFullSync();
+			_ReadWriteBarrier();
+			break;
+		default: assert(false);
+	}
+}
+#endif
+}    // end namespace moodycamel
+#else
+// Use standard library of atomics
+#include <atomic>
+
+namespace moodycamel {
+
+AE_FORCEINLINE void compiler_fence(memory_order order)
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: std::atomic_signal_fence(std::memory_order_acquire); break;
+		case memory_order_release: std::atomic_signal_fence(std::memory_order_release); break;
+		case memory_order_acq_rel: std::atomic_signal_fence(std::memory_order_acq_rel); break;
+		case memory_order_seq_cst: std::atomic_signal_fence(std::memory_order_seq_cst); break;
+		default: assert(false);
+	}
+}
+
+AE_FORCEINLINE void fence(memory_order order)
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: std::atomic_thread_fence(std::memory_order_acquire); break;
+		case memory_order_release: std::atomic_thread_fence(std::memory_order_release); break;
+		case memory_order_acq_rel: std::atomic_thread_fence(std::memory_order_acq_rel); break;
+		case memory_order_seq_cst: std::atomic_thread_fence(std::memory_order_seq_cst); break;
+		default: assert(false);
+	}
+}
+
+}    // end namespace moodycamel
+
+#endif
+
+
+
+
+#if !defined(AE_VCPP) || _MSC_VER >= 1700
+#define AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+#endif
+
+#ifdef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+#include <atomic>
+#endif
+#include <utility>
+
+// WARNING: *NOT* A REPLACEMENT FOR std::atomic. READ CAREFULLY:
+// Provides basic support for atomic variables -- no memory ordering guarantees are provided.
+// The guarantee of atomicity is only made for types that already have atomic load and store guarantees
+// at the hardware level -- on most platforms this generally means aligned pointers and integers (only).
+namespace moodycamel {
+template<typename T>
+class weak_atomic
+{
+public:
+	weak_atomic() { }
+#ifdef AE_VCPP
+#pragma warning(disable: 4100)		// Get rid of (erroneous) 'unreferenced formal parameter' warning
+#endif
+	template<typename U> weak_atomic(U&& x) : value(std::forward<U>(x)) {  }
+	weak_atomic(weak_atomic const& other) : value(other.value) {  }
+	weak_atomic(weak_atomic&& other) : value(std::move(other.value)) {  }
+#ifdef AE_VCPP
+#pragma warning(default: 4100)
+#endif
+
+	AE_FORCEINLINE operator T() const { return load(); }
+
+	
+#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+	template<typename U> AE_FORCEINLINE weak_atomic const& operator=(U&& x) { value = std::forward<U>(x); return *this; }
+	AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) { value = other.value; return *this; }
+	
+	AE_FORCEINLINE T load() const { return value; }
+#else
+	template<typename U>
+	AE_FORCEINLINE weak_atomic const& operator=(U&& x)
+	{
+		value.store(std::forward<U>(x), std::memory_order_relaxed);
+		return *this;
+	}
+	
+	AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other)
+	{
+		value.store(other.value.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		return *this;
+	}
+
+	AE_FORCEINLINE T load() const { return value.load(std::memory_order_relaxed); }
+#endif
+	
+
+private:
+#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+	// No std::atomic support, but still need to circumvent compiler optimizations.
+	// `volatile` will make memory access slow, but is guaranteed to be reliable.
+	volatile T value;
+#else
+	std::atomic<T> value;
+#endif
+};
+
+}	// end namespace moodycamel
+
+
+#ifdef AE_VCPP
+#pragma warning(pop)
+#endif
diff --git a/include/btree.h b/include/btree.h
new file mode 100755
index 0000000..49310a2
--- /dev/null
+++ b/include/btree.h
@@ -0,0 +1,2394 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// A btree implementation of the STL set and map interfaces. A btree is both
+// smaller and faster than STL set/map. The red-black tree implementation of
+// STL set/map has an overhead of 3 pointers (left, right and parent) plus the
+// node color information for each stored value. So a set<int32> consumes 20
+// bytes for each value stored. This btree implementation stores multiple
+// values on fixed size nodes (usually 256 bytes) and doesn't store child
+// pointers for leaf nodes. The result is that a btree_set<int32> may use much
+// less memory per stored value. For the random insertion benchmark in
+// btree_test.cc, a btree_set<int32> with node-size of 256 uses 4.9 bytes per
+// stored value.
+//
+// The packing of multiple values on to each node of a btree has another effect
+// besides better space utilization: better cache locality due to fewer cache
+// lines being accessed. Better cache locality translates into faster
+// operations.
+//
+// CAVEATS
+//
+// Insertions and deletions on a btree can cause splitting, merging or
+// rebalancing of btree nodes. And even without these operations, insertions
+// and deletions on a btree will move values around within a node. In both
+// cases, the result is that insertions and deletions can invalidate iterators
+// pointing to values other than the one being inserted/deleted. This is
+// notably different from STL set/map which takes care to not invalidate
+// iterators on insert/erase except, of course, for iterators pointing to the
+// value being erased.  A partial workaround when erasing is available:
+// erase() returns an iterator pointing to the item just after the one that was
+// erased (or end() if none exists).  See also safe_btree.
+
+// PERFORMANCE
+//
+//   btree_bench --benchmarks=. 2>&1 | ./benchmarks.awk
+//
+// Run on pmattis-warp.nyc (4 X 2200 MHz CPUs); 2010/03/04-15:23:06
+// Benchmark                 STL(ns) B-Tree(ns) @    <size>
+// --------------------------------------------------------
+// BM_set_int32_insert        1516      608  +59.89%  <256>    [40.0,  5.2]
+// BM_set_int32_lookup        1160      414  +64.31%  <256>    [40.0,  5.2]
+// BM_set_int32_fulllookup     960      410  +57.29%  <256>    [40.0,  4.4]
+// BM_set_int32_delete        1741      528  +69.67%  <256>    [40.0,  5.2]
+// BM_set_int32_queueaddrem   3078     1046  +66.02%  <256>    [40.0,  5.5]
+// BM_set_int32_mixedaddrem   3600     1384  +61.56%  <256>    [40.0,  5.3]
+// BM_set_int32_fifo           227      113  +50.22%  <256>    [40.0,  4.4]
+// BM_set_int32_fwditer        158       26  +83.54%  <256>    [40.0,  5.2]
+// BM_map_int32_insert        1551      636  +58.99%  <256>    [48.0, 10.5]
+// BM_map_int32_lookup        1200      508  +57.67%  <256>    [48.0, 10.5]
+// BM_map_int32_fulllookup     989      487  +50.76%  <256>    [48.0,  8.8]
+// BM_map_int32_delete        1794      628  +64.99%  <256>    [48.0, 10.5]
+// BM_map_int32_queueaddrem   3189     1266  +60.30%  <256>    [48.0, 11.6]
+// BM_map_int32_mixedaddrem   3822     1623  +57.54%  <256>    [48.0, 10.9]
+// BM_map_int32_fifo           151      134  +11.26%  <256>    [48.0,  8.8]
+// BM_map_int32_fwditer        161       32  +80.12%  <256>    [48.0, 10.5]
+// BM_set_int64_insert        1546      636  +58.86%  <256>    [40.0, 10.5]
+// BM_set_int64_lookup        1200      512  +57.33%  <256>    [40.0, 10.5]
+// BM_set_int64_fulllookup     971      487  +49.85%  <256>    [40.0,  8.8]
+// BM_set_int64_delete        1745      616  +64.70%  <256>    [40.0, 10.5]
+// BM_set_int64_queueaddrem   3163     1195  +62.22%  <256>    [40.0, 11.6]
+// BM_set_int64_mixedaddrem   3760     1564  +58.40%  <256>    [40.0, 10.9]
+// BM_set_int64_fifo           146      103  +29.45%  <256>    [40.0,  8.8]
+// BM_set_int64_fwditer        162       31  +80.86%  <256>    [40.0, 10.5]
+// BM_map_int64_insert        1551      720  +53.58%  <256>    [48.0, 20.7]
+// BM_map_int64_lookup        1214      612  +49.59%  <256>    [48.0, 20.7]
+// BM_map_int64_fulllookup     994      592  +40.44%  <256>    [48.0, 17.2]
+// BM_map_int64_delete        1778      764  +57.03%  <256>    [48.0, 20.7]
+// BM_map_int64_queueaddrem   3189     1547  +51.49%  <256>    [48.0, 20.9]
+// BM_map_int64_mixedaddrem   3779     1887  +50.07%  <256>    [48.0, 21.6]
+// BM_map_int64_fifo           147      145   +1.36%  <256>    [48.0, 17.2]
+// BM_map_int64_fwditer        162       41  +74.69%  <256>    [48.0, 20.7]
+// BM_set_string_insert       1989     1966   +1.16%  <256>    [64.0, 44.5]
+// BM_set_string_lookup       1709     1600   +6.38%  <256>    [64.0, 44.5]
+// BM_set_string_fulllookup   1573     1529   +2.80%  <256>    [64.0, 35.4]
+// BM_set_string_delete       2520     1920  +23.81%  <256>    [64.0, 44.5]
+// BM_set_string_queueaddrem  4706     4309   +8.44%  <256>    [64.0, 48.3]
+// BM_set_string_mixedaddrem  5080     4654   +8.39%  <256>    [64.0, 46.7]
+// BM_set_string_fifo          318      512  -61.01%  <256>    [64.0, 35.4]
+// BM_set_string_fwditer       182       93  +48.90%  <256>    [64.0, 44.5]
+// BM_map_string_insert       2600     2227  +14.35%  <256>    [72.0, 55.8]
+// BM_map_string_lookup       2068     1730  +16.34%  <256>    [72.0, 55.8]
+// BM_map_string_fulllookup   1859     1618  +12.96%  <256>    [72.0, 44.0]
+// BM_map_string_delete       3168     2080  +34.34%  <256>    [72.0, 55.8]
+// BM_map_string_queueaddrem  5840     4701  +19.50%  <256>    [72.0, 59.4]
+// BM_map_string_mixedaddrem  6400     5200  +18.75%  <256>    [72.0, 57.8]
+// BM_map_string_fifo          398      596  -49.75%  <256>    [72.0, 44.0]
+// BM_map_string_fwditer       243      113  +53.50%  <256>    [72.0, 55.8]
+
+#ifndef UTIL_BTREE_BTREE_H__
+#define UTIL_BTREE_BTREE_H__
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <type_traits>
+#include <new>
+#include <ostream>
+#include <string>
+#include <utility>
+
+#ifndef NDEBUG
+#define NDEBUG 1
+#endif
+
+namespace btree {
+
+// Inside a btree method, if we just call swap(), it will choose the
+// btree::swap method, which we don't want. And we can't say ::swap
+// because then MSVC won't pickup any std::swap() implementations. We
+// can't just use std::swap() directly because then we don't get the
+// specialization for types outside the std namespace. So the solution
+// is to have a special swap helper function whose name doesn't
+// collide with other swap functions defined by the btree classes.
+template <typename T>
+inline void btree_swap_helper(T &a, T &b) {
+  using std::swap;
+  swap(a, b);
+}
+
+// A template helper used to select A or B based on a condition.
+template<bool cond, typename A, typename B>
+struct if_{
+  typedef A type;
+};
+
+template<typename A, typename B>
+struct if_<false, A, B> {
+  typedef B type;
+};
+
+// Types small_ and big_ are promise that sizeof(small_) < sizeof(big_)
+typedef char small_;
+
+struct big_ {
+  char dummy[2];
+};
+
+// A compile-time assertion.
+template <bool>
+struct CompileAssert {
+};
+
+#define COMPILE_ASSERT(expr, msg) \
+  typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
+
+// A helper type used to indicate that a key-compare-to functor has been
+// provided. A user can specify a key-compare-to functor by doing:
+//
+//  struct MyStringComparer
+//      : public util::btree::btree_key_compare_to_tag {
+//    int operator()(const string &a, const string &b) const {
+//      return a.compare(b);
+//    }
+//  };
+//
+// Note that the return type is an int and not a bool. There is a
+// COMPILE_ASSERT which enforces this return type.
+struct btree_key_compare_to_tag {
+};
+
+// A helper class that indicates if the Compare parameter is derived from
+// btree_key_compare_to_tag.
+template <typename Compare>
+struct btree_is_key_compare_to
+    : public std::is_convertible<Compare, btree_key_compare_to_tag> {
+};
+
+// A helper class to convert a boolean comparison into a three-way
+// "compare-to" comparison that returns a negative value to indicate
+// less-than, zero to indicate equality and a positive value to
+// indicate greater-than. This helper class is specialized for
+// less<string> and greater<string>. The btree_key_compare_to_adapter
+// class is provided so that btree users automatically get the more
+// efficient compare-to code when using common google string types
+// with common comparison functors.
+template <typename Compare>
+struct btree_key_compare_to_adapter : Compare {
+  btree_key_compare_to_adapter() { }
+  btree_key_compare_to_adapter(const Compare &c) : Compare(c) { }
+  btree_key_compare_to_adapter(const btree_key_compare_to_adapter<Compare> &c)
+      : Compare(c) {
+  }
+};
+
+template <>
+struct btree_key_compare_to_adapter<std::less<std::string> >
+    : public btree_key_compare_to_tag {
+  btree_key_compare_to_adapter() {}
+  btree_key_compare_to_adapter(const std::less<std::string>&) {}
+  btree_key_compare_to_adapter(
+      const btree_key_compare_to_adapter<std::less<std::string> >&) {}
+  int operator()(const std::string &a, const std::string &b) const {
+    return a.compare(b);
+  }
+};
+
+template <>
+struct btree_key_compare_to_adapter<std::greater<std::string> >
+    : public btree_key_compare_to_tag {
+  btree_key_compare_to_adapter() {}
+  btree_key_compare_to_adapter(const std::greater<std::string>&) {}
+  btree_key_compare_to_adapter(
+      const btree_key_compare_to_adapter<std::greater<std::string> >&) {}
+  int operator()(const std::string &a, const std::string &b) const {
+    return b.compare(a);
+  }
+};
+
+// A helper class that allows a compare-to functor to behave like a plain
+// compare functor. This specialization is used when we do not have a
+// compare-to functor.
+template <typename Key, typename Compare, bool HaveCompareTo>
+struct btree_key_comparer {
+  btree_key_comparer() {}
+  btree_key_comparer(Compare c) : comp(c) {}
+  static bool bool_compare(const Compare &comp, const Key &x, const Key &y) {
+    return comp(x, y);
+  }
+  bool operator()(const Key &x, const Key &y) const {
+    return bool_compare(comp, x, y);
+  }
+  Compare comp;
+};
+
+// A specialization of btree_key_comparer when a compare-to functor is
+// present. We need a plain (boolean) comparison in some parts of the btree
+// code, such as insert-with-hint.
+template <typename Key, typename Compare>
+struct btree_key_comparer<Key, Compare, true> {
+  btree_key_comparer() {}
+  btree_key_comparer(Compare c) : comp(c) {}
+  static bool bool_compare(const Compare &comp, const Key &x, const Key &y) {
+    return comp(x, y) < 0;
+  }
+  bool operator()(const Key &x, const Key &y) const {
+    return bool_compare(comp, x, y);
+  }
+  Compare comp;
+};
+
+// A helper function to compare to keys using the specified compare
+// functor. This dispatches to the appropriate btree_key_comparer comparison,
+// depending on whether we have a compare-to functor or not (which depends on
+// whether Compare is derived from btree_key_compare_to_tag).
+template <typename Key, typename Compare>
+static bool btree_compare_keys(
+    const Compare &comp, const Key &x, const Key &y) {
+  typedef btree_key_comparer<Key, Compare,
+      btree_is_key_compare_to<Compare>::value> key_comparer;
+  return key_comparer::bool_compare(comp, x, y);
+}
+
+template <typename Key, typename Compare,
+          typename Alloc, int TargetNodeSize, int ValueSize>
+struct btree_common_params {
+  // If Compare is derived from btree_key_compare_to_tag then use it as the
+  // key_compare type. Otherwise, use btree_key_compare_to_adapter<> which will
+  // fall-back to Compare if we don't have an appropriate specialization.
+  typedef typename if_<
+    btree_is_key_compare_to<Compare>::value,
+    Compare, btree_key_compare_to_adapter<Compare> >::type key_compare;
+  // A type which indicates if we have a key-compare-to functor or a plain old
+  // key-compare functor.
+  typedef btree_is_key_compare_to<key_compare> is_key_compare_to;
+
+  typedef Alloc allocator_type;
+  typedef Key key_type;
+  typedef ssize_t size_type;
+  typedef ptrdiff_t difference_type;
+
+  enum {
+    kTargetNodeSize = TargetNodeSize,
+
+    // Available space for values.  This is largest for leaf nodes,
+    // which has overhead no fewer than two pointers.
+    kNodeValueSpace = TargetNodeSize - 2 * sizeof(void*),
+  };
+
+  // This is an integral type large enough to hold as many
+  // ValueSize-values as will fit a node of TargetNodeSize bytes.
+  typedef typename if_<
+    (kNodeValueSpace / ValueSize) >= 256,
+    uint16_t,
+    uint8_t>::type node_count_type;
+};
+
+// A parameters structure for holding the type parameters for a btree_map.
+template <typename Key, typename Data, typename Compare,
+          typename Alloc, int TargetNodeSize>
+struct btree_map_params
+    : public btree_common_params<Key, Compare, Alloc, TargetNodeSize,
+                                 sizeof(Key) + sizeof(Data)> {
+  typedef Data data_type;
+  typedef Data mapped_type;
+  typedef std::pair<const Key, data_type> value_type;
+  typedef std::pair<Key, data_type> mutable_value_type;
+  typedef value_type* pointer;
+  typedef const value_type* const_pointer;
+  typedef value_type& reference;
+  typedef const value_type& const_reference;
+
+  enum {
+    kValueSize = sizeof(Key) + sizeof(data_type),
+  };
+
+  static const Key& key(const value_type &x) { return x.first; }
+  static const Key& key(const mutable_value_type &x) { return x.first; }
+  static void swap(mutable_value_type *a, mutable_value_type *b) {
+    btree_swap_helper(a->first, b->first);
+    btree_swap_helper(a->second, b->second);
+  }
+};
+
+// A parameters structure for holding the type parameters for a btree_set.
+template <typename Key, typename Compare, typename Alloc, int TargetNodeSize>
+struct btree_set_params
+    : public btree_common_params<Key, Compare, Alloc, TargetNodeSize,
+                                 sizeof(Key)> {
+  typedef std::false_type data_type;
+  typedef std::false_type mapped_type;
+  typedef Key value_type;
+  typedef value_type mutable_value_type;
+  typedef value_type* pointer;
+  typedef const value_type* const_pointer;
+  typedef value_type& reference;
+  typedef const value_type& const_reference;
+
+  enum {
+    kValueSize = sizeof(Key),
+  };
+
+  static const Key& key(const value_type &x) { return x; }
+  static void swap(mutable_value_type *a, mutable_value_type *b) {
+    btree_swap_helper<mutable_value_type>(*a, *b);
+  }
+};
+
+// An adapter class that converts a lower-bound compare into an upper-bound
+// compare.
+template <typename Key, typename Compare>
+struct btree_upper_bound_adapter : public Compare {
+  btree_upper_bound_adapter(Compare c) : Compare(c) {}
+  bool operator()(const Key &a, const Key &b) const {
+    return !static_cast<const Compare&>(*this)(b, a);
+  }
+};
+
+template <typename Key, typename CompareTo>
+struct btree_upper_bound_compare_to_adapter : public CompareTo {
+  btree_upper_bound_compare_to_adapter(CompareTo c) : CompareTo(c) {}
+  int operator()(const Key &a, const Key &b) const {
+    return static_cast<const CompareTo&>(*this)(b, a);
+  }
+};
+
+// Dispatch helper class for using linear search with plain compare.
+template <typename K, typename N, typename Compare>
+struct btree_linear_search_plain_compare {
+  static int lower_bound(const K &k, const N &n, Compare comp)  {
+    return n.linear_search_plain_compare(k, 0, n.count(), comp);
+  }
+  static int upper_bound(const K &k, const N &n, Compare comp)  {
+    typedef btree_upper_bound_adapter<K, Compare> upper_compare;
+    return n.linear_search_plain_compare(k, 0, n.count(), upper_compare(comp));
+  }
+};
+
+// Dispatch helper class for using linear search with compare-to
+template <typename K, typename N, typename CompareTo>
+struct btree_linear_search_compare_to {
+  static int lower_bound(const K &k, const N &n, CompareTo comp)  {
+    return n.linear_search_compare_to(k, 0, n.count(), comp);
+  }
+  static int upper_bound(const K &k, const N &n, CompareTo comp)  {
+    typedef btree_upper_bound_adapter<K,
+        btree_key_comparer<K, CompareTo, true> > upper_compare;
+    return n.linear_search_plain_compare(k, 0, n.count(), upper_compare(comp));
+  }
+};
+
+// Dispatch helper class for using binary search with plain compare.
+template <typename K, typename N, typename Compare>
+struct btree_binary_search_plain_compare {
+  static int lower_bound(const K &k, const N &n, Compare comp)  {
+    return n.binary_search_plain_compare(k, 0, n.count(), comp);
+  }
+  static int upper_bound(const K &k, const N &n, Compare comp)  {
+    typedef btree_upper_bound_adapter<K, Compare> upper_compare;
+    return n.binary_search_plain_compare(k, 0, n.count(), upper_compare(comp));
+  }
+};
+
+// Dispatch helper class for using binary search with compare-to.
+template <typename K, typename N, typename CompareTo>
+struct btree_binary_search_compare_to {
+  static int lower_bound(const K &k, const N &n, CompareTo comp)  {
+    return n.binary_search_compare_to(k, 0, n.count(), CompareTo());
+  }
+  static int upper_bound(const K &k, const N &n, CompareTo comp)  {
+    typedef btree_upper_bound_adapter<K,
+        btree_key_comparer<K, CompareTo, true> > upper_compare;
+    return n.linear_search_plain_compare(k, 0, n.count(), upper_compare(comp));
+  }
+};
+
+// A node in the btree holding. The same node type is used for both internal
+// and leaf nodes in the btree, though the nodes are allocated in such a way
+// that the children array is only valid in internal nodes.
+template <typename Params>
+class btree_node {
+ public:
+  typedef Params params_type;
+  typedef btree_node<Params> self_type;
+  typedef typename Params::key_type key_type;
+  typedef typename Params::data_type data_type;
+  typedef typename Params::value_type value_type;
+  typedef typename Params::mutable_value_type mutable_value_type;
+  typedef typename Params::pointer pointer;
+  typedef typename Params::const_pointer const_pointer;
+  typedef typename Params::reference reference;
+  typedef typename Params::const_reference const_reference;
+  typedef typename Params::key_compare key_compare;
+  typedef typename Params::size_type size_type;
+  typedef typename Params::difference_type difference_type;
+  // Typedefs for the various types of node searches.
+  typedef btree_linear_search_plain_compare<
+    key_type, self_type, key_compare> linear_search_plain_compare_type;
+  typedef btree_linear_search_compare_to<
+    key_type, self_type, key_compare> linear_search_compare_to_type;
+  typedef btree_binary_search_plain_compare<
+    key_type, self_type, key_compare> binary_search_plain_compare_type;
+  typedef btree_binary_search_compare_to<
+    key_type, self_type, key_compare> binary_search_compare_to_type;
+  // If we have a valid key-compare-to type, use linear_search_compare_to,
+  // otherwise use linear_search_plain_compare.
+  typedef typename if_<
+    Params::is_key_compare_to::value,
+    linear_search_compare_to_type,
+    linear_search_plain_compare_type>::type linear_search_type;
+  // If we have a valid key-compare-to type, use binary_search_compare_to,
+  // otherwise use binary_search_plain_compare.
+  typedef typename if_<
+    Params::is_key_compare_to::value,
+    binary_search_compare_to_type,
+    binary_search_plain_compare_type>::type binary_search_type;
+  // If the key is an integral or floating point type, use linear search which
+  // is faster than binary search for such types. Might be wise to also
+  // configure linear search based on node-size.
+  typedef typename if_<
+    std::is_integral<key_type>::value ||
+    std::is_floating_point<key_type>::value,
+    linear_search_type, binary_search_type>::type search_type;
+
+  struct base_fields {
+    typedef typename Params::node_count_type field_type;
+
+    // A boolean indicating whether the node is a leaf or not.
+    bool leaf;
+    // The position of the node in the node's parent.
+    field_type position;
+    // The maximum number of values the node can hold.
+    field_type max_count;
+    // The count of the number of values in the node.
+    field_type count;
+    // A pointer to the node's parent.
+    btree_node *parent;
+  };
+
+  enum {
+    kValueSize = params_type::kValueSize,
+    kTargetNodeSize = params_type::kTargetNodeSize,
+
+    // Compute how many values we can fit onto a leaf node.
+    kNodeTargetValues = (kTargetNodeSize - sizeof(base_fields)) / kValueSize,
+    // We need a minimum of 3 values per internal node in order to perform
+    // splitting (1 value for the two nodes involved in the split and 1 value
+    // propagated to the parent as the delimiter for the split).
+    kNodeValues = kNodeTargetValues >= 3 ? kNodeTargetValues : 3,
+
+    kExactMatch = 1 << 30,
+    kMatchMask = kExactMatch - 1,
+  };
+
+  struct leaf_fields : public base_fields {
+    // The array of values. Only the first count of these values have been
+    // constructed and are valid.
+    mutable_value_type values[kNodeValues];
+  };
+
+  struct internal_fields : public leaf_fields {
+    // The array of child pointers. The keys in children_[i] are all less than
+    // key(i). The keys in children_[i + 1] are all greater than key(i). There
+    // are always count + 1 children.
+    btree_node *children[kNodeValues + 1];
+  };
+
+  struct root_fields : public internal_fields {
+    btree_node *rightmost;
+    size_type size;
+  };
+
+ public:
+  // Getter/setter for whether this is a leaf node or not. This value doesn't
+  // change after the node is created.
+  bool leaf() const { return fields_.leaf; }
+
+  // Getter for the position of this node in its parent.
+  int position() const { return fields_.position; }
+  void set_position(int v) { fields_.position = v; }
+
+  // Getter/setter for the number of values stored in this node.
+  int count() const { return fields_.count; }
+  void set_count(int v) { fields_.count = v; }
+  int max_count() const { return fields_.max_count; }
+
+  // Getter for the parent of this node.
+  btree_node* parent() const { return fields_.parent; }
+  // Getter for whether the node is the root of the tree. The parent of the
+  // root of the tree is the leftmost node in the tree which is guaranteed to
+  // be a leaf.
+  bool is_root() const { return parent()->leaf(); }
+  void make_root() {
+    assert(parent()->is_root());
+    fields_.parent = fields_.parent->parent();
+  }
+
+  // Getter for the rightmost root node field. Only valid on the root node.
+  btree_node* rightmost() const { return fields_.rightmost; }
+  btree_node** mutable_rightmost() { return &fields_.rightmost; }
+
+  // Getter for the size root node field. Only valid on the root node.
+  size_type size() const { return fields_.size; }
+  size_type* mutable_size() { return &fields_.size; }
+
+  // Getters for the key/value at position i in the node.
+  const key_type& key(int i) const {
+    return params_type::key(fields_.values[i]);
+  }
+  reference value(int i) {
+    return reinterpret_cast<reference>(fields_.values[i]);
+  }
+  const_reference value(int i) const {
+    return reinterpret_cast<const_reference>(fields_.values[i]);
+  }
+  mutable_value_type* mutable_value(int i) {
+    return &fields_.values[i];
+  }
+
+  // Swap value i in this node with value j in node x.
+  void value_swap(int i, btree_node *x, int j) {
+    params_type::swap(mutable_value(i), x->mutable_value(j));
+  }
+
+  // Getters/setter for the child at position i in the node.
+  btree_node* child(int i) const { return fields_.children[i]; }
+  btree_node** mutable_child(int i) { return &fields_.children[i]; }
+  void set_child(int i, btree_node *c) {
+    *mutable_child(i) = c;
+    c->fields_.parent = this;
+    c->fields_.position = i;
+  }
+
+  // Returns the position of the first value whose key is not less than k.
+  template <typename Compare>
+  int lower_bound(const key_type &k, const Compare &comp) const {
+    return search_type::lower_bound(k, *this, comp);
+  }
+  // Returns the position of the first value whose key is greater than k.
+  template <typename Compare>
+  int upper_bound(const key_type &k, const Compare &comp) const {
+    return search_type::upper_bound(k, *this, comp);
+  }
+
+  // Returns the position of the first value whose key is not less than k using
+  // linear search performed using plain compare.
+  template <typename Compare>
+  int linear_search_plain_compare(
+      const key_type &k, int s, int e, const Compare &comp) const {
+    while (s < e) {
+      if (!btree_compare_keys(comp, key(s), k)) {
+        break;
+      }
+      ++s;
+    }
+    return s;
+  }
+
+  // Returns the position of the first value whose key is not less than k using
+  // linear search performed using compare-to.
+  template <typename Compare>
+  int linear_search_compare_to(
+      const key_type &k, int s, int e, const Compare &comp) const {
+    while (s < e) {
+      int c = comp(key(s), k);
+      if (c == 0) {
+        return s | kExactMatch;
+      } else if (c > 0) {
+        break;
+      }
+      ++s;
+    }
+    return s;
+  }
+
+  // Returns the position of the first value whose key is not less than k using
+  // binary search performed using plain compare.
+  template <typename Compare>
+  int binary_search_plain_compare(
+      const key_type &k, int s, int e, const Compare &comp) const {
+    while (s != e) {
+      int mid = (s + e) / 2;
+      if (btree_compare_keys(comp, key(mid), k)) {
+        s = mid + 1;
+      } else {
+        e = mid;
+      }
+    }
+    return s;
+  }
+
+  // Returns the position of the first value whose key is not less than k using
+  // binary search performed using compare-to.
+  template <typename CompareTo>
+  int binary_search_compare_to(
+      const key_type &k, int s, int e, const CompareTo &comp) const {
+    while (s != e) {
+      int mid = (s + e) / 2;
+      int c = comp(key(mid), k);
+      if (c < 0) {
+        s = mid + 1;
+      } else if (c > 0) {
+        e = mid;
+      } else {
+        // Need to return the first value whose key is not less than k, which
+        // requires continuing the binary search. Note that we are guaranteed
+        // that the result is an exact match because if "key(mid-1) < k" the
+        // call to binary_search_compare_to() will return "mid".
+        s = binary_search_compare_to(k, s, mid, comp);
+        return s | kExactMatch;
+      }
+    }
+    return s;
+  }
+
+  // Inserts the value x at position i, shifting all existing values and
+  // children at positions >= i to the right by 1.
+  void insert_value(int i, const value_type &x);
+
+  // Removes the value at position i, shifting all existing values and children
+  // at positions > i to the left by 1.
+  void remove_value(int i);
+
+  // Rebalances a node with its right sibling.
+  void rebalance_right_to_left(btree_node *sibling, int to_move);
+  void rebalance_left_to_right(btree_node *sibling, int to_move);
+
+  // Splits a node, moving a portion of the node's values to its right sibling.
+  void split(btree_node *sibling, int insert_position);
+
+  // Merges a node with its right sibling, moving all of the values and the
+  // delimiting key in the parent node onto itself.
+  void merge(btree_node *sibling);
+
+  // Swap the contents of "this" and "src".
+  void swap(btree_node *src);
+
+  // Node allocation/deletion routines.
+  static btree_node* init_leaf(
+      leaf_fields *f, btree_node *parent, int max_count) {
+    btree_node *n = reinterpret_cast<btree_node*>(f);
+    f->leaf = 1;
+    f->position = 0;
+    f->max_count = max_count;
+    f->count = 0;
+    f->parent = parent;
+    if (!NDEBUG) {
+      memset(&f->values, 0, max_count * sizeof(value_type));
+    }
+    return n;
+  }
+  static btree_node* init_internal(internal_fields *f, btree_node *parent) {
+    btree_node *n = init_leaf(f, parent, kNodeValues);
+    f->leaf = 0;
+    if (!NDEBUG) {
+      memset(f->children, 0, sizeof(f->children));
+    }
+    return n;
+  }
+  static btree_node* init_root(root_fields *f, btree_node *parent) {
+    btree_node *n = init_internal(f, parent);
+    f->rightmost = parent;
+    f->size = parent->count();
+    return n;
+  }
+  void destroy() {
+    for (int i = 0; i < count(); ++i) {
+      value_destroy(i);
+    }
+  }
+
+ private:
+  void value_init(int i) {
+    new (&fields_.values[i]) mutable_value_type;
+  }
+  void value_init(int i, const value_type &x) {
+    new (&fields_.values[i]) mutable_value_type(x);
+  }
+  void value_destroy(int i) {
+    fields_.values[i].~mutable_value_type();
+  }
+
+ private:
+  root_fields fields_;
+
+ private:
+  btree_node(const btree_node&);
+  void operator=(const btree_node&);
+};
+
+template <typename Node, typename Reference, typename Pointer>
+struct btree_iterator {
+  typedef typename Node::key_type key_type;
+  typedef typename Node::size_type size_type;
+  typedef typename Node::difference_type difference_type;
+  typedef typename Node::params_type params_type;
+
+  typedef Node node_type;
+  typedef typename std::remove_const<Node>::type normal_node;
+  typedef const Node const_node;
+  typedef typename params_type::value_type value_type;
+  typedef typename params_type::pointer normal_pointer;
+  typedef typename params_type::reference normal_reference;
+  typedef typename params_type::const_pointer const_pointer;
+  typedef typename params_type::const_reference const_reference;
+
+  typedef Pointer pointer;
+  typedef Reference reference;
+  typedef std::bidirectional_iterator_tag iterator_category;
+
+  typedef btree_iterator<
+    normal_node, normal_reference, normal_pointer> iterator;
+  typedef btree_iterator<
+    const_node, const_reference, const_pointer> const_iterator;
+  typedef btree_iterator<Node, Reference, Pointer> self_type;
+
+  btree_iterator()
+      : node(NULL),
+        position(-1) {
+  }
+  btree_iterator(Node *n, int p)
+      : node(n),
+        position(p) {
+  }
+  btree_iterator(const iterator &x)
+      : node(x.node),
+        position(x.position) {
+  }
+
+  // Increment/decrement the iterator.
+  void increment() {
+    if (node->leaf() && ++position < node->count()) {
+      return;
+    }
+    increment_slow();
+  }
+  void increment_by(int count);
+  void increment_slow();
+
+  void decrement() {
+    if (node->leaf() && --position >= 0) {
+      return;
+    }
+    decrement_slow();
+  }
+  void decrement_slow();
+
+  bool operator==(const const_iterator &x) const {
+    return node == x.node && position == x.position;
+  }
+  bool operator!=(const const_iterator &x) const {
+    return node != x.node || position != x.position;
+  }
+
+  // Accessors for the key/value the iterator is pointing at.
+  const key_type& key() const {
+    return node->key(position);
+  }
+  reference operator*() const {
+    return node->value(position);
+  }
+  pointer operator->() const {
+    return &node->value(position);
+  }
+
+  self_type& operator++() {
+    increment();
+    return *this;
+  }
+  self_type& operator--() {
+    decrement();
+    return *this;
+  }
+  self_type operator++(int) {
+    self_type tmp = *this;
+    ++*this;
+    return tmp;
+  }
+  self_type operator--(int) {
+    self_type tmp = *this;
+    --*this;
+    return tmp;
+  }
+
+  // The node in the tree the iterator is pointing at.
+  Node *node;
+  // The position within the node of the tree the iterator is pointing at.
+  int position;
+};
+
+// Dispatch helper class for using btree::internal_locate with plain compare.
+struct btree_internal_locate_plain_compare {
+  template <typename K, typename T, typename Iter>
+  static std::pair<Iter, int> dispatch(const K &k, const T &t, Iter iter) {
+    return t.internal_locate_plain_compare(k, iter);
+  }
+};
+
+// Dispatch helper class for using btree::internal_locate with compare-to.
+struct btree_internal_locate_compare_to {
+  template <typename K, typename T, typename Iter>
+  static std::pair<Iter, int> dispatch(const K &k, const T &t, Iter iter) {
+    return t.internal_locate_compare_to(k, iter);
+  }
+};
+
+template <typename Params>
+class btree : public Params::key_compare {
+  typedef btree<Params> self_type;
+  typedef btree_node<Params> node_type;
+  typedef typename node_type::base_fields base_fields;
+  typedef typename node_type::leaf_fields leaf_fields;
+  typedef typename node_type::internal_fields internal_fields;
+  typedef typename node_type::root_fields root_fields;
+  typedef typename Params::is_key_compare_to is_key_compare_to;
+
+  friend class btree_internal_locate_plain_compare;
+  friend class btree_internal_locate_compare_to;
+  typedef typename if_<
+    is_key_compare_to::value,
+    btree_internal_locate_compare_to,
+    btree_internal_locate_plain_compare>::type internal_locate_type;
+
+  enum {
+    kNodeValues = node_type::kNodeValues,
+    kMinNodeValues = kNodeValues / 2,
+    kValueSize = node_type::kValueSize,
+    kExactMatch = node_type::kExactMatch,
+    kMatchMask = node_type::kMatchMask,
+  };
+
+  // A helper class to get the empty base class optimization for 0-size
+  // allocators. Base is internal_allocator_type.
+  // (e.g. empty_base_handle<internal_allocator_type, node_type*>). If Base is
+  // 0-size, the compiler doesn't have to reserve any space for it and
+  // sizeof(empty_base_handle) will simply be sizeof(Data). Google [empty base
+  // class optimization] for more details.
+  template <typename Base, typename Data>
+  struct empty_base_handle : public Base {
+    empty_base_handle(const Base &b, const Data &d)
+        : Base(b),
+          data(d) {
+    }
+    Data data;
+  };
+
+  struct node_stats {
+    node_stats(ssize_t l, ssize_t i)
+        : leaf_nodes(l),
+          internal_nodes(i) {
+    }
+
+    node_stats& operator+=(const node_stats &x) {
+      leaf_nodes += x.leaf_nodes;
+      internal_nodes += x.internal_nodes;
+      return *this;
+    }
+
+    ssize_t leaf_nodes;
+    ssize_t internal_nodes;
+  };
+
+ public:
+  typedef Params params_type;
+  typedef typename Params::key_type key_type;
+  typedef typename Params::data_type data_type;
+  typedef typename Params::mapped_type mapped_type;
+  typedef typename Params::value_type value_type;
+  typedef typename Params::key_compare key_compare;
+  typedef typename Params::pointer pointer;
+  typedef typename Params::const_pointer const_pointer;
+  typedef typename Params::reference reference;
+  typedef typename Params::const_reference const_reference;
+  typedef typename Params::size_type size_type;
+  typedef typename Params::difference_type difference_type;
+  typedef btree_iterator<node_type, reference, pointer> iterator;
+  typedef typename iterator::const_iterator const_iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+
+  typedef typename Params::allocator_type allocator_type;
+  typedef typename allocator_type::template rebind<char>::other
+    internal_allocator_type;
+
+ public:
+  // Default constructor.
+  btree(const key_compare &comp, const allocator_type &alloc);
+
+  // Copy constructor.
+  btree(const self_type &x);
+
+  // Destructor.
+  ~btree() {
+    clear();
+  }
+
+  // Iterator routines.
+  iterator begin() {
+    return iterator(leftmost(), 0);
+  }
+  const_iterator begin() const {
+    return const_iterator(leftmost(), 0);
+  }
+  iterator end() {
+    return iterator(rightmost(), rightmost() ? rightmost()->count() : 0);
+  }
+  const_iterator end() const {
+    return const_iterator(rightmost(), rightmost() ? rightmost()->count() : 0);
+  }
+  reverse_iterator rbegin() {
+    return reverse_iterator(end());
+  }
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
+  }
+  reverse_iterator rend() {
+    return reverse_iterator(begin());
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
+  }
+
+  // Finds the first element whose key is not less than key.
+  iterator lower_bound(const key_type &key) {
+    return internal_end(
+        internal_lower_bound(key, iterator(root(), 0)));
+  }
+  const_iterator lower_bound(const key_type &key) const {
+    return internal_end(
+        internal_lower_bound(key, const_iterator(root(), 0)));
+  }
+
+  // Finds the first element whose key is greater than key.
+  iterator upper_bound(const key_type &key) {
+    return internal_end(
+        internal_upper_bound(key, iterator(root(), 0)));
+  }
+  const_iterator upper_bound(const key_type &key) const {
+    return internal_end(
+        internal_upper_bound(key, const_iterator(root(), 0)));
+  }
+
+  // Finds the range of values which compare equal to key. The first member of
+  // the returned pair is equal to lower_bound(key). The second member pair of
+  // the pair is equal to upper_bound(key).
+  std::pair<iterator,iterator> equal_range(const key_type &key) {
+    return std::make_pair(lower_bound(key), upper_bound(key));
+  }
+  std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const {
+    return std::make_pair(lower_bound(key), upper_bound(key));
+  }
+
+  // Inserts a value into the btree only if it does not already exist. The
+  // boolean return value indicates whether insertion succeeded or failed. The
+  // ValuePointer type is used to avoid instatiating the value unless the key
+  // is being inserted. Value is not dereferenced if the key already exists in
+  // the btree. See btree_map::operator[].
+  template <typename ValuePointer>
+  std::pair<iterator,bool> insert_unique(const key_type &key, ValuePointer value);
+
+  // Inserts a value into the btree only if it does not already exist. The
+  // boolean return value indicates whether insertion succeeded or failed.
+  std::pair<iterator,bool> insert_unique(const value_type &v) {
+    return insert_unique(params_type::key(v), &v);
+  }
+
+  // Insert with hint. Check to see if the value should be placed immediately
+  // before position in the tree. If it does, then the insertion will take
+  // amortized constant time. If not, the insertion will take amortized
+  // logarithmic time as if a call to insert_unique(v) were made.
+  iterator insert_unique(iterator position, const value_type &v);
+
+  // Insert a range of values into the btree.
+  template <typename InputIterator>
+  void insert_unique(InputIterator b, InputIterator e);
+
+  // Inserts a value into the btree. The ValuePointer type is used to avoid
+  // instatiating the value unless the key is being inserted. Value is not
+  // dereferenced if the key already exists in the btree. See
+  // btree_map::operator[].
+  template <typename ValuePointer>
+  iterator insert_multi(const key_type &key, ValuePointer value);
+
+  // Inserts a value into the btree.
+  iterator insert_multi(const value_type &v) {
+    return insert_multi(params_type::key(v), &v);
+  }
+
+  // Insert with hint. Check to see if the value should be placed immediately
+  // before position in the tree. If it does, then the insertion will take
+  // amortized constant time. If not, the insertion will take amortized
+  // logarithmic time as if a call to insert_multi(v) were made.
+  iterator insert_multi(iterator position, const value_type &v);
+
+  // Insert a range of values into the btree.
+  template <typename InputIterator>
+  void insert_multi(InputIterator b, InputIterator e);
+
+  void assign(const self_type &x);
+
+  // Erase the specified iterator from the btree. The iterator must be valid
+  // (i.e. not equal to end()).  Return an iterator pointing to the node after
+  // the one that was erased (or end() if none exists).
+  iterator erase(iterator iter);
+
+  // Erases range. Returns the number of keys erased.
+  int erase(iterator begin, iterator end);
+
+  // Erases the specified key from the btree. Returns 1 if an element was
+  // erased and 0 otherwise.
+  int erase_unique(const key_type &key);
+
+  // Erases all of the entries matching the specified key from the
+  // btree. Returns the number of elements erased.
+  int erase_multi(const key_type &key);
+
+  // Finds the iterator corresponding to a key or returns end() if the key is
+  // not present.
+  iterator find_unique(const key_type &key) {
+    return internal_end(
+        internal_find_unique(key, iterator(root(), 0)));
+  }
+  const_iterator find_unique(const key_type &key) const {
+    return internal_end(
+        internal_find_unique(key, const_iterator(root(), 0)));
+  }
+  iterator find_multi(const key_type &key) {
+    return internal_end(
+        internal_find_multi(key, iterator(root(), 0)));
+  }
+  const_iterator find_multi(const key_type &key) const {
+    return internal_end(
+        internal_find_multi(key, const_iterator(root(), 0)));
+  }
+
+  // Returns a count of the number of times the key appears in the btree.
+  size_type count_unique(const key_type &key) const {
+    const_iterator begin = internal_find_unique(
+        key, const_iterator(root(), 0));
+    if (!begin.node) {
+      // The key doesn't exist in the tree.
+      return 0;
+    }
+    return 1;
+  }
+  // Returns a count of the number of times the key appears in the btree.
+  size_type count_multi(const key_type &key) const {
+    return distance(lower_bound(key), upper_bound(key));
+  }
+
+  // Clear the btree, deleting all of the values it contains.
+  void clear();
+
+  // Swap the contents of *this and x.
+  void swap(self_type &x);
+
+  // Assign the contents of x to *this.
+  self_type& operator=(const self_type &x) {
+    if (&x == this) {
+      // Don't copy onto ourselves.
+      return *this;
+    }
+    assign(x);
+    return *this;
+  }
+
+  key_compare* mutable_key_comp() {
+    return this;
+  }
+  const key_compare& key_comp() const {
+    return *this;
+  }
+  bool compare_keys(const key_type &x, const key_type &y) const {
+    return btree_compare_keys(key_comp(), x, y);
+  }
+
+  // Dump the btree to the specified ostream. Requires that operator<< is
+  // defined for Key and Value.
+  void dump(std::ostream &os) const {
+    if (root() != NULL) {
+      internal_dump(os, root(), 0);
+    }
+  }
+
+  // Verifies the structure of the btree.
+  void verify() const;
+
+  // Size routines. Note that empty() is slightly faster than doing size()==0.
+  size_type size() const {
+    if (empty()) return 0;
+    if (root()->leaf()) return root()->count();
+    return root()->size();
+  }
+  size_type max_size() const { return std::numeric_limits<size_type>::max(); }
+  bool empty() const { return root() == NULL; }
+
+  // The height of the btree. An empty tree will have height 0.
+  size_type height() const {
+    size_type h = 0;
+    if (root()) {
+      // Count the length of the chain from the leftmost node up to the
+      // root. We actually count from the root back around to the level below
+      // the root, but the calculation is the same because of the circularity
+      // of that traversal.
+      const node_type *n = root();
+      do {
+        ++h;
+        n = n->parent();
+      } while (n != root());
+    }
+    return h;
+  }
+
+  // The number of internal, leaf and total nodes used by the btree.
+  size_type leaf_nodes() const {
+    return internal_stats(root()).leaf_nodes;
+  }
+  size_type internal_nodes() const {
+    return internal_stats(root()).internal_nodes;
+  }
+  size_type nodes() const {
+    node_stats stats = internal_stats(root());
+    return stats.leaf_nodes + stats.internal_nodes;
+  }
+
+  // The total number of bytes used by the btree.
+  size_type bytes_used() const {
+    node_stats stats = internal_stats(root());
+    if (stats.leaf_nodes == 1 && stats.internal_nodes == 0) {
+      return sizeof(*this) +
+          sizeof(base_fields) + root()->max_count() * sizeof(value_type);
+    } else {
+      return sizeof(*this) +
+          sizeof(root_fields) - sizeof(internal_fields) +
+          stats.leaf_nodes * sizeof(leaf_fields) +
+          stats.internal_nodes * sizeof(internal_fields);
+    }
+  }
+
+  // The average number of bytes used per value stored in the btree.
+  static double average_bytes_per_value() {
+    // Returns the number of bytes per value on a leaf node that is 75%
+    // full. Experimentally, this matches up nicely with the computed number of
+    // bytes per value in trees that had their values inserted in random order.
+    return sizeof(leaf_fields) / (kNodeValues * 0.75);
+  }
+
+  // The fullness of the btree. Computed as the number of elements in the btree
+  // divided by the maximum number of elements a tree with the current number
+  // of nodes could hold. A value of 1 indicates perfect space
+  // utilization. Smaller values indicate space wastage.
+  double fullness() const {
+    return double(size()) / (nodes() * kNodeValues);
+  }
+  // The overhead of the btree structure in bytes per node. Computed as the
+  // total number of bytes used by the btree minus the number of bytes used for
+  // storing elements divided by the number of elements.
+  double overhead() const {
+    if (empty()) {
+      return 0.0;
+    }
+    return (bytes_used() - size() * kValueSize) / double(size());
+  }
+
+ private:
+  // Internal accessor routines.
+  node_type* root() { return root_.data; }
+  const node_type* root() const { return root_.data; }
+  node_type** mutable_root() { return &root_.data; }
+
+  // The rightmost node is stored in the root node.
+  node_type* rightmost() {
+    return (!root() || root()->leaf()) ? root() : root()->rightmost();
+  }
+  const node_type* rightmost() const {
+    return (!root() || root()->leaf()) ? root() : root()->rightmost();
+  }
+  node_type** mutable_rightmost() { return root()->mutable_rightmost(); }
+
+  // The leftmost node is stored as the parent of the root node.
+  node_type* leftmost() { return root() ? root()->parent() : NULL; }
+  const node_type* leftmost() const { return root() ? root()->parent() : NULL; }
+
+  // The size of the tree is stored in the root node.
+  size_type* mutable_size() { return root()->mutable_size(); }
+
+  // Allocator routines.
+  internal_allocator_type* mutable_internal_allocator() {
+    return static_cast<internal_allocator_type*>(&root_);
+  }
+  const internal_allocator_type& internal_allocator() const {
+    return *static_cast<const internal_allocator_type*>(&root_);
+  }
+
+  // Node creation/deletion routines.
+  node_type* new_internal_node(node_type *parent) {
+    internal_fields *p = reinterpret_cast<internal_fields*>(
+        mutable_internal_allocator()->allocate(sizeof(internal_fields)));
+    return node_type::init_internal(p, parent);
+  }
+  node_type* new_internal_root_node() {
+    root_fields *p = reinterpret_cast<root_fields*>(
+        mutable_internal_allocator()->allocate(sizeof(root_fields)));
+    return node_type::init_root(p, root()->parent());
+  }
+  node_type* new_leaf_node(node_type *parent) {
+    leaf_fields *p = reinterpret_cast<leaf_fields*>(
+        mutable_internal_allocator()->allocate(sizeof(leaf_fields)));
+    return node_type::init_leaf(p, parent, kNodeValues);
+  }
+  node_type* new_leaf_root_node(int max_count) {
+    leaf_fields *p = reinterpret_cast<leaf_fields*>(
+        mutable_internal_allocator()->allocate(
+            sizeof(base_fields) + max_count * sizeof(value_type)));
+    return node_type::init_leaf(p, reinterpret_cast<node_type*>(p), max_count);
+  }
+  void delete_internal_node(node_type *node) {
+    node->destroy();
+    assert(node != root());
+    mutable_internal_allocator()->deallocate(
+        reinterpret_cast<char*>(node), sizeof(internal_fields));
+  }
+  void delete_internal_root_node() {
+    root()->destroy();
+    mutable_internal_allocator()->deallocate(
+        reinterpret_cast<char*>(root()), sizeof(root_fields));
+  }
+  void delete_leaf_node(node_type *node) {
+    node->destroy();
+    mutable_internal_allocator()->deallocate(
+        reinterpret_cast<char*>(node),
+        sizeof(base_fields) + node->max_count() * sizeof(value_type));
+  }
+
+  // Rebalances or splits the node iter points to.
+  void rebalance_or_split(iterator *iter);
+
+  // Merges the values of left, right and the delimiting key on their parent
+  // onto left, removing the delimiting key and deleting right.
+  void merge_nodes(node_type *left, node_type *right);
+
+  // Tries to merge node with its left or right sibling, and failing that,
+  // rebalance with its left or right sibling. Returns true if a merge
+  // occurred, at which point it is no longer valid to access node. Returns
+  // false if no merging took place.
+  bool try_merge_or_rebalance(iterator *iter);
+
+  // Tries to shrink the height of the tree by 1.
+  void try_shrink();
+
+  iterator internal_end(iterator iter) {
+    return iter.node ? iter : end();
+  }
+  const_iterator internal_end(const_iterator iter) const {
+    return iter.node ? iter : end();
+  }
+
+  // Inserts a value into the btree immediately before iter. Requires that
+  // key(v) <= iter.key() and (--iter).key() <= key(v).
+  iterator internal_insert(iterator iter, const value_type &v);
+
+  // Returns an iterator pointing to the first value >= the value "iter" is
+  // pointing at. Note that "iter" might be pointing to an invalid location as
+  // iter.position == iter.node->count(). This routine simply moves iter up in
+  // the tree to a valid location.
+  template <typename IterType>
+  static IterType internal_last(IterType iter);
+
+  // Returns an iterator pointing to the leaf position at which key would
+  // reside in the tree. We provide 2 versions of internal_locate. The first
+  // version (internal_locate_plain_compare) always returns 0 for the second
+  // field of the pair. The second version (internal_locate_compare_to) is for
+  // the key-compare-to specialization and returns either kExactMatch (if the
+  // key was found in the tree) or -kExactMatch (if it wasn't) in the second
+  // field of the pair. The compare_to specialization allows the caller to
+  // avoid a subsequent comparison to determine if an exact match was made,
+  // speeding up string keys.
+  template <typename IterType>
+  std::pair<IterType, int> internal_locate(
+      const key_type &key, IterType iter) const;
+  template <typename IterType>
+  std::pair<IterType, int> internal_locate_plain_compare(
+      const key_type &key, IterType iter) const;
+  template <typename IterType>
+  std::pair<IterType, int> internal_locate_compare_to(
+      const key_type &key, IterType iter) const;
+
+  // Internal routine which implements lower_bound().
+  template <typename IterType>
+  IterType internal_lower_bound(
+      const key_type &key, IterType iter) const;
+
+  // Internal routine which implements upper_bound().
+  template <typename IterType>
+  IterType internal_upper_bound(
+      const key_type &key, IterType iter) const;
+
+  // Internal routine which implements find_unique().
+  template <typename IterType>
+  IterType internal_find_unique(
+      const key_type &key, IterType iter) const;
+
+  // Internal routine which implements find_multi().
+  template <typename IterType>
+  IterType internal_find_multi(
+      const key_type &key, IterType iter) const;
+
+  // Deletes a node and all of its children.
+  void internal_clear(node_type *node);
+
+  // Dumps a node and all of its children to the specified ostream.
+  void internal_dump(std::ostream &os, const node_type *node, int level) const;
+
+  // Verifies the tree structure of node.
+  int internal_verify(const node_type *node,
+                      const key_type *lo, const key_type *hi) const;
+
+  node_stats internal_stats(const node_type *node) const {
+    if (!node) {
+      return node_stats(0, 0);
+    }
+    if (node->leaf()) {
+      return node_stats(1, 0);
+    }
+    node_stats res(0, 1);
+    for (int i = 0; i <= node->count(); ++i) {
+      res += internal_stats(node->child(i));
+    }
+    return res;
+  }
+
+ private:
+  empty_base_handle<internal_allocator_type, node_type*> root_;
+
+ private:
+  // A never instantiated helper function that returns big_ if we have a
+  // key-compare-to functor or if R is bool and small_ otherwise.
+  template <typename R>
+  static typename if_<
+   if_<is_key_compare_to::value,
+             std::is_same<R, int>,
+             std::is_same<R, bool> >::type::value,
+   big_, small_>::type key_compare_checker(R);
+
+  // A never instantiated helper function that returns the key comparison
+  // functor.
+  static key_compare key_compare_helper();
+
+  // Verify that key_compare returns a bool. This is similar to the way
+  // is_convertible in base/type_traits.h works. Note that key_compare_checker
+  // is never actually invoked. The compiler will select which
+  // key_compare_checker() to instantiate and then figure out the size of the
+  // return type of key_compare_checker() at compile time which we then check
+  // against the sizeof of big_.
+  COMPILE_ASSERT(
+      sizeof(key_compare_checker(key_compare_helper()(key_type(), key_type()))) ==
+      sizeof(big_),
+      key_comparison_function_must_return_bool);
+
+  // Note: We insist on kTargetValues, which is computed from
+  // Params::kTargetNodeSize, must fit the base_fields::field_type.
+  COMPILE_ASSERT(kNodeValues <
+                 (1 << (8 * sizeof(typename base_fields::field_type))),
+                 target_node_size_too_large);
+
+  // Test the assumption made in setting kNodeValueSpace.
+  COMPILE_ASSERT(sizeof(base_fields) >= 2 * sizeof(void*),
+                 node_space_assumption_incorrect);
+};
+
+////
+// btree_node methods
+template <typename P>
+inline void btree_node<P>::insert_value(int i, const value_type &x) {
+  assert(i <= count());
+  value_init(count(), x);
+  for (int j = count(); j > i; --j) {
+    value_swap(j, this, j - 1);
+  }
+  set_count(count() + 1);
+
+  if (!leaf()) {
+    ++i;
+    for (int j = count(); j > i; --j) {
+      *mutable_child(j) = child(j - 1);
+      child(j)->set_position(j);
+    }
+    *mutable_child(i) = NULL;
+  }
+}
+
+template <typename P>
+inline void btree_node<P>::remove_value(int i) {
+  if (!leaf()) {
+    assert(child(i + 1)->count() == 0);
+    for (int j = i + 1; j < count(); ++j) {
+      *mutable_child(j) = child(j + 1);
+      child(j)->set_position(j);
+    }
+    *mutable_child(count()) = NULL;
+  }
+
+  set_count(count() - 1);
+  for (; i < count(); ++i) {
+    value_swap(i, this, i + 1);
+  }
+  value_destroy(i);
+}
+
+template <typename P>
+void btree_node<P>::rebalance_right_to_left(btree_node *src, int to_move) {
+  assert(parent() == src->parent());
+  assert(position() + 1 == src->position());
+  assert(src->count() >= count());
+  assert(to_move >= 1);
+  assert(to_move <= src->count());
+
+  // Make room in the left node for the new values.
+  for (int i = 0; i < to_move; ++i) {
+    value_init(i + count());
+  }
+
+  // Move the delimiting value to the left node and the new delimiting value
+  // from the right node.
+  value_swap(count(), parent(), position());
+  parent()->value_swap(position(), src, to_move - 1);
+
+  // Move the values from the right to the left node.
+  for (int i = 1; i < to_move; ++i) {
+    value_swap(count() + i, src, i - 1);
+  }
+  // Shift the values in the right node to their correct position.
+  for (int i = to_move; i < src->count(); ++i) {
+    src->value_swap(i - to_move, src, i);
+  }
+  for (int i = 1; i <= to_move; ++i) {
+    src->value_destroy(src->count() - i);
+  }
+
+  if (!leaf()) {
+    // Move the child pointers from the right to the left node.
+    for (int i = 0; i < to_move; ++i) {
+      set_child(1 + count() + i, src->child(i));
+    }
+    for (int i = 0; i <= src->count() - to_move; ++i) {
+      assert(i + to_move <= src->max_count());
+      src->set_child(i, src->child(i + to_move));
+      *src->mutable_child(i + to_move) = NULL;
+    }
+  }
+
+  // Fixup the counts on the src and dest nodes.
+  set_count(count() + to_move);
+  src->set_count(src->count() - to_move);
+}
+
+template <typename P>
+void btree_node<P>::rebalance_left_to_right(btree_node *dest, int to_move) {
+  assert(parent() == dest->parent());
+  assert(position() + 1 == dest->position());
+  assert(count() >= dest->count());
+  assert(to_move >= 1);
+  assert(to_move <= count());
+
+  // Make room in the right node for the new values.
+  for (int i = 0; i < to_move; ++i) {
+    dest->value_init(i + dest->count());
+  }
+  for (int i = dest->count() - 1; i >= 0; --i) {
+    dest->value_swap(i, dest, i + to_move);
+  }
+
+  // Move the delimiting value to the right node and the new delimiting value
+  // from the left node.
+  dest->value_swap(to_move - 1, parent(), position());
+  parent()->value_swap(position(), this, count() - to_move);
+  value_destroy(count() - to_move);
+
+  // Move the values from the left to the right node.
+  for (int i = 1; i < to_move; ++i) {
+    value_swap(count() - to_move + i, dest, i - 1);
+    value_destroy(count() - to_move + i);
+  }
+
+  if (!leaf()) {
+    // Move the child pointers from the left to the right node.
+    for (int i = dest->count(); i >= 0; --i) {
+      dest->set_child(i + to_move, dest->child(i));
+      *dest->mutable_child(i) = NULL;
+    }
+    for (int i = 1; i <= to_move; ++i) {
+      dest->set_child(i - 1, child(count() - to_move + i));
+      *mutable_child(count() - to_move + i) = NULL;
+    }
+  }
+
+  // Fixup the counts on the src and dest nodes.
+  set_count(count() - to_move);
+  dest->set_count(dest->count() + to_move);
+}
+
+template <typename P>
+void btree_node<P>::split(btree_node *dest, int insert_position) {
+  assert(dest->count() == 0);
+
+  // We bias the split based on the position being inserted. If we're
+  // inserting at the beginning of the left node then bias the split to put
+  // more values on the right node. If we're inserting at the end of the
+  // right node then bias the split to put more values on the left node.
+  if (insert_position == 0) {
+    dest->set_count(count() - 1);
+  } else if (insert_position == max_count()) {
+    dest->set_count(0);
+  } else {
+    dest->set_count(count() / 2);
+  }
+  set_count(count() - dest->count());
+  assert(count() >= 1);
+
+  // Move values from the left sibling to the right sibling.
+  for (int i = 0; i < dest->count(); ++i) {
+    dest->value_init(i);
+    value_swap(count() + i, dest, i);
+    value_destroy(count() + i);
+  }
+
+  // The split key is the largest value in the left sibling.
+  set_count(count() - 1);
+  parent()->insert_value(position(), value_type());
+  value_swap(count(), parent(), position());
+  value_destroy(count());
+  parent()->set_child(position() + 1, dest);
+
+  if (!leaf()) {
+    for (int i = 0; i <= dest->count(); ++i) {
+      assert(child(count() + i + 1) != NULL);
+      dest->set_child(i, child(count() + i + 1));
+      *mutable_child(count() + i + 1) = NULL;
+    }
+  }
+}
+
+template <typename P>
+void btree_node<P>::merge(btree_node *src) {
+  assert(parent() == src->parent());
+  assert(position() + 1 == src->position());
+
+  // Move the delimiting value to the left node.
+  value_init(count());
+  value_swap(count(), parent(), position());
+
+  // Move the values from the right to the left node.
+  for (int i = 0; i < src->count(); ++i) {
+    value_init(1 + count() + i);
+    value_swap(1 + count() + i, src, i);
+    src->value_destroy(i);
+  }
+
+  if (!leaf()) {
+    // Move the child pointers from the right to the left node.
+    for (int i = 0; i <= src->count(); ++i) {
+      set_child(1 + count() + i, src->child(i));
+      *src->mutable_child(i) = NULL;
+    }
+  }
+
+  // Fixup the counts on the src and dest nodes.
+  set_count(1 + count() + src->count());
+  src->set_count(0);
+
+  // Remove the value on the parent node.
+  parent()->remove_value(position());
+}
+
+template <typename P>
+void btree_node<P>::swap(btree_node *x) {
+  assert(leaf() == x->leaf());
+
+  // Swap the values.
+  for (int i = count(); i < x->count(); ++i) {
+    value_init(i);
+  }
+  for (int i = x->count(); i < count(); ++i) {
+    x->value_init(i);
+  }
+  int n = std::max(count(), x->count());
+  for (int i = 0; i < n; ++i) {
+    value_swap(i, x, i);
+  }
+  for (int i = count(); i < x->count(); ++i) {
+    x->value_destroy(i);
+  }
+  for (int i = x->count(); i < count(); ++i) {
+    value_destroy(i);
+  }
+
+  if (!leaf()) {
+    // Swap the child pointers.
+    for (int i = 0; i <= n; ++i) {
+      btree_swap_helper(*mutable_child(i), *x->mutable_child(i));
+    }
+    for (int i = 0; i <= count(); ++i) {
+      x->child(i)->fields_.parent = x;
+    }
+    for (int i = 0; i <= x->count(); ++i) {
+      child(i)->fields_.parent = this;
+    }
+  }
+
+  // Swap the counts.
+  btree_swap_helper(fields_.count, x->fields_.count);
+}
+
+////
+// btree_iterator methods
+template <typename N, typename R, typename P>
+void btree_iterator<N, R, P>::increment_slow() {
+  if (node->leaf()) {
+    assert(position >= node->count());
+    self_type save(*this);
+    while (position == node->count() && !node->is_root()) {
+      assert(node->parent()->child(node->position()) == node);
+      position = node->position();
+      node = node->parent();
+    }
+    if (position == node->count()) {
+      *this = save;
+    }
+  } else {
+    assert(position < node->count());
+    node = node->child(position + 1);
+    while (!node->leaf()) {
+      node = node->child(0);
+    }
+    position = 0;
+  }
+}
+
+template <typename N, typename R, typename P>
+void btree_iterator<N, R, P>::increment_by(int count) {
+  while (count > 0) {
+    if (node->leaf()) {
+      int rest = node->count() - position;
+      position += std::min(rest, count);
+      count = count - rest;
+      if (position < node->count()) {
+        return;
+      }
+    } else {
+      --count;
+    }
+    increment_slow();
+  }
+}
+
+template <typename N, typename R, typename P>
+void btree_iterator<N, R, P>::decrement_slow() {
+  if (node->leaf()) {
+    assert(position <= -1);
+    self_type save(*this);
+    while (position < 0 && !node->is_root()) {
+      assert(node->parent()->child(node->position()) == node);
+      position = node->position() - 1;
+      node = node->parent();
+    }
+    if (position < 0) {
+      *this = save;
+    }
+  } else {
+    assert(position >= 0);
+    node = node->child(position);
+    while (!node->leaf()) {
+      node = node->child(node->count());
+    }
+    position = node->count() - 1;
+  }
+}
+
+////
+// btree methods
+template <typename P>
+btree<P>::btree(const key_compare &comp, const allocator_type &alloc)
+    : key_compare(comp),
+      root_(alloc, NULL) {
+}
+
+template <typename P>
+btree<P>::btree(const self_type &x)
+    : key_compare(x.key_comp()),
+      root_(x.internal_allocator(), NULL) {
+  assign(x);
+}
+
+template <typename P> template <typename ValuePointer>
+std::pair<typename btree<P>::iterator, bool>
+btree<P>::insert_unique(const key_type &key, ValuePointer value) {
+  if (empty()) {
+    *mutable_root() = new_leaf_root_node(1);
+  }
+
+  std::pair<iterator, int> res = internal_locate(key, iterator(root(), 0));
+  iterator &iter = res.first;
+  if (res.second == kExactMatch) {
+    // The key already exists in the tree, do nothing.
+    return std::make_pair(internal_last(iter), false);
+  } else if (!res.second) {
+    iterator last = internal_last(iter);
+    if (last.node && !compare_keys(key, last.key())) {
+      // The key already exists in the tree, do nothing.
+      return std::make_pair(last, false);
+    }
+  }
+
+  return std::make_pair(internal_insert(iter, *value), true);
+}
+
+template <typename P>
+inline typename btree<P>::iterator
+btree<P>::insert_unique(iterator position, const value_type &v) {
+  if (!empty()) {
+    const key_type &key = params_type::key(v);
+    if (position == end() || compare_keys(key, position.key())) {
+      iterator prev = position;
+      if (position == begin() || compare_keys((--prev).key(), key)) {
+        // prev.key() < key < position.key()
+        return internal_insert(position, v);
+      }
+    } else if (compare_keys(position.key(), key)) {
+      iterator next = position;
+      ++next;
+      if (next == end() || compare_keys(key, next.key())) {
+        // position.key() < key < next.key()
+        return internal_insert(next, v);
+      }
+    } else {
+      // position.key() == key
+      return position;
+    }
+  }
+  return insert_unique(v).first;
+}
+
+template <typename P> template <typename InputIterator>
+void btree<P>::insert_unique(InputIterator b, InputIterator e) {
+  for (; b != e; ++b) {
+    insert_unique(end(), *b);
+  }
+}
+
+template <typename P> template <typename ValuePointer>
+typename btree<P>::iterator
+btree<P>::insert_multi(const key_type &key, ValuePointer value) {
+  if (empty()) {
+    *mutable_root() = new_leaf_root_node(1);
+  }
+
+  iterator iter = internal_upper_bound(key, iterator(root(), 0));
+  if (!iter.node) {
+    iter = end();
+  }
+  return internal_insert(iter, *value);
+}
+
+template <typename P>
+typename btree<P>::iterator
+btree<P>::insert_multi(iterator position, const value_type &v) {
+  if (!empty()) {
+    const key_type &key = params_type::key(v);
+    if (position == end() || !compare_keys(position.key(), key)) {
+      iterator prev = position;
+      if (position == begin() || !compare_keys(key, (--prev).key())) {
+        // prev.key() <= key <= position.key()
+        return internal_insert(position, v);
+      }
+    } else {
+      iterator next = position;
+      ++next;
+      if (next == end() || !compare_keys(next.key(), key)) {
+        // position.key() < key <= next.key()
+        return internal_insert(next, v);
+      }
+    }
+  }
+  return insert_multi(v);
+}
+
+template <typename P> template <typename InputIterator>
+void btree<P>::insert_multi(InputIterator b, InputIterator e) {
+  for (; b != e; ++b) {
+    insert_multi(end(), *b);
+  }
+}
+
+template <typename P>
+void btree<P>::assign(const self_type &x) {
+  clear();
+
+  *mutable_key_comp() = x.key_comp();
+  *mutable_internal_allocator() = x.internal_allocator();
+
+  // Assignment can avoid key comparisons because we know the order of the
+  // values is the same order we'll store them in.
+  for (const_iterator iter = x.begin(); iter != x.end(); ++iter) {
+    if (empty()) {
+      insert_multi(*iter);
+    } else {
+      // If the btree is not empty, we can just insert the new value at the end
+      // of the tree!
+      internal_insert(end(), *iter);
+    }
+  }
+}
+
+template <typename P>
+typename btree<P>::iterator btree<P>::erase(iterator iter) {
+  bool internal_delete = false;
+  if (!iter.node->leaf()) {
+    // Deletion of a value on an internal node. Swap the key with the largest
+    // value of our left child. This is easy, we just decrement iter.
+    iterator tmp_iter(iter--);
+    assert(iter.node->leaf());
+    assert(!compare_keys(tmp_iter.key(), iter.key()));
+    iter.node->value_swap(iter.position, tmp_iter.node, tmp_iter.position);
+    internal_delete = true;
+    --*mutable_size();
+  } else if (!root()->leaf()) {
+    --*mutable_size();
+  }
+
+  // Delete the key from the leaf.
+  iter.node->remove_value(iter.position);
+
+  // We want to return the next value after the one we just erased. If we
+  // erased from an internal node (internal_delete == true), then the next
+  // value is ++(++iter). If we erased from a leaf node (internal_delete ==
+  // false) then the next value is ++iter. Note that ++iter may point to an
+  // internal node and the value in the internal node may move to a leaf node
+  // (iter.node) when rebalancing is performed at the leaf level.
+
+  // Merge/rebalance as we walk back up the tree.
+  iterator res(iter);
+  for (;;) {
+    if (iter.node == root()) {
+      try_shrink();
+      if (empty()) {
+        return end();
+      }
+      break;
+    }
+    if (iter.node->count() >= kMinNodeValues) {
+      break;
+    }
+    bool merged = try_merge_or_rebalance(&iter);
+    if (iter.node->leaf()) {
+      res = iter;
+    }
+    if (!merged) {
+      break;
+    }
+    iter.node = iter.node->parent();
+  }
+
+  // Adjust our return value. If we're pointing at the end of a node, advance
+  // the iterator.
+  if (res.position == res.node->count()) {
+    res.position = res.node->count() - 1;
+    ++res;
+  }
+  // If we erased from an internal node, advance the iterator.
+  if (internal_delete) {
+    ++res;
+  }
+  return res;
+}
+
+template <typename P>
+int btree<P>::erase(iterator begin, iterator end) {
+  int count = distance(begin, end);
+  for (int i = 0; i < count; i++) {
+    begin = erase(begin);
+  }
+  return count;
+}
+
+template <typename P>
+int btree<P>::erase_unique(const key_type &key) {
+  iterator iter = internal_find_unique(key, iterator(root(), 0));
+  if (!iter.node) {
+    // The key doesn't exist in the tree, return nothing done.
+    return 0;
+  }
+  erase(iter);
+  return 1;
+}
+
+template <typename P>
+int btree<P>::erase_multi(const key_type &key) {
+  iterator begin = internal_lower_bound(key, iterator(root(), 0));
+  if (!begin.node) {
+    // The key doesn't exist in the tree, return nothing done.
+    return 0;
+  }
+  // Delete all of the keys between begin and upper_bound(key).
+  iterator end = internal_end(
+      internal_upper_bound(key, iterator(root(), 0)));
+  return erase(begin, end);
+}
+
+template <typename P>
+void btree<P>::clear() {
+  if (root() != NULL) {
+    internal_clear(root());
+  }
+  *mutable_root() = NULL;
+}
+
+template <typename P>
+void btree<P>::swap(self_type &x) {
+  std::swap(static_cast<key_compare&>(*this), static_cast<key_compare&>(x));
+  std::swap(root_, x.root_);
+}
+
+template <typename P>
+void btree<P>::verify() const {
+  if (root() != NULL) {
+    assert(size() == internal_verify(root(), NULL, NULL));
+    assert(leftmost() == (++const_iterator(root(), -1)).node);
+    assert(rightmost() == (--const_iterator(root(), root()->count())).node);
+    assert(leftmost()->leaf());
+    assert(rightmost()->leaf());
+  } else {
+    assert(size() == 0);
+    assert(leftmost() == NULL);
+    assert(rightmost() == NULL);
+  }
+}
+
+template <typename P>
+void btree<P>::rebalance_or_split(iterator *iter) {
+  node_type *&node = iter->node;
+  int &insert_position = iter->position;
+  assert(node->count() == node->max_count());
+
+  // First try to make room on the node by rebalancing.
+  node_type *parent = node->parent();
+  if (node != root()) {
+    if (node->position() > 0) {
+      // Try rebalancing with our left sibling.
+      node_type *left = parent->child(node->position() - 1);
+      if (left->count() < left->max_count()) {
+        // We bias rebalancing based on the position being inserted. If we're
+        // inserting at the end of the right node then we bias rebalancing to
+        // fill up the left node.
+        int to_move = (left->max_count() - left->count()) /
+            (1 + (insert_position < left->max_count()));
+        to_move = std::max(1, to_move);
+
+        if (((insert_position - to_move) >= 0) ||
+            ((left->count() + to_move) < left->max_count())) {
+          left->rebalance_right_to_left(node, to_move);
+
+          assert(node->max_count() - node->count() == to_move);
+          insert_position = insert_position - to_move;
+          if (insert_position < 0) {
+            insert_position = insert_position + left->count() + 1;
+            node = left;
+          }
+
+          assert(node->count() < node->max_count());
+          return;
+        }
+      }
+    }
+
+    if (node->position() < parent->count()) {
+      // Try rebalancing with our right sibling.
+      node_type *right = parent->child(node->position() + 1);
+      if (right->count() < right->max_count()) {
+        // We bias rebalancing based on the position being inserted. If we're
+        // inserting at the beginning of the left node then we bias rebalancing
+        // to fill up the right node.
+        int to_move = (right->max_count() - right->count()) /
+            (1 + (insert_position > 0));
+        to_move = std::max(1, to_move);
+
+        if ((insert_position <= (node->count() - to_move)) ||
+            ((right->count() + to_move) < right->max_count())) {
+          node->rebalance_left_to_right(right, to_move);
+
+          if (insert_position > node->count()) {
+            insert_position = insert_position - node->count() - 1;
+            node = right;
+          }
+
+          assert(node->count() < node->max_count());
+          return;
+        }
+      }
+    }
+
+    // Rebalancing failed, make sure there is room on the parent node for a new
+    // value.
+    if (parent->count() == parent->max_count()) {
+      iterator parent_iter(node->parent(), node->position());
+      rebalance_or_split(&parent_iter);
+    }
+  } else {
+    // Rebalancing not possible because this is the root node.
+    if (root()->leaf()) {
+      // The root node is currently a leaf node: create a new root node and set
+      // the current root node as the child of the new root.
+      parent = new_internal_root_node();
+      parent->set_child(0, root());
+      *mutable_root() = parent;
+      assert(*mutable_rightmost() == parent->child(0));
+    } else {
+      // The root node is an internal node. We do not want to create a new root
+      // node because the root node is special and holds the size of the tree
+      // and a pointer to the rightmost node. So we create a new internal node
+      // and move all of the items on the current root into the new node.
+      parent = new_internal_node(parent);
+      parent->set_child(0, parent);
+      parent->swap(root());
+      node = parent;
+    }
+  }
+
+  // Split the node.
+  node_type *split_node;
+  if (node->leaf()) {
+    split_node = new_leaf_node(parent);
+    node->split(split_node, insert_position);
+    if (rightmost() == node) {
+      *mutable_rightmost() = split_node;
+    }
+  } else {
+    split_node = new_internal_node(parent);
+    node->split(split_node, insert_position);
+  }
+
+  if (insert_position > node->count()) {
+    insert_position = insert_position - node->count() - 1;
+    node = split_node;
+  }
+}
+
+template <typename P>
+void btree<P>::merge_nodes(node_type *left, node_type *right) {
+  left->merge(right);
+  if (right->leaf()) {
+    if (rightmost() == right) {
+      *mutable_rightmost() = left;
+    }
+    delete_leaf_node(right);
+  } else {
+    delete_internal_node(right);
+  }
+}
+
+template <typename P>
+bool btree<P>::try_merge_or_rebalance(iterator *iter) {
+  node_type *parent = iter->node->parent();
+  if (iter->node->position() > 0) {
+    // Try merging with our left sibling.
+    node_type *left = parent->child(iter->node->position() - 1);
+    if ((1 + left->count() + iter->node->count()) <= left->max_count()) {
+      iter->position += 1 + left->count();
+      merge_nodes(left, iter->node);
+      iter->node = left;
+      return true;
+    }
+  }
+  if (iter->node->position() < parent->count()) {
+    // Try merging with our right sibling.
+    node_type *right = parent->child(iter->node->position() + 1);
+    if ((1 + iter->node->count() + right->count()) <= right->max_count()) {
+      merge_nodes(iter->node, right);
+      return true;
+    }
+    // Try rebalancing with our right sibling. We don't perform rebalancing if
+    // we deleted the first element from iter->node and the node is not
+    // empty. This is a small optimization for the common pattern of deleting
+    // from the front of the tree.
+    if ((right->count() > kMinNodeValues) &&
+        ((iter->node->count() == 0) ||
+         (iter->position > 0))) {
+      int to_move = (right->count() - iter->node->count()) / 2;
+      to_move = std::min(to_move, right->count() - 1);
+      iter->node->rebalance_right_to_left(right, to_move);
+      return false;
+    }
+  }
+  if (iter->node->position() > 0) {
+    // Try rebalancing with our left sibling. We don't perform rebalancing if
+    // we deleted the last element from iter->node and the node is not
+    // empty. This is a small optimization for the common pattern of deleting
+    // from the back of the tree.
+    node_type *left = parent->child(iter->node->position() - 1);
+    if ((left->count() > kMinNodeValues) &&
+        ((iter->node->count() == 0) ||
+         (iter->position < iter->node->count()))) {
+      int to_move = (left->count() - iter->node->count()) / 2;
+      to_move = std::min(to_move, left->count() - 1);
+      left->rebalance_left_to_right(iter->node, to_move);
+      iter->position += to_move;
+      return false;
+    }
+  }
+  return false;
+}
+
+template <typename P>
+void btree<P>::try_shrink() {
+  if (root()->count() > 0) {
+    return;
+  }
+  // Deleted the last item on the root node, shrink the height of the tree.
+  if (root()->leaf()) {
+    assert(size() == 0);
+    delete_leaf_node(root());
+    *mutable_root() = NULL;
+  } else {
+    node_type *child = root()->child(0);
+    if (child->leaf()) {
+      // The child is a leaf node so simply make it the root node in the tree.
+      child->make_root();
+      delete_internal_root_node();
+      *mutable_root() = child;
+    } else {
+      // The child is an internal node. We want to keep the existing root node
+      // so we move all of the values from the child node into the existing
+      // (empty) root node.
+      child->swap(root());
+      delete_internal_node(child);
+    }
+  }
+}
+
+template <typename P> template <typename IterType>
+inline IterType btree<P>::internal_last(IterType iter) {
+  while (iter.node && iter.position == iter.node->count()) {
+    iter.position = iter.node->position();
+    iter.node = iter.node->parent();
+    if (iter.node->leaf()) {
+      iter.node = NULL;
+    }
+  }
+  return iter;
+}
+
+template <typename P>
+inline typename btree<P>::iterator
+btree<P>::internal_insert(iterator iter, const value_type &v) {
+  if (!iter.node->leaf()) {
+    // We can't insert on an internal node. Instead, we'll insert after the
+    // previous value which is guaranteed to be on a leaf node.
+    --iter;
+    ++iter.position;
+  }
+  if (iter.node->count() == iter.node->max_count()) {
+    // Make room in the leaf for the new item.
+    if (iter.node->max_count() < kNodeValues) {
+      // Insertion into the root where the root is smaller that the full node
+      // size. Simply grow the size of the root node.
+      assert(iter.node == root());
+      iter.node = new_leaf_root_node(
+          std::min<int>(kNodeValues, 2 * iter.node->max_count()));
+      iter.node->swap(root());
+      delete_leaf_node(root());
+      *mutable_root() = iter.node;
+    } else {
+      rebalance_or_split(&iter);
+      ++*mutable_size();
+    }
+  } else if (!root()->leaf()) {
+    ++*mutable_size();
+  }
+  iter.node->insert_value(iter.position, v);
+  return iter;
+}
+
+template <typename P> template <typename IterType>
+inline std::pair<IterType, int> btree<P>::internal_locate(
+    const key_type &key, IterType iter) const {
+  return internal_locate_type::dispatch(key, *this, iter);
+}
+
+template <typename P> template <typename IterType>
+inline std::pair<IterType, int> btree<P>::internal_locate_plain_compare(
+    const key_type &key, IterType iter) const {
+  for (;;) {
+    iter.position = iter.node->lower_bound(key, key_comp());
+    if (iter.node->leaf()) {
+      break;
+    }
+    iter.node = iter.node->child(iter.position);
+  }
+  return std::make_pair(iter, 0);
+}
+
+template <typename P> template <typename IterType>
+inline std::pair<IterType, int> btree<P>::internal_locate_compare_to(
+    const key_type &key, IterType iter) const {
+  for (;;) {
+    int res = iter.node->lower_bound(key, key_comp());
+    iter.position = res & kMatchMask;
+    if (res & kExactMatch) {
+      return std::make_pair(iter, static_cast<int>(kExactMatch));
+    }
+    if (iter.node->leaf()) {
+      break;
+    }
+    iter.node = iter.node->child(iter.position);
+  }
+  return std::make_pair(iter, -kExactMatch);
+}
+
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_lower_bound(
+    const key_type &key, IterType iter) const {
+  if (iter.node) {
+    for (;;) {
+      iter.position =
+          iter.node->lower_bound(key, key_comp()) & kMatchMask;
+      if (iter.node->leaf()) {
+        break;
+      }
+      iter.node = iter.node->child(iter.position);
+    }
+    iter = internal_last(iter);
+  }
+  return iter;
+}
+
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_upper_bound(
+    const key_type &key, IterType iter) const {
+  if (iter.node) {
+    for (;;) {
+      iter.position = iter.node->upper_bound(key, key_comp());
+      if (iter.node->leaf()) {
+        break;
+      }
+      iter.node = iter.node->child(iter.position);
+    }
+    iter = internal_last(iter);
+  }
+  return iter;
+}
+
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_find_unique(
+    const key_type &key, IterType iter) const {
+  if (iter.node) {
+    std::pair<IterType, int> res = internal_locate(key, iter);
+    if (res.second == kExactMatch) {
+      return res.first;
+    }
+    if (!res.second) {
+      iter = internal_last(res.first);
+      if (iter.node && !compare_keys(key, iter.key())) {
+        return iter;
+      }
+    }
+  }
+  return IterType(NULL, 0);
+}
+
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_find_multi(
+    const key_type &key, IterType iter) const {
+  if (iter.node) {
+    iter = internal_lower_bound(key, iter);
+    if (iter.node) {
+      iter = internal_last(iter);
+      if (iter.node && !compare_keys(key, iter.key())) {
+        return iter;
+      }
+    }
+  }
+  return IterType(NULL, 0);
+}
+
+template <typename P>
+void btree<P>::internal_clear(node_type *node) {
+  if (!node->leaf()) {
+    for (int i = 0; i <= node->count(); ++i) {
+      internal_clear(node->child(i));
+    }
+    if (node == root()) {
+      delete_internal_root_node();
+    } else {
+      delete_internal_node(node);
+    }
+  } else {
+    delete_leaf_node(node);
+  }
+}
+
+template <typename P>
+void btree<P>::internal_dump(
+    std::ostream &os, const node_type *node, int level) const {
+  for (int i = 0; i < node->count(); ++i) {
+    if (!node->leaf()) {
+      internal_dump(os, node->child(i), level + 1);
+    }
+    for (int j = 0; j < level; ++j) {
+      os << "  ";
+    }
+    os << node->key(i) << " [" << level << "]\n";
+  }
+  if (!node->leaf()) {
+    internal_dump(os, node->child(node->count()), level + 1);
+  }
+}
+
+template <typename P>
+int btree<P>::internal_verify(
+    const node_type *node, const key_type *lo, const key_type *hi) const {
+  assert(node->count() > 0);
+  assert(node->count() <= node->max_count());
+  if (lo) {
+    assert(!compare_keys(node->key(0), *lo));
+  }
+  if (hi) {
+    assert(!compare_keys(*hi, node->key(node->count() - 1)));
+  }
+  for (int i = 1; i < node->count(); ++i) {
+    assert(!compare_keys(node->key(i), node->key(i - 1)));
+  }
+  int count = node->count();
+  if (!node->leaf()) {
+    for (int i = 0; i <= node->count(); ++i) {
+      assert(node->child(i) != NULL);
+      assert(node->child(i)->parent() == node);
+      assert(node->child(i)->position() == i);
+      count += internal_verify(
+          node->child(i),
+          (i == 0) ? lo : &node->key(i - 1),
+          (i == node->count()) ? hi : &node->key(i));
+    }
+  }
+  return count;
+}
+
+} // namespace btree
+
+#endif  // UTIL_BTREE_BTREE_H__
diff --git a/include/btree_container.h b/include/btree_container.h
new file mode 100755
index 0000000..c9c62c3
--- /dev/null
+++ b/include/btree_container.h
@@ -0,0 +1,350 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTIL_BTREE_BTREE_CONTAINER_H__
+#define UTIL_BTREE_BTREE_CONTAINER_H__
+
+#include <iosfwd>
+#include <utility>
+
+#include "btree.h"
+
+namespace btree {
+
+// A common base class for btree_set, btree_map, btree_multiset and
+// btree_multimap.
+template <typename Tree>
+class btree_container {
+  typedef btree_container<Tree> self_type;
+
+ public:
+  typedef typename Tree::params_type params_type;
+  typedef typename Tree::key_type key_type;
+  typedef typename Tree::value_type value_type;
+  typedef typename Tree::key_compare key_compare;
+  typedef typename Tree::allocator_type allocator_type;
+  typedef typename Tree::pointer pointer;
+  typedef typename Tree::const_pointer const_pointer;
+  typedef typename Tree::reference reference;
+  typedef typename Tree::const_reference const_reference;
+  typedef typename Tree::size_type size_type;
+  typedef typename Tree::difference_type difference_type;
+  typedef typename Tree::iterator iterator;
+  typedef typename Tree::const_iterator const_iterator;
+  typedef typename Tree::reverse_iterator reverse_iterator;
+  typedef typename Tree::const_reverse_iterator const_reverse_iterator;
+
+ public:
+  // Default constructor.
+  btree_container(const key_compare &comp, const allocator_type &alloc)
+      : tree_(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_container(const self_type &x)
+      : tree_(x.tree_) {
+  }
+
+  // Iterator routines.
+  iterator begin() { return tree_.begin(); }
+  const_iterator begin() const { return tree_.begin(); }
+  iterator end() { return tree_.end(); }
+  const_iterator end() const { return tree_.end(); }
+  reverse_iterator rbegin() { return tree_.rbegin(); }
+  const_reverse_iterator rbegin() const { return tree_.rbegin(); }
+  reverse_iterator rend() { return tree_.rend(); }
+  const_reverse_iterator rend() const { return tree_.rend(); }
+
+  // Lookup routines.
+  iterator lower_bound(const key_type &key) {
+    return tree_.lower_bound(key);
+  }
+  const_iterator lower_bound(const key_type &key) const {
+    return tree_.lower_bound(key);
+  }
+  iterator upper_bound(const key_type &key) {
+    return tree_.upper_bound(key);
+  }
+  const_iterator upper_bound(const key_type &key) const {
+    return tree_.upper_bound(key);
+  }
+  std::pair<iterator,iterator> equal_range(const key_type &key) {
+    return tree_.equal_range(key);
+  }
+  std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const {
+    return tree_.equal_range(key);
+  }
+
+  // Utility routines.
+  void clear() {
+    tree_.clear();
+  }
+  void swap(self_type &x) {
+    tree_.swap(x.tree_);
+  }
+  void dump(std::ostream &os) const {
+    tree_.dump(os);
+  }
+  void verify() const {
+    tree_.verify();
+  }
+
+  // Size routines.
+  size_type size() const { return tree_.size(); }
+  size_type max_size() const { return tree_.max_size(); }
+  bool empty() const { return tree_.empty(); }
+  size_type height() const { return tree_.height(); }
+  size_type internal_nodes() const { return tree_.internal_nodes(); }
+  size_type leaf_nodes() const { return tree_.leaf_nodes(); }
+  size_type nodes() const { return tree_.nodes(); }
+  size_type bytes_used() const { return tree_.bytes_used(); }
+  static double average_bytes_per_value() {
+    return Tree::average_bytes_per_value();
+  }
+  double fullness() const { return tree_.fullness(); }
+  double overhead() const { return tree_.overhead(); }
+
+  bool operator==(const self_type& x) const {
+    if (size() != x.size()) {
+      return false;
+    }
+    for (const_iterator i = begin(), xi = x.begin(); i != end(); ++i, ++xi) {
+      if (*i != *xi) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool operator!=(const self_type& other) const {
+    return !operator==(other);
+  }
+
+
+ protected:
+  Tree tree_;
+};
+
+template <typename T>
+inline std::ostream& operator<<(std::ostream &os, const btree_container<T> &b) {
+  b.dump(os);
+  return os;
+}
+
+// A common base class for btree_set and safe_btree_set.
+template <typename Tree>
+class btree_unique_container : public btree_container<Tree> {
+  typedef btree_unique_container<Tree> self_type;
+  typedef btree_container<Tree> super_type;
+
+ public:
+  typedef typename Tree::key_type key_type;
+  typedef typename Tree::value_type value_type;
+  typedef typename Tree::size_type size_type;
+  typedef typename Tree::key_compare key_compare;
+  typedef typename Tree::allocator_type allocator_type;
+  typedef typename Tree::iterator iterator;
+  typedef typename Tree::const_iterator const_iterator;
+
+ public:
+  // Default constructor.
+  btree_unique_container(const key_compare &comp = key_compare(),
+                         const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_unique_container(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor.
+  template <class InputIterator>
+  btree_unique_container(InputIterator b, InputIterator e,
+                         const key_compare &comp = key_compare(),
+                         const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+    insert(b, e);
+  }
+
+  // Lookup routines.
+  iterator find(const key_type &key) {
+    return this->tree_.find_unique(key);
+  }
+  const_iterator find(const key_type &key) const {
+    return this->tree_.find_unique(key);
+  }
+  size_type count(const key_type &key) const {
+    return this->tree_.count_unique(key);
+  }
+
+  // Insertion routines.
+  std::pair<iterator,bool> insert(const value_type &x) {
+    return this->tree_.insert_unique(x);
+  }
+  iterator insert(iterator position, const value_type &x) {
+    return this->tree_.insert_unique(position, x);
+  }
+  template <typename InputIterator>
+  void insert(InputIterator b, InputIterator e) {
+    this->tree_.insert_unique(b, e);
+  }
+
+  // Deletion routines.
+  int erase(const key_type &key) {
+    return this->tree_.erase_unique(key);
+  }
+  // Erase the specified iterator from the btree. The iterator must be valid
+  // (i.e. not equal to end()).  Return an iterator pointing to the node after
+  // the one that was erased (or end() if none exists).
+  iterator erase(const iterator &iter) {
+    return this->tree_.erase(iter);
+  }
+  void erase(const iterator &first, const iterator &last) {
+    this->tree_.erase(first, last);
+  }
+};
+
+// A common base class for btree_map and safe_btree_map.
+template <typename Tree>
+class btree_map_container : public btree_unique_container<Tree> {
+  typedef btree_map_container<Tree> self_type;
+  typedef btree_unique_container<Tree> super_type;
+
+ public:
+  typedef typename Tree::key_type key_type;
+  typedef typename Tree::data_type data_type;
+  typedef typename Tree::value_type value_type;
+  typedef typename Tree::mapped_type mapped_type;
+  typedef typename Tree::key_compare key_compare;
+  typedef typename Tree::allocator_type allocator_type;
+
+ private:
+  // A pointer-like object which only generates its value when
+  // dereferenced. Used by operator[] to avoid constructing an empty data_type
+  // if the key already exists in the map.
+  struct generate_value {
+    generate_value(const key_type &k)
+        : key(k) {
+    }
+    value_type operator*() const {
+      return std::make_pair(key, data_type());
+    }
+    const key_type &key;
+  };
+
+ public:
+  // Default constructor.
+  btree_map_container(const key_compare &comp = key_compare(),
+                      const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_map_container(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor.
+  template <class InputIterator>
+  btree_map_container(InputIterator b, InputIterator e,
+                      const key_compare &comp = key_compare(),
+                      const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+    insert(b, e);
+  }
+
+  // Insertion routines.
+  data_type& operator[](const key_type &key) {
+    return this->tree_.insert_unique(key, generate_value(key)).first->second;
+  }
+};
+
+// A common base class for btree_multiset and btree_multimap.
+template <typename Tree>
+class btree_multi_container : public btree_container<Tree> {
+  typedef btree_multi_container<Tree> self_type;
+  typedef btree_container<Tree> super_type;
+
+ public:
+  typedef typename Tree::key_type key_type;
+  typedef typename Tree::value_type value_type;
+  typedef typename Tree::size_type size_type;
+  typedef typename Tree::key_compare key_compare;
+  typedef typename Tree::allocator_type allocator_type;
+  typedef typename Tree::iterator iterator;
+  typedef typename Tree::const_iterator const_iterator;
+
+ public:
+  // Default constructor.
+  btree_multi_container(const key_compare &comp = key_compare(),
+                        const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_multi_container(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor.
+  template <class InputIterator>
+  btree_multi_container(InputIterator b, InputIterator e,
+                        const key_compare &comp = key_compare(),
+                        const allocator_type &alloc = allocator_type())
+      : super_type(b, e, comp, alloc) {
+    insert(b, e);
+  }
+
+  // Lookup routines.
+  iterator find(const key_type &key) {
+    return this->tree_.find_multi(key);
+  }
+  const_iterator find(const key_type &key) const {
+    return this->tree_.find_multi(key);
+  }
+  size_type count(const key_type &key) const {
+    return this->tree_.count_multi(key);
+  }
+
+  // Insertion routines.
+  iterator insert(const value_type &x) {
+    return this->tree_.insert_multi(x);
+  }
+  iterator insert(iterator position, const value_type &x) {
+    return this->tree_.insert_multi(position, x);
+  }
+  template <typename InputIterator>
+  void insert(InputIterator b, InputIterator e) {
+    this->tree_.insert_multi(b, e);
+  }
+
+  // Deletion routines.
+  int erase(const key_type &key) {
+    return this->tree_.erase_multi(key);
+  }
+  // Erase the specified iterator from the btree. The iterator must be valid
+  // (i.e. not equal to end()).  Return an iterator pointing to the node after
+  // the one that was erased (or end() if none exists).
+  iterator erase(const iterator &iter) {
+    return this->tree_.erase(iter);
+  }
+  void erase(const iterator &first, const iterator &last) {
+    this->tree_.erase(first, last);
+  }
+};
+
+} // namespace btree
+
+#endif  // UTIL_BTREE_BTREE_CONTAINER_H__
diff --git a/include/btree_map.h b/include/btree_map.h
new file mode 100755
index 0000000..07b799e
--- /dev/null
+++ b/include/btree_map.h
@@ -0,0 +1,130 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// A btree_map<> implements the STL unique sorted associative container
+// interface and the pair associative container interface (a.k.a map<>) using a
+// btree. A btree_multimap<> implements the STL multiple sorted associative
+// container interface and the pair associtive container interface (a.k.a
+// multimap<>) using a btree. See btree.h for details of the btree
+// implementation and caveats.
+
+#ifndef UTIL_BTREE_BTREE_MAP_H__
+#define UTIL_BTREE_BTREE_MAP_H__
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "btree.h"
+#include "btree_container.h"
+
+namespace btree {
+
+// The btree_map class is needed mainly for it's constructors.
+template <typename Key, typename Value,
+          typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<std::pair<const Key, Value> >,
+          int TargetNodeSize = 256>
+class btree_map : public btree_map_container<
+  btree<btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize> > > {
+
+  typedef btree_map<Key, Value, Compare, Alloc, TargetNodeSize> self_type;
+  typedef btree_map_params<
+    Key, Value, Compare, Alloc, TargetNodeSize> params_type;
+  typedef btree<params_type> btree_type;
+  typedef btree_map_container<btree_type> super_type;
+
+ public:
+  typedef typename btree_type::key_compare key_compare;
+  typedef typename btree_type::allocator_type allocator_type;
+
+ public:
+  // Default constructor.
+  btree_map(const key_compare &comp = key_compare(),
+            const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_map(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor.
+  template <class InputIterator>
+  btree_map(InputIterator b, InputIterator e,
+            const key_compare &comp = key_compare(),
+            const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+};
+
+template <typename K, typename V, typename C, typename A, int N>
+inline void swap(btree_map<K, V, C, A, N> &x,
+                 btree_map<K, V, C, A, N> &y) {
+  x.swap(y);
+}
+
+// The btree_multimap class is needed mainly for it's constructors.
+template <typename Key, typename Value,
+          typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<std::pair<const Key, Value> >,
+          int TargetNodeSize = 256>
+class btree_multimap : public btree_multi_container<
+  btree<btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize> > > {
+
+  typedef btree_multimap<Key, Value, Compare, Alloc, TargetNodeSize> self_type;
+  typedef btree_map_params<
+    Key, Value, Compare, Alloc, TargetNodeSize> params_type;
+  typedef btree<params_type> btree_type;
+  typedef btree_multi_container<btree_type> super_type;
+
+ public:
+  typedef typename btree_type::key_compare key_compare;
+  typedef typename btree_type::allocator_type allocator_type;
+  typedef typename btree_type::data_type data_type;
+  typedef typename btree_type::mapped_type mapped_type;
+
+ public:
+  // Default constructor.
+  btree_multimap(const key_compare &comp = key_compare(),
+                 const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_multimap(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor.
+  template <class InputIterator>
+  btree_multimap(InputIterator b, InputIterator e,
+                 const key_compare &comp = key_compare(),
+                 const allocator_type &alloc = allocator_type())
+      : super_type(b, e, comp, alloc) {
+  }
+};
+
+template <typename K, typename V, typename C, typename A, int N>
+inline void swap(btree_multimap<K, V, C, A, N> &x,
+                 btree_multimap<K, V, C, A, N> &y) {
+  x.swap(y);
+}
+
+} // namespace btree
+
+#endif  // UTIL_BTREE_BTREE_MAP_H__
diff --git a/include/btree_set.h b/include/btree_set.h
new file mode 100755
index 0000000..2bc9e58
--- /dev/null
+++ b/include/btree_set.h
@@ -0,0 +1,121 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// A btree_set<> implements the STL unique sorted associative container
+// interface (a.k.a set<>) using a btree. A btree_multiset<> implements the STL
+// multiple sorted associative container interface (a.k.a multiset<>) using a
+// btree. See btree.h for details of the btree implementation and caveats.
+
+#ifndef UTIL_BTREE_BTREE_SET_H__
+#define UTIL_BTREE_BTREE_SET_H__
+
+#include <functional>
+#include <memory>
+#include <string>
+
+#include "btree.h"
+#include "btree_container.h"
+
+namespace btree {
+
+// The btree_set class is needed mainly for it's constructors.
+template <typename Key,
+          typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<Key>,
+          int TargetNodeSize = 256>
+class btree_set : public btree_unique_container<
+  btree<btree_set_params<Key, Compare, Alloc, TargetNodeSize> > > {
+
+  typedef btree_set<Key, Compare, Alloc, TargetNodeSize> self_type;
+  typedef btree_set_params<Key, Compare, Alloc, TargetNodeSize> params_type;
+  typedef btree<params_type> btree_type;
+  typedef btree_unique_container<btree_type> super_type;
+
+ public:
+  typedef typename btree_type::key_compare key_compare;
+  typedef typename btree_type::allocator_type allocator_type;
+
+ public:
+  // Default constructor.
+  btree_set(const key_compare &comp = key_compare(),
+            const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_set(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor.
+  template <class InputIterator>
+  btree_set(InputIterator b, InputIterator e,
+            const key_compare &comp = key_compare(),
+            const allocator_type &alloc = allocator_type())
+      : super_type(b, e, comp, alloc) {
+  }
+};
+
+template <typename K, typename C, typename A, int N>
+inline void swap(btree_set<K, C, A, N> &x, btree_set<K, C, A, N> &y) {
+  x.swap(y);
+}
+
+// The btree_multiset class is needed mainly for it's constructors.
+template <typename Key,
+          typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<Key>,
+          int TargetNodeSize = 256>
+class btree_multiset : public btree_multi_container<
+  btree<btree_set_params<Key, Compare, Alloc, TargetNodeSize> > > {
+
+  typedef btree_multiset<Key, Compare, Alloc, TargetNodeSize> self_type;
+  typedef btree_set_params<Key, Compare, Alloc, TargetNodeSize> params_type;
+  typedef btree<params_type> btree_type;
+  typedef btree_multi_container<btree_type> super_type;
+
+ public:
+  typedef typename btree_type::key_compare key_compare;
+  typedef typename btree_type::allocator_type allocator_type;
+
+ public:
+  // Default constructor.
+  btree_multiset(const key_compare &comp = key_compare(),
+                 const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_multiset(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor.
+  template <class InputIterator>
+  btree_multiset(InputIterator b, InputIterator e,
+                 const key_compare &comp = key_compare(),
+                 const allocator_type &alloc = allocator_type())
+      : super_type(b, e, comp, alloc) {
+  }
+};
+
+template <typename K, typename C, typename A, int N>
+inline void swap(btree_multiset<K, C, A, N> &x,
+                 btree_multiset<K, C, A, N> &y) {
+  x.swap(y);
+}
+
+} // namespace btree
+
+#endif  // UTIL_BTREE_BTREE_SET_H__
diff --git a/include/concurrentqueue.h b/include/concurrentqueue.h
new file mode 100644
index 0000000..a9f89a8
--- /dev/null
+++ b/include/concurrentqueue.h
@@ -0,0 +1,3363 @@
+// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue.
+// An overview, including benchmark results, is provided here:
+//     http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++
+// The full design is also described in excruciating detail at:
+//    http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue
+
+
+// Simplified BSD license:
+// Copyright (c) 2013-2014, Cameron Desrochers.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice, this list of
+// conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice, this list of
+// conditions and the following disclaimer in the documentation and/or other materials
+// provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+#pragma once
+
+#include <atomic>		// Requires C++11. Sorry VS2010.
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+#include <algorithm>
+#include <utility>
+#include <limits>
+#include <climits>		// for CHAR_BIT
+#include <cassert>
+#include <array>
+#include <thread>		// for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading
+
+// Platform-specific definitions of a numeric thread ID type and an invalid value
+#if defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__)
+// No sense pulling in windows.h in a header, we'll manually declare the function
+// we use and rely on backwards-compatibility for this not to break
+extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void);
+namespace moodycamel { namespace details {
+	static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows");
+	typedef std::uint32_t thread_id_t;
+	static const thread_id_t invalid_thread_id  = 0;			// See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx
+	static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU;	// Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4.
+	static inline thread_id_t thread_id() { return static_cast<thread_id_t>(::GetCurrentThreadId()); }
+} }
+#else
+// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475
+// In order to get a numeric thread ID in a platform-independent way, we use a thread-local
+// static variable's address as a thread identifier :-)
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+#define MOODYCAMEL_THREADLOCAL __thread
+#elif defined(_MSC_VER)
+#define MOODYCAMEL_THREADLOCAL __declspec(thread)
+#else
+// Assume C++11 compliant compiler
+#define MOODYCAMEL_THREADLOCAL thread_local
+#endif
+namespace moodycamel { namespace details {
+	typedef std::uintptr_t thread_id_t;
+	static const thread_id_t invalid_thread_id  = 0;		// Address can't be nullptr
+	static const thread_id_t invalid_thread_id2 = 1;		// Member accesses off a null pointer are also generally invalid. Plus it's not aligned.
+	static inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast<thread_id_t>(&x); }
+} }
+#endif
+
+// Exceptions
+#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
+#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__))
+#define MOODYCAMEL_EXCEPTIONS_ENABLED
+#define MOODYCAMEL_TRY try
+#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__)
+#define MOODYCAMEL_RETHROW throw
+#else
+#define MOODYCAMEL_TRY if (true)
+#define MOODYCAMEL_CATCH(...) else if (false)
+#define MOODYCAMEL_RETHROW
+#endif
+#endif
+
+#ifndef MOODYCAMEL_NOEXCEPT
+#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED)
+#define MOODYCAMEL_NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value || std::is_nothrow_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value || std::is_nothrow_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#else
+#define MOODYCAMEL_NOEXCEPT noexcept
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr)
+#endif
+#endif
+
+#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445
+// g++ <=4.7 doesn't support thread_local either
+#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+// Assume `thread_local` is fully supported in all other C++11 compilers/runtimes
+#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#endif
+#endif
+
+// rob-p test
+#undef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+
+// Compiler-specific likely/unlikely hints
+namespace moodycamel { namespace details {
+#if defined(__GNUC__)
+	inline bool likely(bool x) { return __builtin_expect((x), true); }
+	inline bool unlikely(bool x) { return __builtin_expect((x), false); }
+#else
+	inline bool likely(bool x) { return x; }
+	inline bool unlikely(bool x) { return x; }
+#endif
+} }
+
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+#include "internal/concurrentqueue_internal_debug.h"
+#endif
+
+#if defined(__GNUC__)
+// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and
+// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings
+// upon assigning any computed values)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+
+namespace moodycamel {
+namespace details {
+	template<typename T>
+	struct const_numeric_max {
+		static_assert(std::is_integral<T>::value, "const_numeric_max can only be used with integers");
+		static const T value = std::numeric_limits<T>::is_signed
+			? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast<T>(1)
+			: static_cast<T>(-1);
+	};
+}
+
+
+// Default traits for the ConcurrentQueue. To change some of the
+// traits without re-implementing all of them, inherit from this
+// struct and shadow the declarations you wish to be different;
+// since the traits are used as a template type parameter, the
+// shadowed declarations will be used where defined, and the defaults
+// otherwise.
+struct ConcurrentQueueDefaultTraits
+{
+	// General-purpose size type. std::size_t is strongly recommended.
+	typedef std::size_t size_t;
+
+	// The type used for the enqueue and dequeue indices. Must be at least as
+	// large as size_t. Should be significantly larger than the number of elements
+	// you expect to hold at once, especially if you have a high turnover rate;
+	// for example, on 32-bit x86, if you expect to have over a hundred million
+	// elements or pump several million elements through your queue in a very
+	// short space of time, using a 32-bit type *may* trigger a race condition.
+	// A 64-bit int type is recommended in that case, and in practice will
+	// prevent a race condition no matter the usage of the queue. Note that
+	// whether the queue is lock-free with a 64-int type depends on the whether
+	// std::atomic<std::uint64_t> is lock-free, which is platform-specific.
+	typedef std::size_t index_t;
+
+	// Internally, all elements are enqueued and dequeued from multi-element
+	// blocks; this is the smallest controllable unit. If you expect few elements
+	// but many producers, a smaller block size should be favoured. For few producers
+	// and/or many elements, a larger block size is preferred. A sane default
+	// is provided. Must be a power of 2.
+	static const size_t BLOCK_SIZE = 32;
+
+	// For explicit producers (i.e. when using a producer token), the block is
+	// checked for being empty by iterating through a list of flags, one per element.
+	// For large block sizes, this is too inefficient, and switching to an atomic
+	// counter-based approach is faster. The switch is made for block sizes strictly
+	// larger than this threshold.
+	static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32;
+
+	// How many full blocks can be expected for a single explicit producer? This should
+	// reflect that number's maximum for optimal performance. Must be a power of 2.
+	static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32;
+
+	// How many full blocks can be expected for a single implicit producer? This should
+	// reflect that number's maximum for optimal performance. Must be a power of 2.
+	static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32;
+
+	// The initial size of the hash table mapping thread IDs to implicit producers.
+	// Note that the hash is resized every time it becomes half full.
+	// Must be a power of two, and either 0 or at least 1. If 0, implicit production
+	// (using the enqueue methods without an explicit producer token) is disabled.
+	static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32;
+
+	// Controls the number of items that an explicit consumer (i.e. one with a token)
+	// must consume before it causes all consumers to rotate and move on to the next
+	// internal queue.
+	static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256;
+
+	// The maximum number of elements (inclusive) that can be enqueued to a sub-queue.
+	// Enqueue operations that would cause this limit to be surpassed will fail. Note
+	// that this limit is enforced at the block level (for performance reasons), i.e.
+	// it's rounded up to the nearest block size.
+	static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max<size_t>::value;
+
+
+	// Memory allocation can be customized if needed.
+	// malloc should return nullptr on failure, and handle alignment like std::malloc.
+	static inline void* malloc(size_t size) { return std::malloc(size); }
+	static inline void free(void* ptr) { return std::free(ptr); }
+};
+
+
+// When producing or consuming many elements, the most efficient way is to:
+//    1) Use one of the bulk-operation methods of the queue with a token
+//    2) Failing that, use the bulk-operation methods without a token
+//    3) Failing that, create a token and use that with the single-item methods
+//    4) Failing that, use the single-parameter methods of the queue
+// Having said that, don't create tokens willy-nilly -- ideally there should be
+// a maximum of one token per thread (of each kind).
+struct ProducerToken;
+struct ConsumerToken;
+
+template<typename T, typename Traits> class ConcurrentQueue;
+class ConcurrentQueueTests;
+
+
+namespace details
+{
+	struct ConcurrentQueueProducerTypelessBase
+	{
+		ConcurrentQueueProducerTypelessBase* next;
+		std::atomic<bool> inactive;
+		ProducerToken* token;
+
+		ConcurrentQueueProducerTypelessBase()
+			: inactive(false), token(nullptr)
+		{
+		}
+	};
+
+	template<bool use32> struct _hash_32_or_64 {
+		static inline std::uint32_t hash(std::uint32_t h)
+		{
+			// MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
+			// Since the thread ID is already unique, all we really want to do is propagate that
+			// uniqueness evenly across all the bits, so that we can use a subset of the bits while
+			// reducing collisions significantly
+			h ^= h >> 16;
+			h *= 0x85ebca6b;
+			h ^= h >> 13;
+			h *= 0xc2b2ae35;
+			return h ^ (h >> 16);
+		}
+	};
+	template<> struct _hash_32_or_64<1> {
+		static inline std::uint64_t hash(std::uint64_t h)
+		{
+			h ^= h >> 33;
+			h *= 0xff51afd7ed558ccd;
+			h ^= h >> 33;
+			h *= 0xc4ceb9fe1a85ec53;
+			return h ^ (h >> 33);
+		}
+	};
+	template<std::size_t size> struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {  };
+
+	static inline size_t hash_thread_id(thread_id_t id)
+	{
+		static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values");
+		return static_cast<size_t>(hash_32_or_64<sizeof(thread_id_t)>::hash(id));
+	}
+
+	template<typename T>
+	static inline bool circular_less_than(T a, T b)
+	{
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4554)
+#endif
+		static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "circular_less_than is intended to be used only with unsigned integer types");
+		return static_cast<T>(a - b) > static_cast<T>(static_cast<T>(1) << static_cast<T>(sizeof(T) * CHAR_BIT - 1));
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+	}
+
+	template<typename U>
+	static inline char* align_for(char* ptr)
+	{
+		const std::size_t alignment = std::alignment_of<U>::value;
+		return ptr + (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) % alignment;
+	}
+
+	template<typename T>
+	static inline T ceil_to_pow_2(T x)
+	{
+		static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types");
+
+		// Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		--x;
+		x |= x >> 1;
+		x |= x >> 2;
+		x |= x >> 4;
+		for (std::size_t i = 1; i < sizeof(T); i <<= 1) {
+			x |= x >> (i << 3);
+		}
+		++x;
+		return x;
+	}
+
+	template<typename T>
+	static inline void swap_relaxed(std::atomic<T>& left, std::atomic<T>& right)
+	{
+		T temp = std::move(left.load(std::memory_order_relaxed));
+		left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed);
+		right.store(std::move(temp), std::memory_order_relaxed);
+	}
+
+	template<typename T>
+	static inline T const& nomove(T const& x)
+	{
+		return x;
+	}
+
+	template<bool Enable>
+	struct nomove_if
+	{
+		template<typename T>
+		static inline T const& eval(T const& x)
+		{
+			return x;
+		}
+	};
+
+	template<>
+	struct nomove_if<false>
+	{
+		template<typename U>
+		static inline auto eval(U&& x)
+			-> decltype(std::forward<U>(x))
+		{
+			return std::forward<U>(x);
+		}
+	};
+
+	template<typename It>
+	static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it)
+	{
+		return *it;
+	}
+
+#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+	template<typename T> struct is_trivially_destructible : std::is_trivially_destructible<T> { };
+#else
+	template<typename T> struct is_trivially_destructible : std::has_trivial_destructor<T> { };
+#endif
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+	struct ThreadExitListener
+	{
+		typedef void (*callback_t)(void*);
+		callback_t callback;
+		void* userData;
+
+		ThreadExitListener* next;		// reserved for use by the ThreadExitNotifier
+	};
+
+
+	class ThreadExitNotifier
+	{
+	public:
+		static void subscribe(ThreadExitListener* listener)
+		{
+			auto& tlsInst = instance();
+			listener->next = tlsInst.tail;
+			tlsInst.tail = listener;
+		}
+
+		static void unsubscribe(ThreadExitListener* listener)
+		{
+			auto& tlsInst = instance();
+			ThreadExitListener** prev = &tlsInst.tail;
+			for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) {
+				if (ptr == listener) {
+					*prev = ptr->next;
+					break;
+				}
+				prev = &ptr->next;
+			}
+		}
+
+	private:
+		ThreadExitNotifier() : tail(nullptr) { }
+		ThreadExitNotifier(ThreadExitNotifier const&) = delete;
+		ThreadExitNotifier& operator=(ThreadExitNotifier const&) = delete;
+
+		~ThreadExitNotifier()
+		{
+			// This thread is about to exit, let everyone know!
+			assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined.");
+			for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) {
+				ptr->callback(ptr->userData);
+			}
+		}
+
+		// Thread-local
+		static inline ThreadExitNotifier& instance()
+		{
+			static thread_local ThreadExitNotifier notifier;
+			return notifier;
+		}
+
+	private:
+		ThreadExitListener* tail;
+	};
+	#endif
+
+	template<typename T> struct static_is_lock_free_num { enum { value = 0 }; };
+	template<> struct static_is_lock_free_num<signed char> { enum { value = ATOMIC_CHAR_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<short> { enum { value = ATOMIC_SHORT_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<int> { enum { value = ATOMIC_INT_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<long> { enum { value = ATOMIC_LONG_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<long long> { enum { value = ATOMIC_LLONG_LOCK_FREE }; };
+	template<typename T> struct static_is_lock_free : static_is_lock_free_num<typename std::make_signed<T>::type> {  };
+	template<> struct static_is_lock_free<bool> { enum { value = ATOMIC_BOOL_LOCK_FREE }; };
+	template<typename U> struct static_is_lock_free<U*> { enum { value = ATOMIC_POINTER_LOCK_FREE }; };
+}
+
+
+struct ProducerToken
+{
+	template<typename T, typename Traits>
+	explicit ProducerToken(ConcurrentQueue<T, Traits>& queue);
+
+	explicit ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+		: producer(other.producer)
+	{
+		other.producer = nullptr;
+		if (producer != nullptr) {
+			producer->token = this;
+		}
+	}
+
+	inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+	{
+		swap(other);
+		return *this;
+	}
+
+	void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT
+	{
+		std::swap(producer, other.producer);
+		if (producer != nullptr) {
+			producer->token = this;
+		}
+		if (other.producer != nullptr) {
+			other.producer->token = &other;
+		}
+	}
+
+	// A token is always valid unless:
+	//     1) Memory allocation failed during construction
+	//     2) It was moved via the move constructor
+	//        (Note: assignment does a swap, leaving both potentially valid)
+	//     3) The associated queue was destroyed
+	// Note that if valid() returns true, that only indicates
+	// that the token is valid for use with a specific queue,
+	// but not which one; that's up to the user to track.
+	inline bool valid() const { return producer != nullptr; }
+
+	~ProducerToken()
+	{
+		if (producer != nullptr) {
+			producer->token = nullptr;
+			producer->inactive.store(true, std::memory_order_release);
+		}
+	}
+
+	// Disable copying and assignment
+	ProducerToken(ProducerToken const&) = delete;
+	ProducerToken& operator=(ProducerToken const&) = delete;
+
+private:
+	template<typename T, typename Traits> friend class ConcurrentQueue;
+	friend class ConcurrentQueueTests;
+
+protected:
+	details::ConcurrentQueueProducerTypelessBase* producer;
+};
+
+
+struct ConsumerToken
+{
+	template<typename T, typename Traits>
+	explicit ConsumerToken(ConcurrentQueue<T, Traits>& q);
+
+	explicit ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
+		: initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer)
+	{
+	}
+
+	inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
+	{
+		swap(other);
+		return *this;
+	}
+
+	void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT
+	{
+		std::swap(initialOffset, other.initialOffset);
+		std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset);
+		std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent);
+		std::swap(currentProducer, other.currentProducer);
+		std::swap(desiredProducer, other.desiredProducer);
+	}
+
+	// Disable copying and assignment
+	ConsumerToken(ConsumerToken const&) = delete;
+	ConsumerToken& operator=(ConsumerToken const&) = delete;
+
+private:
+	template<typename T, typename Traits> friend class ConcurrentQueue;
+	friend class ConcurrentQueueTests;
+
+private: // but shared with ConcurrentQueue
+	std::uint32_t initialOffset;
+	std::uint32_t lastKnownGlobalOffset;
+	std::uint32_t itemsConsumedFromCurrent;
+	details::ConcurrentQueueProducerTypelessBase* currentProducer;
+	details::ConcurrentQueueProducerTypelessBase* desiredProducer;
+};
+
+// Need to forward-declare this swap because it's in a namespace.
+// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT;
+
+
+template<typename T, typename Traits = ConcurrentQueueDefaultTraits>
+class ConcurrentQueue
+{
+public:
+	typedef ::moodycamel::ProducerToken producer_token_t;
+	typedef ::moodycamel::ConsumerToken consumer_token_t;
+
+	typedef typename Traits::index_t index_t;
+	typedef typename Traits::size_t size_t;
+
+	static const size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE);
+	static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
+	static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
+	static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
+	static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);
+	static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast<std::uint32_t>(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4307)		// + integral constant overflow (that's what the ternary expression is for!)
+#pragma warning(disable: 4309)		// static_cast: Truncation of constant value
+#endif
+	static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max<size_t>::value - static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max<size_t>::value : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+	static_assert(!std::numeric_limits<size_t>::is_signed && std::is_integral<size_t>::value, "Traits::size_t must be an unsigned integral type");
+	static_assert(!std::numeric_limits<index_t>::is_signed && std::is_integral<index_t>::value, "Traits::index_t must be an unsigned integral type");
+	static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t");
+	static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)");
+	static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)");
+	static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+	static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+	static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
+	static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)");
+
+public:
+	// Creates a queue with at least `capacity` element slots; note that the
+	// actual number of elements that can be inserted without additional memory
+	// allocation depends on the number of producers and the block size (e.g. if
+	// the block size is equal to `capacity`, only a single block will be allocated
+	// up-front, which means only a single producer will be able to enqueue elements
+	// without an extra allocation -- blocks aren't shared between producers).
+	// This method is not thread safe -- it is up to the user to ensure that the
+	// queue is fully constructed before it starts being used by other threads (this
+	// includes making the memory effects of construction visible, possibly with a
+	// memory barrier).
+	explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE)
+		: producerListTail(nullptr),
+		producerCount(0),
+		initialBlockPoolIndex(0),
+		nextExplicitConsumerId(0),
+		globalExplicitConsumerOffset(0)
+	{
+		implicitProducerHashResizeInProgress.clear();
+		populate_initial_implicit_producer_hash();
+		populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1));
+	}
+
+	// Note: The queue should not be accessed concurrently while it's
+	// being deleted. It's up to the user to synchronize this.
+	// This method is not thread safe.
+	~ConcurrentQueue()
+	{
+		// Destroy producers
+		auto ptr = producerListTail.load(std::memory_order_relaxed);
+		while (ptr != nullptr) {
+			auto next = ptr->next_prod();
+			if (ptr->token != nullptr) {
+				ptr->token->producer = nullptr;
+			}
+			destroy(ptr);
+			ptr = next;
+		}
+
+		// Destroy implicit producer hash tables
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
+			auto hash = implicitProducerHash.load(std::memory_order_relaxed);
+			while (hash != nullptr) {
+				auto prev = hash->prev;
+				if (prev != nullptr) {		// The last hash is part of this object and was not allocated dynamically
+					for (size_t i = 0; i != hash->capacity; ++i) {
+						hash->entries[i].~ImplicitProducerKVP();
+					}
+					hash->~ImplicitProducerHash();
+					Traits::free(hash);
+				}
+				hash = prev;
+			}
+		}
+
+		// Destroy global free list
+		auto block = freeList.head_unsafe();
+		while (block != nullptr) {
+			auto next = block->freeListNext.load(std::memory_order_relaxed);
+			if (block->dynamicallyAllocated) {
+				destroy(block);
+			}
+			block = next;
+		}
+
+		// Destroy initial free list
+		destroy_array(initialBlockPool, initialBlockPoolSize);
+	}
+
+	// Disable copying and copy assignment
+	ConcurrentQueue(ConcurrentQueue const&) = delete;
+	ConcurrentQueue& operator=(ConcurrentQueue const&) = delete;
+
+	// Moving is supported, but note that it is *not* a thread-safe operation.
+	// Nobody can use the queue while it's being moved, and the memory effects
+	// of that move must be propagated to other threads before they can use it.
+	// Note: When a queue is moved, its tokens are still valid but can only be
+	// used with the destination queue (i.e. semantically they are moved along
+	// with the queue itself).
+	ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+		: producerListTail(other.producerListTail.load(std::memory_order_relaxed)),
+		producerCount(other.producerCount.load(std::memory_order_relaxed)),
+		initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)),
+		initialBlockPool(other.initialBlockPool),
+		initialBlockPoolSize(other.initialBlockPoolSize),
+		freeList(std::move(other.freeList)),
+		nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)),
+		globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed))
+	{
+		// Move the other one into this, and leave the other one as an empty queue
+		implicitProducerHashResizeInProgress.clear();
+		populate_initial_implicit_producer_hash();
+		swap_implicit_producer_hashes(other);
+
+		other.producerListTail.store(nullptr, std::memory_order_relaxed);
+		other.producerCount.store(0, std::memory_order_relaxed);
+		other.nextExplicitConsumerId.store(0, std::memory_order_relaxed);
+		other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed);
+
+		other.initialBlockPoolIndex.store(0, std::memory_order_relaxed);
+		other.initialBlockPoolSize = 0;
+		other.initialBlockPool = nullptr;
+
+		reown_producers();
+	}
+
+	inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+	{
+		return swap_internal(other);
+	}
+
+	// Swaps this queue's state with the other's. Not thread-safe.
+	// Swapping two queues does not invalidate their tokens, however
+	// the tokens that were created for one queue must be used with
+	// only the swapped queue (i.e. the tokens are tied to the
+	// queue's movable state, not the object itself).
+	inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
+	{
+		swap_internal(other);
+	}
+
+private:
+	ConcurrentQueue& swap_internal(ConcurrentQueue& other)
+	{
+		if (this == &other) {
+			return *this;
+		}
+
+		details::swap_relaxed(producerListTail, other.producerListTail);
+		details::swap_relaxed(producerCount, other.producerCount);
+		details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex);
+		std::swap(initialBlockPool, other.initialBlockPool);
+		std::swap(initialBlockPoolSize, other.initialBlockPoolSize);
+		freeList.swap(other.freeList);
+		details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId);
+		details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset);
+
+		swap_implicit_producer_hashes(other);
+
+		reown_producers();
+		other.reown_producers();
+
+		return *this;
+	}
+
+public:
+	// Enqueues a single item (by copying it).
+	// Allocates memory if required. Only fails if memory allocation fails (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+	// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(T const& item)
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+		return inner_enqueue<CanAlloc>(item);
+	}
+
+	// Enqueues a single item (by moving it, if possible).
+	// Allocates memory if required. Only fails if memory allocation fails (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+	// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(T&& item)
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+		return inner_enqueue<CanAlloc>(std::move(item));
+	}
+
+	// Enqueues a single item (by copying it) using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(producer_token_t const& token, T const& item)
+	{
+		return inner_enqueue<CanAlloc>(token, item);
+	}
+
+	// Enqueues a single item (by moving it, if possible) using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(producer_token_t const& token, T&& item)
+	{
+		return inner_enqueue<CanAlloc>(token, std::move(item));
+	}
+
+	// Enqueues several items.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
+	// is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Note: Use std::make_move_iterator if the elements should be moved instead of copied.
+	// Thread-safe.
+	template<typename It>
+	bool enqueue_bulk(It itemFirst, size_t count)
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+		return inner_enqueue_bulk<CanAlloc>(std::forward<It>(itemFirst), count);
+	}
+
+	// Enqueues several items using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails
+	// (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		return inner_enqueue_bulk<CanAlloc>(token, std::forward<It>(itemFirst), count);
+	}
+
+	// Enqueues a single item (by copying it).
+	// Does not allocate memory. Fails if not enough room to enqueue (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
+	// is 0).
+	// Thread-safe.
+	inline bool try_enqueue(T const& item)
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+		return inner_enqueue<CannotAlloc>(item);
+	}
+
+	// Enqueues a single item (by moving it, if possible).
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if not enough room to enqueue (or implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+	// Thread-safe.
+	inline bool try_enqueue(T&& item)
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+		return inner_enqueue<CannotAlloc>(std::move(item));
+	}
+
+	// Enqueues a single item (by copying it) using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Thread-safe.
+	inline bool try_enqueue(producer_token_t const& token, T const& item)
+	{
+		return inner_enqueue<CannotAlloc>(token, item);
+	}
+
+	// Enqueues a single item (by moving it, if possible) using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Thread-safe.
+	inline bool try_enqueue(producer_token_t const& token, T&& item)
+	{
+		return inner_enqueue<CannotAlloc>(token, std::move(item));
+	}
+
+	// Enqueues several items.
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if not enough room to enqueue (or implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	bool try_enqueue_bulk(It itemFirst, size_t count)
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+		return inner_enqueue_bulk<CannotAlloc>(std::forward<It>(itemFirst), count);
+	}
+
+	// Enqueues several items using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		return inner_enqueue_bulk<CannotAlloc>(token, std::forward<It>(itemFirst), count);
+	}
+
+
+
+	// Attempts to dequeue from the queue.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue(U& item)
+	{
+		// Instead of simply trying each producer in turn (which could cause needless contention on the first
+		// producer), we score them heuristically.
+		size_t nonEmptyCount = 0;
+		ProducerBase* best = nullptr;
+		size_t bestSize = 0;
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) {
+			auto size = ptr->size_approx();
+			if (size > 0) {
+				if (size > bestSize) {
+					bestSize = size;
+					best = ptr;
+				}
+				++nonEmptyCount;
+			}
+		}
+
+		// If there was at least one non-empty queue but it appears empty at the time
+		// we try to dequeue from it, we need to make sure every queue's been tried
+		if (nonEmptyCount > 0) {
+			if (details::likely(best->dequeue(item))) {
+				return true;
+			}
+			for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+				if (ptr != best && ptr->dequeue(item)) {
+					return true;
+				}
+			}
+		}
+		return false;
+	}
+
+	// Attempts to dequeue from the queue.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// This differs from the try_dequeue(item) method in that this one does
+	// not attempt to reduce contention by interleaving the order that producer
+	// streams are dequeued from. So, using this method can reduce overall throughput
+	// under contention, but will give more predictable results in single-threaded
+	// consumer scenarios.
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue_non_interleaved(U& item)
+	{
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+			if (ptr->dequeue(item)) {
+				return true;
+			}
+		}
+		return false;
+	}
+
+	// Attempts to dequeue from the queue using an explicit consumer token.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue(consumer_token_t& token, U& item)
+	{
+		// The idea is roughly as follows:
+		// Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less
+		// If you see that the global offset has changed, you must reset your consumption counter and move to your designated place
+		// If there's no items where you're supposed to be, keep moving until you find a producer with some items
+		// If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it
+
+		if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+			if (!update_current_producer_after_rotation(token)) {
+				return false;
+			}
+		}
+
+		// If there was at least one non-empty queue but it appears empty at the time
+		// we try to dequeue from it, we need to make sure every queue's been tried
+		if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item)) {
+			if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+				globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+			}
+			return true;
+		}
+
+		auto tail = producerListTail.load(std::memory_order_acquire);
+		auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+		if (ptr == nullptr) {
+			ptr = tail;
+		}
+		while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
+			if (ptr->dequeue(item)) {
+				token.currentProducer = ptr;
+				token.itemsConsumedFromCurrent = 1;
+				return true;
+			}
+			ptr = ptr->next_prod();
+			if (ptr == nullptr) {
+				ptr = tail;
+			}
+		}
+		return false;
+	}
+
+	// Attempts to dequeue several elements from the queue.
+	// Returns the number of items actually dequeued.
+	// Returns 0 if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	size_t try_dequeue_bulk(It itemFirst, size_t max)
+	{
+		size_t count = 0;
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+			count += ptr->dequeue_bulk(itemFirst, max - count);
+			if (count == max) {
+				break;
+			}
+		}
+		return count;
+	}
+
+	// Attempts to dequeue several elements from the queue using an explicit consumer token.
+	// Returns the number of items actually dequeued.
+	// Returns 0 if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
+	{
+		if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+			if (!update_current_producer_after_rotation(token)) {
+				return false;
+			}
+		}
+
+		size_t count = static_cast<ProducerBase*>(token.currentProducer)->dequeue_bulk(itemFirst, max);
+		if (count == max) {
+			if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+				globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+			}
+			return max;
+		}
+		token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count);
+		max -= count;
+
+		auto tail = producerListTail.load(std::memory_order_acquire);
+		auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+		if (ptr == nullptr) {
+			ptr = tail;
+		}
+		while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
+			auto dequeued = ptr->dequeue_bulk(itemFirst, max);
+			count += dequeued;
+			if (dequeued != 0) {
+				token.currentProducer = ptr;
+				token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued);
+			}
+			if (dequeued == max) {
+				break;
+			}
+			max -= dequeued;
+			ptr = ptr->next_prod();
+			if (ptr == nullptr) {
+				ptr = tail;
+			}
+		}
+		return count;
+	}
+
+
+
+	// Attempts to dequeue from a specific producer's inner queue.
+	// If you happen to know which producer you want to dequeue from, this
+	// is significantly faster than using the general-case try_dequeue methods.
+	// Returns false if the producer's queue appeared empty at the time it
+	// was checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item)
+	{
+		return static_cast<ExplicitProducer*>(producer.producer)->dequeue(item);
+	}
+
+	// Attempts to dequeue several elements from a specific producer's inner queue.
+	// Returns the number of items actually dequeued.
+	// If you happen to know which producer you want to dequeue from, this
+	// is significantly faster than using the general-case try_dequeue methods.
+	// Returns 0 if the producer's queue appeared empty at the time it
+	// was checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max)
+	{
+		return static_cast<ExplicitProducer*>(producer.producer)->dequeue_bulk(itemFirst, max);
+	}
+
+
+	// Returns an estimate of the total number of elements currently in the queue. This
+	// estimate is only accurate if the queue has completely stabilized before it is called
+	// (i.e. all enqueue and dequeue operations have completed and their memory effects are
+	// visible on the calling thread, and no further operations start while this method is
+	// being called).
+	// Thread-safe.
+	size_t size_approx() const
+	{
+		size_t size = 0;
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+			size += ptr->size_approx();
+		}
+		return size;
+	}
+
+
+	// Returns true if the underlying atomic variables used by
+	// the queue are lock-free (they should be on most platforms).
+	// Thread-safe.
+	static bool is_lock_free()
+	{
+		return
+			details::static_is_lock_free<bool>::value == 2 &&
+			details::static_is_lock_free<size_t>::value == 2 &&
+			details::static_is_lock_free<std::uint32_t>::value == 2 &&
+			details::static_is_lock_free<index_t>::value == 2 &&
+			details::static_is_lock_free<void*>::value == 2 &&
+			details::static_is_lock_free<details::thread_id_t>::value == 2;
+	}
+
+
+private:
+	friend struct ProducerToken;
+	friend struct ConsumerToken;
+	friend struct ExplicitProducer;
+	friend class ConcurrentQueueTests;
+
+	enum AllocationMode { CanAlloc, CannotAlloc };
+
+
+	///////////////////////////////
+	// Queue methods
+	///////////////////////////////
+
+	template<AllocationMode canAlloc, typename U>
+	inline bool inner_enqueue(producer_token_t const& token, U&& element)
+	{
+		return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));
+	}
+
+	template<AllocationMode canAlloc, typename U>
+	inline bool inner_enqueue(U&& element)
+	{
+		auto producer = get_or_add_implicit_producer();
+		return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));
+	}
+
+	template<AllocationMode canAlloc, typename It>
+	inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<canAlloc>(std::forward<It>(itemFirst), count);
+	}
+
+	template<AllocationMode canAlloc, typename It>
+	inline bool inner_enqueue_bulk(It itemFirst, size_t count)
+	{
+		auto producer = get_or_add_implicit_producer();
+		return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk<canAlloc>(std::forward<It>(itemFirst), count);
+	}
+
+	inline bool update_current_producer_after_rotation(consumer_token_t& token)
+	{
+		// Ah, there's been a rotation, figure out where we should be!
+		auto tail = producerListTail.load(std::memory_order_acquire);
+		if (token.desiredProducer == nullptr && tail == nullptr) {
+			return false;
+		}
+		auto prodCount = producerCount.load(std::memory_order_relaxed);
+		auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed);
+		if (details::unlikely(token.desiredProducer == nullptr)) {
+			// Aha, first time we're dequeueing anything.
+			// Figure out our local position
+			// Note: offset is from start, not end, but we're traversing from end -- subtract from count first
+			std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount);
+			token.desiredProducer = tail;
+			for (std::uint32_t i = 0; i != offset; ++i) {
+				token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
+				if (token.desiredProducer == nullptr) {
+					token.desiredProducer = tail;
+				}
+			}
+		}
+
+		std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset;
+		if (delta >= prodCount) {
+			delta = delta % prodCount;
+		}
+		for (std::uint32_t i = 0; i != delta; ++i) {
+			token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
+			if (token.desiredProducer == nullptr) {
+				token.desiredProducer = tail;
+			}
+		}
+
+		token.lastKnownGlobalOffset = globalOffset;
+		token.currentProducer = token.desiredProducer;
+		token.itemsConsumedFromCurrent = 0;
+		return true;
+	}
+
+
+	///////////////////////////
+	// Free list
+	///////////////////////////
+
+	template <typename N>
+	struct FreeListNode
+	{
+		FreeListNode() : freeListRefs(0), freeListNext(nullptr) { }
+
+		std::atomic<std::uint32_t> freeListRefs;
+		std::atomic<N*> freeListNext;
+	};
+
+	// A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but
+	// simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly
+	// speedy under low contention.
+	template<typename N>		// N must inherit FreeListNode or have the same fields (and initialization of them)
+	struct FreeList
+	{
+		FreeList() : freeListHead(nullptr) { }
+		FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); }
+		void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); }
+
+		FreeList(FreeList const&) = delete;
+		FreeList& operator=(FreeList const&) = delete;
+
+		inline void add(N* node)
+		{
+#if MCDBGQ_NOLOCKFREE_FREELIST
+			debug::DebugLock lock(mutex);
+#endif
+			// We know that the should-be-on-freelist bit is 0 at this point, so it's safe to
+			// set it using a fetch_add
+			if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) {
+				// Oh look! We were the last ones referencing this node, and we know
+				// we want to add it to the free list, so let's do it!
+		 		add_knowing_refcount_is_zero(node);
+			}
+		}
+
+		inline N* try_get()
+		{
+#if MCDBGQ_NOLOCKFREE_FREELIST
+			debug::DebugLock lock(mutex);
+#endif
+			auto head = freeListHead.load(std::memory_order_acquire);
+			while (head != nullptr) {
+				auto prevHead = head;
+				auto refs = head->freeListRefs.load(std::memory_order_relaxed);
+				if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) {
+					head = freeListHead.load(std::memory_order_acquire);
+					continue;
+				}
+
+				// Good, reference count has been incremented (it wasn't at zero), which means we can read the
+				// next and not worry about it changing between now and the time we do the CAS
+				auto next = head->freeListNext.load(std::memory_order_relaxed);
+				if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) {
+					// Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no
+					// matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on).
+					assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0);
+
+					// Decrease refcount twice, once for our ref, and once for the list's ref
+					head->freeListRefs.fetch_add(-2, std::memory_order_release);
+					return head;
+				}
+
+				// OK, the head must have changed on us, but we still need to decrease the refcount we increased.
+				// Note that we don't need to release any memory effects, but we do need to ensure that the reference
+				// count decrement happens-after the CAS on the head.
+				refs = prevHead->freeListRefs.fetch_add(-1, std::memory_order_acq_rel);
+				if (refs == SHOULD_BE_ON_FREELIST + 1) {
+					add_knowing_refcount_is_zero(prevHead);
+				}
+			}
+
+			return nullptr;
+		}
+
+		// Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes)
+		N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); }
+
+	private:
+		inline void add_knowing_refcount_is_zero(N* node)
+		{
+			// Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run
+			// only one copy of this method per node at a time, i.e. the single thread case), then we know
+			// we can safely change the next pointer of the node; however, once the refcount is back above
+			// zero, then other threads could increase it (happens under heavy contention, when the refcount
+			// goes to zero in between a load and a refcount increment of a node in try_get, then back up to
+			// something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS
+			// to add the node to the actual list fails, decrease the refcount and leave the add operation to
+			// the next thread who puts the refcount back at zero (which could be us, hence the loop).
+			auto head = freeListHead.load(std::memory_order_relaxed);
+			while (true) {
+				node->freeListNext.store(head, std::memory_order_relaxed);
+				node->freeListRefs.store(1, std::memory_order_release);
+				if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) {
+					// Hmm, the add failed, but we can only try again when the refcount goes back to zero
+					if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) {
+						continue;
+					}
+				}
+				return;
+			}
+		}
+
+	private:
+		// Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention)
+		std::atomic<N*> freeListHead;
+
+	static const std::uint32_t REFS_MASK = 0x7FFFFFFF;
+	static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000;
+
+#if MCDBGQ_NOLOCKFREE_FREELIST
+		debug::DebugMutex mutex;
+#endif
+	};
+
+
+	///////////////////////////
+	// Block
+	///////////////////////////
+
+	enum InnerQueueContext { implicit_context = 0, explicit_context = 1 };
+
+	struct Block
+	{
+		Block()
+			: elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true)
+		{
+#if MCDBGQ_TRACKMEM
+			owner = nullptr;
+#endif
+		}
+
+		template<InnerQueueContext context>
+		inline bool is_empty() const
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Check flags
+				for (size_t i = 0; i < BLOCK_SIZE; ++i) {
+					if (!emptyFlags[i].load(std::memory_order_relaxed)) {
+						return false;
+					}
+				}
+
+				// Aha, empty; make sure we have all other memory effects that happened before the empty flags were set
+				std::atomic_thread_fence(std::memory_order_acquire);
+				return true;
+			}
+			else {
+				// Check counter
+				if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) {
+					std::atomic_thread_fence(std::memory_order_acquire);
+					return true;
+				}
+				assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE);
+				return false;
+			}
+		}
+
+		// Returns true if the block is now empty (does not apply in explicit context)
+		template<InnerQueueContext context>
+		inline bool set_empty(index_t i)
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set flag
+				assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].load(std::memory_order_relaxed));
+				emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].store(true, std::memory_order_release);
+				return false;
+			}
+			else {
+				// Increment counter
+				auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release);
+				assert(prevVal < BLOCK_SIZE);
+				return prevVal == BLOCK_SIZE - 1;
+			}
+		}
+
+		// Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0).
+		// Returns true if the block is now empty (does not apply in explicit context).
+		template<InnerQueueContext context>
+		inline bool set_many_empty(index_t i, size_t count)
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set flags
+				std::atomic_thread_fence(std::memory_order_release);
+				i = BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1)) - count + 1;
+				for (size_t j = 0; j != count; ++j) {
+					assert(!emptyFlags[i + j].load(std::memory_order_relaxed));
+					emptyFlags[i + j].store(true, std::memory_order_relaxed);
+				}
+				return false;
+			}
+			else {
+				// Increment counter
+				auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release);
+				assert(prevVal + count <= BLOCK_SIZE);
+				return prevVal + count == BLOCK_SIZE;
+			}
+		}
+
+		template<InnerQueueContext context>
+		inline void set_all_empty()
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set all flags
+				for (size_t i = 0; i != BLOCK_SIZE; ++i) {
+					emptyFlags[i].store(true, std::memory_order_relaxed);
+				}
+			}
+			else {
+				// Reset counter
+				elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed);
+			}
+		}
+
+		template<InnerQueueContext context>
+		inline void reset_empty()
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Reset flags
+				for (size_t i = 0; i != BLOCK_SIZE; ++i) {
+					emptyFlags[i].store(false, std::memory_order_relaxed);
+				}
+			}
+			else {
+				// Reset counter
+				elementsCompletelyDequeued.store(0, std::memory_order_relaxed);
+			}
+		}
+
+		inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return reinterpret_cast<T*>(elements) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); }
+		inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return reinterpret_cast<T*>(elements) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); }
+
+	public:
+		Block* next;
+		std::atomic<size_t> elementsCompletelyDequeued;
+		std::atomic<bool> emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? BLOCK_SIZE : 1];
+	private:
+		char elements[sizeof(T) * BLOCK_SIZE];
+	public:
+		std::atomic<std::uint32_t> freeListRefs;
+		std::atomic<Block*> freeListNext;
+		std::atomic<bool> shouldBeOnFreeList;
+		bool dynamicallyAllocated;		// Perhaps a better name for this would be 'isNotPartOfInitialBlockPool'
+
+#if MCDBGQ_TRACKMEM
+		void* owner;
+#endif
+	};
+
+
+#if MCDBGQ_TRACKMEM
+public:
+	struct MemStats;
+private:
+#endif
+
+	///////////////////////////
+	// Producer base
+	///////////////////////////
+
+	struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase
+	{
+		ProducerBase(ConcurrentQueue* parent, bool isExplicit) :
+			tailIndex(0),
+			headIndex(0),
+			dequeueOptimisticCount(0),
+			dequeueOvercommit(0),
+			tailBlock(nullptr),
+			isExplicit(isExplicit),
+			parent(parent)
+		{
+		}
+
+		virtual ~ProducerBase() { };
+
+		template<typename U>
+		inline bool dequeue(U& element)
+		{
+			if (isExplicit) {
+				return static_cast<ExplicitProducer*>(this)->dequeue(element);
+			}
+			else {
+				return static_cast<ImplicitProducer*>(this)->dequeue(element);
+			}
+		}
+
+		template<typename It>
+		inline size_t dequeue_bulk(It& itemFirst, size_t max)
+		{
+			if (isExplicit) {
+				return static_cast<ExplicitProducer*>(this)->dequeue_bulk(itemFirst, max);
+			}
+			else {
+				return static_cast<ImplicitProducer*>(this)->dequeue_bulk(itemFirst, max);
+			}
+		}
+
+		inline ProducerBase* next_prod() const { return static_cast<ProducerBase*>(next); }
+
+		inline size_t size_approx() const
+		{
+			auto tail = tailIndex.load(std::memory_order_relaxed);
+			auto head = headIndex.load(std::memory_order_relaxed);
+			return details::circular_less_than(head, tail) ? static_cast<size_t>(tail - head) : 0;
+		}
+
+		inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); }
+	protected:
+		std::atomic<index_t> tailIndex;		// Where to enqueue to next
+		std::atomic<index_t> headIndex;		// Where to dequeue from next
+
+		std::atomic<index_t> dequeueOptimisticCount;
+		std::atomic<index_t> dequeueOvercommit;
+
+		Block* tailBlock;
+
+	public:
+		bool isExplicit;
+		ConcurrentQueue* parent;
+
+	protected:
+#if MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+
+
+	///////////////////////////
+	// Explicit queue
+	///////////////////////////
+
+	struct ExplicitProducer : public ProducerBase
+	{
+		explicit ExplicitProducer(ConcurrentQueue* parent) :
+			ProducerBase(parent, true),
+			blockIndex(nullptr),
+			pr_blockIndexSlotsUsed(0),
+			pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1),
+			pr_blockIndexFront(0),
+			pr_blockIndexEntries(nullptr),
+			pr_blockIndexRaw(nullptr)
+		{
+			size_t poolBasedIndexSize = details::ceil_to_pow_2(parent->initialBlockPoolSize) >> 1;
+			if (poolBasedIndexSize > pr_blockIndexSize) {
+				pr_blockIndexSize = poolBasedIndexSize;
+			}
+
+			new_block_index(0);		// This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE
+		}
+
+		~ExplicitProducer()
+		{
+			// Destruct any elements not yet dequeued.
+			// Since we're in the destructor, we can assume all elements
+			// are either completely dequeued or completely not (no halfways).
+			if (this->tailBlock != nullptr) {		// Note this means there must be a block index too
+				// First find the block that's partially dequeued, if any
+				Block* halfDequeuedBlock = nullptr;
+				if (this->headIndex.load(std::memory_order_relaxed) != this->tailIndex.load(std::memory_order_relaxed) && (this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) != 0) {
+					// The head's not on a block boundary, meaning a block somewhere is partially dequeued
+					size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1);
+					while (details::circular_less_than<index_t>(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) {
+						i = (i + 1) & (pr_blockIndexSize - 1);
+					}
+					halfDequeuedBlock = pr_blockIndexEntries[i].block;
+				}
+
+				// Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration)
+				auto block = this->tailBlock;
+				do {
+					block = block->next;
+					if (block->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+						continue;
+					}
+
+					size_t i = 0;	// Offset into block
+					if (block == halfDequeuedBlock) {
+						i = static_cast<size_t>(this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1));
+					}
+
+					// Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index
+					auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast<size_t>(this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1));
+					while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) {
+						(*block)[i++]->~T();
+					}
+				} while (block != this->tailBlock);
+			}
+
+			// Destroy all blocks that we own
+			if (this->tailBlock != nullptr) {
+				auto block = this->tailBlock;
+				do {
+					auto next = block->next;
+					if (block->dynamicallyAllocated) {
+						destroy(block);
+					}
+					block = next;
+				} while (block != this->tailBlock);
+			}
+
+			// Destroy the block indices
+			auto header = static_cast<BlockIndexHeader*>(pr_blockIndexRaw);
+			while (header != nullptr) {
+				auto prev = static_cast<BlockIndexHeader*>(header->prev);
+				header->~BlockIndexHeader();
+				Traits::free(header);
+				header = prev;
+			}
+		}
+
+		template<AllocationMode allocMode, typename U>
+		inline bool enqueue(U&& element)
+		{
+			index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			index_t newTailIndex = 1 + currentTailIndex;
+			if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+				// We reached the end of a block, start a new one
+				auto startBlock = this->tailBlock;
+				auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
+				if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+					// We can re-use the block ahead of us, it's empty!
+					this->tailBlock = this->tailBlock->next;
+					this->tailBlock->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+
+					// We'll put the block on the block index (guaranteed to be room since we're conceptually removing the
+					// last block from it first -- except instead of removing then adding, we can just overwrite).
+					// Note that there must be a valid block index here, since even if allocation failed in the ctor,
+					// it would have been re-attempted when adding the first block to the queue; since there is such
+					// a block, a block index must have been successfully allocated.
+				}
+				else {
+					// Whatever head value we see here is >= the last value we saw here (relatively),
+					// and <= its current value. Since we have the most recent tail, the head must be
+					// <= to it.
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					if (!details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE)
+						|| (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
+						// We can't enqueue in another block because there's not enough leeway -- the
+						// tail could surpass the head by the time the block fills up! (Or we'll exceed
+						// the size limit, if the second part of the condition was true.)
+						return false;
+					}
+					// We're going to need a new block; check that the block index has room
+					if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) {
+						// Hmm, the circular block index is already full -- we'll need
+						// to allocate a new index. Note pr_blockIndexRaw can only be nullptr if
+						// the initial allocation failed in the constructor.
+
+						if (allocMode == CannotAlloc || !new_block_index(pr_blockIndexSlotsUsed)) {
+							return false;
+						}
+					}
+
+					// Insert a new block in the circular linked list
+					auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+					if (newBlock == nullptr) {
+						return false;
+					}
+#if MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+					if (this->tailBlock == nullptr) {
+						newBlock->next = newBlock;
+					}
+					else {
+						newBlock->next = this->tailBlock->next;
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					++pr_blockIndexSlotsUsed;
+				}
+
+				if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (nullptr) T(std::forward<U>(element)))) {
+					// The constructor may throw. We want the element not to appear in the queue in
+					// that case (without corrupting the queue):
+					MOODYCAMEL_TRY {
+						new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+					}
+					MOODYCAMEL_CATCH (...) {
+						// Revert change to the current block, but leave the new block available
+						// for next time
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock;
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				else {
+					(void)startBlock;
+					(void)originalBlockIndexSlotsUsed;
+				}
+
+				// Add block to block index
+				auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+				entry.base = currentTailIndex;
+				entry.block = this->tailBlock;
+				blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release);
+				pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+
+				if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (nullptr) T(std::forward<U>(element)))) {
+					this->tailIndex.store(newTailIndex, std::memory_order_release);
+					return true;
+				}
+			}
+
+			// Enqueue
+			new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+
+		template<typename U>
+		bool dequeue(U& element)
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			if (details::circular_less_than<index_t>(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
+				// Might be something to dequeue, let's give it a try
+
+				// Note that this if is purely for performance purposes in the common case when the queue is
+				// empty and the values are eventually consistent -- we may enter here spuriously.
+
+				// Note that whatever the values of overcommit and tail are, they are not going to change (unless we
+				// change them) and must be the same value at this point (inside the if) as when the if condition was
+				// evaluated.
+
+				// We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below.
+				// This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in
+				// the fetch_add below will result in a value at least as recent as that (and therefore at least as large).
+				// Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all
+				// read-modify-write operations are guaranteed to work on the latest value in the modification order), but
+				// unfortunately that can't be shown to be correct using only the C++11 standard.
+				// See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case
+				std::atomic_thread_fence(std::memory_order_acquire);
+
+				// Increment optimistic counter, then check if it went over the boundary
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
+
+				// Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever
+				// incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now
+				// have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon
+				// incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount.
+				assert(overcommit <= myDequeueCount);
+
+				// Note that we reload tail here in case it changed; it will be the same value as before or greater, since
+				// this load is sequenced after (happens after) the earlier load above. This is supported by read-read
+				// coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				if (details::likely(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
+					// Guaranteed to be at least one element to dequeue!
+
+					// Get the index. Note that since there's guaranteed to be at least one element, this
+					// will never exceed tail.
+					auto index = this->headIndex.fetch_add(1, std::memory_order_relaxed);
+
+
+					// Determine which block the element is in
+
+					auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+					auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
+
+					// We need to be careful here about subtracting and dividing because of index wrap-around.
+					// When an index wraps, we need to preserve the sign of the offset when dividing it by the
+					// block size (in order to get a correct signed block count offset in all cases):
+					auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+					auto blockBaseIndex = index & ~static_cast<index_t>(BLOCK_SIZE - 1);
+					auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(blockBaseIndex - headBase) / BLOCK_SIZE);
+					auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block;
+
+					// Dequeue
+					auto& el = *((*block)[index]);
+					if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
+						// Make sure the element is still fully dequeued and destroyed even if the assignment
+						// throws
+						struct Guard {
+							Block* block;
+							index_t index;
+
+							~Guard()
+							{
+								(*block)[index]->~T();
+								block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+							}
+						} guard = { block, index };
+
+						element = std::move(el);
+					}
+					else {
+						element = std::move(el);
+						el.~T();
+						block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+					}
+
+					return true;
+				}
+				else {
+					// Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
+					this->dequeueOvercommit.fetch_add(1, std::memory_order_release);		// Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write
+				}
+			}
+
+			return false;
+		}
+
+		template<AllocationMode allocMode, typename It>
+		bool enqueue_bulk(It itemFirst, size_t count)
+		{
+			// First, we need to make sure we have enough room to enqueue all of the elements;
+			// this means pre-allocating blocks and putting them in the block index (but only if
+			// all the allocations succeeded).
+			index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			auto startBlock = this->tailBlock;
+			auto originalBlockIndexFront = pr_blockIndexFront;
+			auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
+
+			Block* firstAllocatedBlock = nullptr;
+
+			// Figure out how many blocks we'll need to allocate, and do so
+			size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));
+			index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+			if (blockBaseDiff > 0) {
+				// Allocate as many blocks as possible from ahead
+				while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+
+					this->tailBlock = this->tailBlock->next;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock;
+
+					auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+					entry.base = currentTailIndex;
+					entry.block = this->tailBlock;
+					pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+				}
+
+				// Now allocate as many blocks as necessary from the block pool
+				while (blockBaseDiff > 0) {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					bool full = !details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
+					if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) {
+						if (allocMode == CannotAlloc || full || !new_block_index(originalBlockIndexSlotsUsed)) {
+							// Failed to allocate, undo changes (but keep injected blocks)
+							pr_blockIndexFront = originalBlockIndexFront;
+							pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+							this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+							return false;
+						}
+
+						// pr_blockIndexFront is updated inside new_block_index, so we need to
+						// update our fallback value too (since we keep the new index even if we
+						// later fail)
+						originalBlockIndexFront = originalBlockIndexSlotsUsed;
+					}
+
+					// Insert a new block in the circular linked list
+					auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+					if (newBlock == nullptr) {
+						pr_blockIndexFront = originalBlockIndexFront;
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+						return false;
+					}
+
+#if MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template set_all_empty<explicit_context>();
+					if (this->tailBlock == nullptr) {
+						newBlock->next = newBlock;
+					}
+					else {
+						newBlock->next = this->tailBlock->next;
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock;
+
+					++pr_blockIndexSlotsUsed;
+
+					auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+					entry.base = currentTailIndex;
+					entry.block = this->tailBlock;
+					pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+				}
+
+				// Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and
+				// publish the new block index front
+				auto block = firstAllocatedBlock;
+				while (true) {
+					block->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+					if (block == this->tailBlock) {
+						break;
+					}
+					block = block->next;
+				}
+
+				if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (nullptr) T(details::deref_noexcept(itemFirst)))) {
+					blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release);
+				}
+			}
+
+			// Enqueue, one block at a time
+			index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
+			currentTailIndex = startTailIndex;
+			auto endBlock = this->tailBlock;
+			this->tailBlock = startBlock;
+			assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0);
+			if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) {
+				this->tailBlock = firstAllocatedBlock;
+			}
+			while (true) {
+				auto stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+				if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
+					stopIndex = newTailIndex;
+				}
+				if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (nullptr) T(details::deref_noexcept(itemFirst)))) {
+					while (currentTailIndex != stopIndex) {
+						new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
+					}
+				}
+				else {
+					MOODYCAMEL_TRY {
+						while (currentTailIndex != stopIndex) {
+							// Must use copy constructor even if move constructor is available
+							// because we may have to revert if there's an exception.
+							// Sorry about the horrible templated next line, but it was the only way
+							// to disable moving *at compile time*, which is important because a type
+							// may only define a (noexcept) move constructor, and so calls to the
+							// cctor will not compile, even if they are in an if branch that will never
+							// be executed
+							new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst));
+							++currentTailIndex;
+							++itemFirst;
+						}
+					}
+					MOODYCAMEL_CATCH (...) {
+						// Oh dear, an exception's been thrown -- destroy the elements that
+						// were enqueued so far and revert the entire bulk operation (we'll keep
+						// any allocated blocks in our linked list for later, though).
+						auto constructedStopIndex = currentTailIndex;
+						auto lastBlockEnqueued = this->tailBlock;
+
+						pr_blockIndexFront = originalBlockIndexFront;
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+
+						if (!details::is_trivially_destructible<T>::value) {
+							auto block = startBlock;
+							if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+								block = firstAllocatedBlock;
+							}
+							currentTailIndex = startTailIndex;
+							while (true) {
+								auto stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+								if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) {
+									stopIndex = constructedStopIndex;
+								}
+								while (currentTailIndex != stopIndex) {
+									(*block)[currentTailIndex++]->~T();
+								}
+								if (block == lastBlockEnqueued) {
+									break;
+								}
+								block = block->next;
+							}
+						}
+						MOODYCAMEL_RETHROW;
+					}
+				}
+
+				if (this->tailBlock == endBlock) {
+					assert(currentTailIndex == newTailIndex);
+					break;
+				}
+				this->tailBlock = this->tailBlock->next;
+			}
+
+			if (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (nullptr) T(details::deref_noexcept(itemFirst))) && firstAllocatedBlock != nullptr) {
+				blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release);
+			}
+
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+
+		template<typename It>
+		size_t dequeue_bulk(It& itemFirst, size_t max)
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit));
+			if (details::circular_less_than<size_t>(0, desiredCount)) {
+				desiredCount = desiredCount < max ? desiredCount : max;
+				std::atomic_thread_fence(std::memory_order_acquire);
+
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);
+				assert(overcommit <= myDequeueCount);
+
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit));
+				if (details::circular_less_than<size_t>(0, actualCount)) {
+					actualCount = desiredCount < actualCount ? desiredCount : actualCount;
+					if (actualCount < desiredCount) {
+						this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release);
+					}
+
+					// Get the first index. Note that since there's guaranteed to be at least actualCount elements, this
+					// will never exceed tail.
+					auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_relaxed);
+
+					// Determine which block the first element is in
+					auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+					auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
+
+					auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+					auto firstBlockBaseIndex = firstIndex & ~static_cast<index_t>(BLOCK_SIZE - 1);
+					auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE);
+					auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1);
+
+					// Iterate the blocks and dequeue
+					auto index = firstIndex;
+					do {
+						auto firstIndexInBlock = index;
+						auto endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+						endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+						auto block = localBlockIndex->entries[indexIndex].block;
+						if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) {
+							while (index != endIndex) {
+								auto& el = *((*block)[index]);
+								*itemFirst++ = std::move(el);
+								el.~T();
+								++index;
+							}
+						}
+						else {
+							MOODYCAMEL_TRY {
+								while (index != endIndex) {
+									auto& el = *((*block)[index]);
+									*itemFirst = std::move(el);
+									++itemFirst;
+									el.~T();
+									++index;
+								}
+							}
+							MOODYCAMEL_CATCH (...) {
+								// It's too late to revert the dequeue, but we can make sure that all
+								// the dequeued objects are properly destroyed and the block index
+								// (and empty count) are properly updated before we propagate the exception
+								do {
+									block = localBlockIndex->entries[indexIndex].block;
+									while (index != endIndex) {
+										(*block)[index++]->~T();
+									}
+									block->ConcurrentQueue::Block::template set_many_empty<explicit_context>(firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock));
+									indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
+
+									firstIndexInBlock = index;
+									endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+									endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+								} while (index != firstIndex + actualCount);
+
+								MOODYCAMEL_RETHROW;
+							}
+						}
+						block->ConcurrentQueue::Block::template set_many_empty<explicit_context>(firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock));
+						indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
+					} while (index != firstIndex + actualCount);
+
+					return actualCount;
+				}
+				else {
+					// Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
+					this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);
+				}
+			}
+
+			return 0;
+		}
+
+	private:
+		struct BlockIndexEntry
+		{
+			index_t base;
+			Block* block;
+		};
+
+		struct BlockIndexHeader
+		{
+			size_t size;
+			std::atomic<size_t> front;		// Current slot (not next, like pr_blockIndexFront)
+			BlockIndexEntry* entries;
+			void* prev;
+		};
+
+
+		bool new_block_index(size_t numberOfFilledSlotsToExpose)
+		{
+			auto prevBlockSizeMask = pr_blockIndexSize - 1;
+
+			// Create the new block
+			pr_blockIndexSize <<= 1;
+			auto newRawPtr = static_cast<char*>(Traits::malloc(sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize));
+			if (newRawPtr == nullptr) {
+				pr_blockIndexSize >>= 1;		// Reset to allow graceful retry
+				return false;
+			}
+
+			auto newBlockIndexEntries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(newRawPtr + sizeof(BlockIndexHeader)));
+
+			// Copy in all the old indices, if any
+			size_t j = 0;
+			if (pr_blockIndexSlotsUsed != 0) {
+				auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask;
+				do {
+					newBlockIndexEntries[j++] = pr_blockIndexEntries[i];
+					i = (i + 1) & prevBlockSizeMask;
+				} while (i != pr_blockIndexFront);
+			}
+
+			// Update everything
+			auto header = new (newRawPtr) BlockIndexHeader;
+			header->size = pr_blockIndexSize;
+			header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed);
+			header->entries = newBlockIndexEntries;
+			header->prev = pr_blockIndexRaw;		// we link the new block to the old one so we can free it later
+
+			pr_blockIndexFront = j;
+			pr_blockIndexEntries = newBlockIndexEntries;
+			pr_blockIndexRaw = newRawPtr;
+			blockIndex.store(header, std::memory_order_release);
+
+			return true;
+		}
+
+	private:
+		std::atomic<BlockIndexHeader*> blockIndex;
+
+		// To be used by producer only -- consumer must use the ones in referenced by blockIndex
+		size_t pr_blockIndexSlotsUsed;
+		size_t pr_blockIndexSize;
+		size_t pr_blockIndexFront;		// Next slot (not current)
+		BlockIndexEntry* pr_blockIndexEntries;
+		void* pr_blockIndexRaw;
+
+#if MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+
+
+	//////////////////////////////////
+	// Implicit queue
+	//////////////////////////////////
+
+	struct ImplicitProducer : public ProducerBase
+	{
+		ImplicitProducer(ConcurrentQueue* parent) :
+			ProducerBase(parent, false),
+			nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE),
+			blockIndex(nullptr)
+		{
+			new_block_index();
+		}
+
+		~ImplicitProducer()
+		{
+			// Note that since we're in the destructor we can assume that all enqueue/dequeue operations
+			// completed already; this means that all undequeued elements are placed contiguously across
+			// contiguous blocks, and that only the first and last remaining blocks can be only partially
+			// empty (all other remaining blocks must be completely full).
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+			// Unregister ourselves for thread termination notification
+			if (!this->inactive.load(std::memory_order_relaxed)) {
+				details::ThreadExitNotifier::unsubscribe(&threadExitListener);
+			}
+#endif
+
+			// Destroy all remaining elements!
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto index = this->headIndex.load(std::memory_order_relaxed);
+			Block* block = nullptr;
+			assert(index == tail || details::circular_less_than(index, tail));
+			bool forceFreeLastBlock = index != tail;		// If we enter the loop, then the last (tail) block will not be freed
+			while (index != tail) {
+				if ((index & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 || block == nullptr) {
+					if (block != nullptr && block->dynamicallyAllocated) {
+						// Free the old block
+						this->parent->destroy(block);
+					}
+
+					block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed);
+				}
+
+				((*block)[index])->~T();
+				++index;
+			}
+			// Even if the queue is empty, there's still one block that's not on the free list
+			// (unless the head index reached the end of it, in which case the tail will be poised
+			// to create a new block).
+			if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast<index_t>(BLOCK_SIZE - 1)) != 0) && this->tailBlock->dynamicallyAllocated) {
+				this->parent->destroy(this->tailBlock);
+			}
+
+			// Destroy block index
+			auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+			if (localBlockIndex != nullptr) {
+				for (size_t i = 0; i != localBlockIndex->capacity; ++i) {
+					localBlockIndex->index[i]->~BlockIndexEntry();
+				}
+				do {
+					auto prev = localBlockIndex->prev;
+					localBlockIndex->~BlockIndexHeader();
+					Traits::free(localBlockIndex);
+					localBlockIndex = prev;
+				} while (localBlockIndex != nullptr);
+			}
+		}
+
+		template<AllocationMode allocMode, typename U>
+		inline bool enqueue(U&& element)
+		{
+			index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			index_t newTailIndex = 1 + currentTailIndex;
+			if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+				// We reached the end of a block, start a new one
+				auto head = this->headIndex.load(std::memory_order_relaxed);
+				assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+				if (!details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
+					return false;
+				}
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+				debug::DebugLock lock(mutex);
+#endif
+				// Find out where we'll be inserting this block in the block index
+				BlockIndexEntry* idxEntry;
+				if (!insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) {
+					return false;
+				}
+
+				// Get ahold of a new block
+				auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+				if (newBlock == nullptr) {
+					rewind_block_index_tail();
+					idxEntry->value.store(nullptr, std::memory_order_relaxed);
+					return false;
+				}
+#if MCDBGQ_TRACKMEM
+				newBlock->owner = this;
+#endif
+				newBlock->ConcurrentQueue::Block::template reset_empty<implicit_context>();
+
+				if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (nullptr) T(std::forward<U>(element)))) {
+					// May throw, try to insert now before we publish the fact that we have this new block
+					MOODYCAMEL_TRY {
+						new ((*newBlock)[currentTailIndex]) T(std::forward<U>(element));
+					}
+					MOODYCAMEL_CATCH (...) {
+						rewind_block_index_tail();
+						idxEntry->value.store(nullptr, std::memory_order_relaxed);
+						this->parent->add_block_to_free_list(newBlock);
+						MOODYCAMEL_RETHROW;
+					}
+				}
+
+				// Insert the new block into the index
+				idxEntry->value.store(newBlock, std::memory_order_relaxed);
+
+				this->tailBlock = newBlock;
+
+				if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (nullptr) T(std::forward<U>(element)))) {
+					this->tailIndex.store(newTailIndex, std::memory_order_release);
+					return true;
+				}
+			}
+
+			// Enqueue
+			new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+
+		template<typename U>
+		bool dequeue(U& element)
+		{
+			// See ExplicitProducer::dequeue for rationale and explanation
+			index_t tail = this->tailIndex.load(std::memory_order_relaxed);
+			index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			if (details::circular_less_than<index_t>(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
+				std::atomic_thread_fence(std::memory_order_acquire);
+
+				index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
+				assert(overcommit <= myDequeueCount);
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				if (details::likely(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
+					index_t index = this->headIndex.fetch_add(1, std::memory_order_relaxed);
+
+					// Determine which block the element is in
+					auto entry = get_block_index_entry_for_index(index);
+
+					// Dequeue
+					auto block = entry->value.load(std::memory_order_relaxed);
+					auto& el = *((*block)[index]);
+
+					if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+						// Note: Acquiring the mutex with every dequeue instead of only when a block
+						// is released is very sub-optimal, but it is, after all, purely debug code.
+						debug::DebugLock lock(producer->mutex);
+#endif
+						struct Guard {
+							Block* block;
+							index_t index;
+							BlockIndexEntry* entry;
+							ConcurrentQueue* parent;
+
+							~Guard()
+							{
+								(*block)[index]->~T();
+								if (block->ConcurrentQueue::Block::template set_empty<implicit_context>(index)) {
+									entry->value.store(nullptr, std::memory_order_relaxed);
+									parent->add_block_to_free_list(block);
+								}
+							}
+						} guard = { block, index, entry, this->parent };
+
+						element = std::move(el);
+					}
+					else {
+						element = std::move(el);
+						el.~T();
+
+						if (block->ConcurrentQueue::Block::template set_empty<implicit_context>(index)) {
+							{
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+								debug::DebugLock lock(mutex);
+#endif
+								// Add the block back into the global free pool (and remove from block index)
+								entry->value.store(nullptr, std::memory_order_relaxed);
+							}
+							this->parent->add_block_to_free_list(block);		// releases the above store
+						}
+					}
+
+					return true;
+				}
+				else {
+					this->dequeueOvercommit.fetch_add(1, std::memory_order_release);
+				}
+			}
+
+			return false;
+		}
+
+		template<AllocationMode allocMode, typename It>
+		bool enqueue_bulk(It itemFirst, size_t count)
+		{
+			// First, we need to make sure we have enough room to enqueue all of the elements;
+			// this means pre-allocating blocks and putting them in the block index (but only if
+			// all the allocations succeeded).
+
+			// Note that the tailBlock we start off with may not be owned by us any more;
+			// this happens if it was filled up exactly to the top (setting tailIndex to
+			// the first index of the next block which is not yet allocated), then dequeued
+			// completely (putting it on the free list) before we enqueue again.
+
+			index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			auto startBlock = this->tailBlock;
+			Block* firstAllocatedBlock = nullptr;
+			auto endBlock = this->tailBlock;
+
+			// Figure out how many blocks we'll need to allocate, and do so
+			size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));
+			index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+			if (blockBaseDiff > 0) {
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+				debug::DebugLock lock(mutex);
+#endif
+				do {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+
+					// Find out where we'll be inserting this block in the block index
+					BlockIndexEntry* idxEntry;
+					Block* newBlock;
+					bool indexInserted = false;
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					bool full = !details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
+					if (full || !(indexInserted = insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>()) == nullptr) {
+						// Index allocation or block allocation failed; revert any other allocations
+						// and index insertions done so far for this operation
+						if (indexInserted) {
+							rewind_block_index_tail();
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+						}
+						currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+						for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) {
+							currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+							idxEntry = get_block_index_entry_for_index(currentTailIndex);
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+							rewind_block_index_tail();
+						}
+						this->parent->add_blocks_to_free_list(firstAllocatedBlock);
+						this->tailBlock = startBlock;
+
+						return false;
+					}
+
+#if MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template reset_empty<implicit_context>();
+					newBlock->next = nullptr;
+
+					// Insert the new block into the index
+					idxEntry->value.store(newBlock, std::memory_order_relaxed);
+
+					// Store the chain of blocks so that we can undo if later allocations fail,
+					// and so that we can find the blocks when we do the actual enqueueing
+					if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) {
+						assert(this->tailBlock != nullptr);
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					endBlock = newBlock;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock;
+				} while (blockBaseDiff > 0);
+			}
+
+			// Enqueue, one block at a time
+			index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
+			currentTailIndex = startTailIndex;
+			this->tailBlock = startBlock;
+			assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0);
+			if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) {
+				this->tailBlock = firstAllocatedBlock;
+			}
+			while (true) {
+				auto stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+				if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
+					stopIndex = newTailIndex;
+				}
+				if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (nullptr) T(details::deref_noexcept(itemFirst)))) {
+					while (currentTailIndex != stopIndex) {
+						new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
+					}
+				}
+				else {
+					MOODYCAMEL_TRY {
+						while (currentTailIndex != stopIndex) {
+							new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst));
+							++currentTailIndex;
+							++itemFirst;
+						}
+					}
+					MOODYCAMEL_CATCH (...) {
+						auto constructedStopIndex = currentTailIndex;
+						auto lastBlockEnqueued = this->tailBlock;
+
+						if (!details::is_trivially_destructible<T>::value) {
+							auto block = startBlock;
+							if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+								block = firstAllocatedBlock;
+							}
+							currentTailIndex = startTailIndex;
+							while (true) {
+								auto stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+								if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) {
+									stopIndex = constructedStopIndex;
+								}
+								while (currentTailIndex != stopIndex) {
+									(*block)[currentTailIndex++]->~T();
+								}
+								if (block == lastBlockEnqueued) {
+									break;
+								}
+								block = block->next;
+							}
+						}
+
+						currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+						for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) {
+							currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+							auto idxEntry = get_block_index_entry_for_index(currentTailIndex);
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+							rewind_block_index_tail();
+						}
+						this->parent->add_blocks_to_free_list(firstAllocatedBlock);
+						this->tailBlock = startBlock;
+						MOODYCAMEL_RETHROW;
+					}
+				}
+
+				if (this->tailBlock == endBlock) {
+					assert(currentTailIndex == newTailIndex);
+					break;
+				}
+				this->tailBlock = this->tailBlock->next;
+			}
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+
+		template<typename It>
+		size_t dequeue_bulk(It& itemFirst, size_t max)
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit));
+			if (details::circular_less_than<size_t>(0, desiredCount)) {
+				desiredCount = desiredCount < max ? desiredCount : max;
+				std::atomic_thread_fence(std::memory_order_acquire);
+
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);
+				assert(overcommit <= myDequeueCount);
+
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit));
+				if (details::circular_less_than<size_t>(0, actualCount)) {
+					actualCount = desiredCount < actualCount ? desiredCount : actualCount;
+					if (actualCount < desiredCount) {
+						this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release);
+					}
+
+					// Get the first index. Note that since there's guaranteed to be at least actualCount elements, this
+					// will never exceed tail.
+					auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_relaxed);
+
+					// Iterate the blocks and dequeue
+					auto index = firstIndex;
+					BlockIndexHeader* localBlockIndex;
+					auto indexIndex = get_block_index_index_for_index(index, localBlockIndex);
+					do {
+						auto blockStartIndex = index;
+						auto endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+						endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+
+						auto entry = localBlockIndex->index[indexIndex];
+						auto block = entry->value.load(std::memory_order_relaxed);
+						if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) {
+							while (index != endIndex) {
+								auto& el = *((*block)[index]);
+								*itemFirst++ = std::move(el);
+								el.~T();
+								++index;
+							}
+						}
+						else {
+							MOODYCAMEL_TRY {
+								while (index != endIndex) {
+									auto& el = *((*block)[index]);
+									*itemFirst = std::move(el);
+									++itemFirst;
+									el.~T();
+									++index;
+								}
+							}
+							MOODYCAMEL_CATCH (...) {
+								do {
+									entry = localBlockIndex->index[indexIndex];
+									block = entry->value.load(std::memory_order_relaxed);
+									while (index != endIndex) {
+										(*block)[index++]->~T();
+									}
+
+									if (block->ConcurrentQueue::Block::template set_many_empty<implicit_context>(blockStartIndex, static_cast<size_t>(endIndex - blockStartIndex))) {
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+										debug::DebugLock lock(mutex);
+#endif
+										entry->value.store(nullptr, std::memory_order_relaxed);
+										this->parent->add_block_to_free_list(block);
+									}
+									indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
+
+									blockStartIndex = index;
+									endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+									endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+								} while (index != firstIndex + actualCount);
+
+								MOODYCAMEL_RETHROW;
+							}
+						}
+						if (block->ConcurrentQueue::Block::template set_many_empty<implicit_context>(blockStartIndex, static_cast<size_t>(endIndex - blockStartIndex))) {
+							{
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+								debug::DebugLock lock(mutex);
+#endif
+								// Note that the set_many_empty above did a release, meaning that anybody who acquires the block
+								// we're about to free can use it safely since our writes (and reads!) will have happened-before then.
+								entry->value.store(nullptr, std::memory_order_relaxed);
+							}
+							this->parent->add_block_to_free_list(block);		// releases the above store
+						}
+						indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
+					} while (index != firstIndex + actualCount);
+
+					return actualCount;
+				}
+				else {
+					this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);
+				}
+			}
+
+			return 0;
+		}
+
+	private:
+		// The block size must be > 1, so any number with the low bit set is an invalid block base index
+		static const index_t INVALID_BLOCK_BASE = 1;
+
+		struct BlockIndexEntry
+		{
+			std::atomic<index_t> key;
+			std::atomic<Block*> value;
+		};
+
+		struct BlockIndexHeader
+		{
+			size_t capacity;
+			std::atomic<size_t> tail;
+			BlockIndexEntry* entries;
+			BlockIndexEntry** index;
+			BlockIndexHeader* prev;
+		};
+
+		template<AllocationMode allocMode>
+		inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex)
+		{
+			auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);		// We're the only writer thread, relaxed is OK
+			auto newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1);
+			idxEntry = localBlockIndex->index[newTail];
+			if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE ||
+				idxEntry->value.load(std::memory_order_relaxed) == nullptr) {
+
+				idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+				localBlockIndex->tail.store(newTail, std::memory_order_release);
+				return true;
+			}
+
+			// No room in the old block index, try to allocate another one!
+			if (allocMode == CannotAlloc || !new_block_index()) {
+				return false;
+			}
+			localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+			newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1);
+			idxEntry = localBlockIndex->index[newTail];
+			assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE);
+			idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+			localBlockIndex->tail.store(newTail, std::memory_order_release);
+			return true;
+		}
+
+		inline void rewind_block_index_tail()
+		{
+			auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+			localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed);
+		}
+
+		inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const
+		{
+			BlockIndexHeader* localBlockIndex;
+			auto idx = get_block_index_index_for_index(index, localBlockIndex);
+			return localBlockIndex->index[idx];
+		}
+
+		inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const
+		{
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+			debug::DebugLock lock(mutex);
+#endif
+			index &= ~static_cast<index_t>(BLOCK_SIZE - 1);
+			localBlockIndex = blockIndex.load(std::memory_order_acquire);
+			auto tail = localBlockIndex->tail.load(std::memory_order_acquire);
+			auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed);
+			assert(tailBase != INVALID_BLOCK_BASE);
+			// Note: Must use division instead of shift because the index may wrap around, causing a negative
+			// offset, whose negativity we want to preserve
+			auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(index - tailBase) / BLOCK_SIZE);
+			size_t idx = (tail + offset) & (localBlockIndex->capacity - 1);
+			assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr);
+			return idx;
+		}
+
+		bool new_block_index()
+		{
+			auto prev = blockIndex.load(std::memory_order_relaxed);
+			size_t prevCapacity = prev == nullptr ? 0 : prev->capacity;
+			auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity;
+			auto raw = static_cast<char*>(Traits::malloc(
+				sizeof(BlockIndexHeader) +
+				std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * entryCount +
+				std::alignment_of<BlockIndexEntry*>::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity));
+			if (raw == nullptr) {
+				return false;
+			}
+
+			auto header = new (raw) BlockIndexHeader;
+			auto entries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(raw + sizeof(BlockIndexHeader)));
+			auto index = reinterpret_cast<BlockIndexEntry**>(details::align_for<BlockIndexEntry*>(reinterpret_cast<char*>(entries) + sizeof(BlockIndexEntry) * entryCount));
+			if (prev != nullptr) {
+				auto prevTail = prev->tail.load(std::memory_order_relaxed);
+				auto prevPos = prevTail;
+				size_t i = 0;
+				do {
+					prevPos = (prevPos + 1) & (prev->capacity - 1);
+					index[i++] = prev->index[prevPos];
+				} while (prevPos != prevTail);
+				assert(i == prevCapacity);
+			}
+			for (size_t i = 0; i != entryCount; ++i) {
+				new (entries + i) BlockIndexEntry;
+				entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed);
+				index[prevCapacity + i] = entries + i;
+			}
+			header->prev = prev;
+			header->entries = entries;
+			header->index = index;
+			header->capacity = nextBlockIndexCapacity;
+			header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed);
+
+			blockIndex.store(header, std::memory_order_release);
+
+			nextBlockIndexCapacity <<= 1;
+
+			return true;
+		}
+
+	private:
+		size_t nextBlockIndexCapacity;
+		std::atomic<BlockIndexHeader*> blockIndex;
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+	public:
+		details::ThreadExitListener threadExitListener;
+	private:
+#endif
+
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+		mutable debug::DebugMutex mutex;
+#endif
+#if MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+
+
+	//////////////////////////////////
+	// Block pool manipulation
+	//////////////////////////////////
+
+	void populate_initial_block_list(size_t blockCount)
+	{
+		initialBlockPoolSize = blockCount;
+		if (initialBlockPoolSize == 0) {
+			initialBlockPool = nullptr;
+			return;
+		}
+
+		initialBlockPool = create_array<Block>(blockCount);
+		if (initialBlockPool == nullptr) {
+			initialBlockPoolSize = 0;
+		}
+		for (size_t i = 0; i < initialBlockPoolSize; ++i) {
+			initialBlockPool[i].dynamicallyAllocated = false;
+		}
+	}
+
+	inline Block* try_get_block_from_initial_pool()
+	{
+		if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) {
+			return nullptr;
+		}
+
+		auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed);
+
+		return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr;
+	}
+
+	inline void add_block_to_free_list(Block* block)
+	{
+#if MCDBGQ_TRACKMEM
+		block->owner = nullptr;
+#endif
+		freeList.add(block);
+	}
+
+	inline void add_blocks_to_free_list(Block* block)
+	{
+		while (block != nullptr) {
+			auto next = block->next;
+			add_block_to_free_list(block);
+			block = next;
+		}
+	}
+
+	inline Block* try_get_block_from_free_list()
+	{
+		return freeList.try_get();
+	}
+
+	// Gets a free block from one of the memory pools, or allocates a new one (if applicable)
+	template<AllocationMode canAlloc>
+	Block* requisition_block()
+	{
+		auto block = try_get_block_from_initial_pool();
+		if (block != nullptr) {
+			return block;
+		}
+
+		block = try_get_block_from_free_list();
+		if (block != nullptr) {
+			return block;
+		}
+
+		if (canAlloc == CanAlloc) {
+			return create<Block>();
+		}
+
+		return nullptr;
+	}
+
+
+#if MCDBGQ_TRACKMEM
+	public:
+		struct MemStats {
+			size_t allocatedBlocks;
+			size_t usedBlocks;
+			size_t freeBlocks;
+			size_t ownedBlocksExplicit;
+			size_t ownedBlocksImplicit;
+			size_t implicitProducers;
+			size_t explicitProducers;
+			size_t elementsEnqueued;
+			size_t blockClassBytes;
+			size_t queueClassBytes;
+			size_t implicitBlockIndexBytes;
+			size_t explicitBlockIndexBytes;
+
+			friend class ConcurrentQueue;
+
+		private:
+			static MemStats getFor(ConcurrentQueue* q)
+			{
+				MemStats stats = { 0 };
+
+				stats.elementsEnqueued = q->size_approx();
+
+				auto block = q->freeList.head_unsafe();
+				while (block != nullptr) {
+					++stats.allocatedBlocks;
+					++stats.freeBlocks;
+					block = block->freeListNext.load(std::memory_order_relaxed);
+				}
+
+				for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+					bool implicit = dynamic_cast<ImplicitProducer*>(ptr) != nullptr;
+					stats.implicitProducers += implicit ? 1 : 0;
+					stats.explicitProducers += implicit ? 0 : 1;
+
+					if (implicit) {
+						auto prod = static_cast<ImplicitProducer*>(ptr);
+						stats.queueClassBytes += sizeof(ImplicitProducer);
+						auto head = prod->headIndex.load(std::memory_order_relaxed);
+						auto tail = prod->tailIndex.load(std::memory_order_relaxed);
+						auto hash = prod->blockIndex.load(std::memory_order_relaxed);
+						if (hash != nullptr) {
+							for (size_t i = 0; i != hash->capacity; ++i) {
+								if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) {
+									++stats.allocatedBlocks;
+									++stats.ownedBlocksImplicit;
+								}
+							}
+							stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry);
+							for (; hash != nullptr; hash = hash->prev) {
+								stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*);
+							}
+						}
+						for (; details::circular_less_than<index_t>(head, tail); head += BLOCK_SIZE) {
+							//auto block = prod->get_block_index_entry_for_index(head);
+							++stats.usedBlocks;
+						}
+					}
+					else {
+						auto prod = static_cast<ExplicitProducer*>(ptr);
+						stats.queueClassBytes += sizeof(ExplicitProducer);
+						auto tailBlock = prod->tailBlock;
+						bool wasNonEmpty = false;
+						if (tailBlock != nullptr) {
+							auto block = tailBlock;
+							do {
+								++stats.allocatedBlocks;
+								if (!block->ConcurrentQueue::Block::template is_empty<explicit_context>() || wasNonEmpty) {
+									++stats.usedBlocks;
+									wasNonEmpty = wasNonEmpty || block != tailBlock;
+								}
+								++stats.ownedBlocksExplicit;
+								block = block->next;
+							} while (block != tailBlock);
+						}
+						auto index = prod->blockIndex.load(std::memory_order_relaxed);
+						while (index != nullptr) {
+							stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry);
+							index = static_cast<typename ExplicitProducer::BlockIndexHeader*>(index->prev);
+						}
+					}
+				}
+
+				auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed);
+				stats.allocatedBlocks += freeOnInitialPool;
+				stats.freeBlocks += freeOnInitialPool;
+
+				stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks;
+				stats.queueClassBytes += sizeof(ConcurrentQueue);
+
+				return stats;
+			}
+		};
+
+		// For debugging only. Not thread-safe.
+		MemStats getMemStats()
+		{
+			return MemStats::getFor(this);
+		}
+	private:
+		friend struct MemStats;
+#endif
+
+
+	//////////////////////////////////
+	// Producer list manipulation
+	//////////////////////////////////
+
+	ProducerBase* recycle_or_create_producer(bool isExplicit)
+	{
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+		// Try to re-use one first
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+			if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) {
+				bool expected = true;
+				if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) {
+					// We caught one! It's been marked as activated, the caller can have it
+					return ptr;
+				}
+			}
+		}
+
+		return add_producer(isExplicit ? static_cast<ProducerBase*>(create<ExplicitProducer>(this)) : create<ImplicitProducer>(this));
+	}
+
+	ProducerBase* add_producer(ProducerBase* producer)
+	{
+		// Handle failed memory allocation
+		if (producer == nullptr) {
+			return nullptr;
+		}
+
+		producerCount.fetch_add(1, std::memory_order_relaxed);
+
+		// Add it to the lock-free list
+		auto prevTail = producerListTail.load(std::memory_order_relaxed);
+		do {
+			producer->next = prevTail;
+		} while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed));
+
+		return producer;
+	}
+
+	void reown_producers()
+	{
+		// After another instance is moved-into/swapped-with this one, all the
+		// producers we stole still think their parents are the other queue.
+		// So fix them up!
+		for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) {
+			ptr->parent = this;
+		}
+	}
+
+
+	//////////////////////////////////
+	// Implicit producer hash
+	//////////////////////////////////
+
+	struct ImplicitProducerKVP
+	{
+		std::atomic<details::thread_id_t> key;
+		ImplicitProducer* value;		// No need for atomicity since it's only read by the thread that sets it in the first place
+
+		ImplicitProducerKVP() { }
+
+		ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+		{
+			key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed);
+			value = other.value;
+		}
+
+		inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+		{
+			swap(other);
+			return *this;
+		}
+
+		inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT
+		{
+			if (this != &other) {
+				details::swap_relaxed(key, other.key);
+				std::swap(value, other.value);
+			}
+		}
+	};
+
+	template<typename XT, typename XTraits>
+	friend void moodycamel::swap(typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&, typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT;
+
+	struct ImplicitProducerHash
+	{
+		size_t capacity;
+		ImplicitProducerKVP* entries;
+		ImplicitProducerHash* prev;
+	};
+
+	inline void populate_initial_implicit_producer_hash()
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return;
+
+		implicitProducerHashCount.store(0, std::memory_order_relaxed);
+		auto hash = &initialImplicitProducerHash;
+		hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
+		hash->entries = &initialImplicitProducerHashEntries[0];
+		for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) {
+			initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed);
+		}
+		hash->prev = nullptr;
+		implicitProducerHash.store(hash, std::memory_order_relaxed);
+	}
+
+	void swap_implicit_producer_hashes(ConcurrentQueue& other)
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return;
+
+		// Swap (assumes our implicit producer hash is initialized)
+		initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries);
+		initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0];
+		other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0];
+
+		details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount);
+
+		details::swap_relaxed(implicitProducerHash, other.implicitProducerHash);
+		if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) {
+			implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed);
+		}
+		else {
+			ImplicitProducerHash* hash;
+			for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) {
+				continue;
+			}
+			hash->prev = &initialImplicitProducerHash;
+		}
+		if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) {
+			other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed);
+		}
+		else {
+			ImplicitProducerHash* hash;
+			for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) {
+				continue;
+			}
+			hash->prev = &other.initialImplicitProducerHash;
+		}
+	}
+
+	// Only fails (returns nullptr) if memory allocation fails
+	ImplicitProducer* get_or_add_implicit_producer()
+	{
+		// Note that since the data is essentially thread-local (key is thread ID),
+		// there's a reduced need for fences (memory ordering is already consistent
+		// for any individual thread), except for the current table itself.
+
+		// Start by looking for the thread ID in the current and all previous hash tables.
+		// If it's not found, it must not be in there yet, since this same thread would
+		// have added it previously to one of the tables that we traversed.
+
+		// Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table
+
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+
+		auto id = details::thread_id();
+		auto hashedId = details::hash_thread_id(id);
+
+		auto mainHash = implicitProducerHash.load(std::memory_order_acquire);
+		for (auto hash = mainHash; hash != nullptr; hash = hash->prev) {
+			// Look for the id in this hash
+			auto index = hashedId;
+			while (true) {		// Not an infinite loop because at least one slot is free in the hash table
+				index &= hash->capacity - 1;
+
+				auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed);
+				if (probedKey == id) {
+					// Found it! If we had to search several hashes deep, though, we should lazily add it
+					// to the current main hash table to avoid the extended search next time.
+					// Note there's guaranteed to be room in the current hash table since every subsequent
+					// table implicitly reserves space for all previous tables (there's only one
+					// implicitProducerHashCount).
+					auto value = hash->entries[index].value;
+					if (hash != mainHash) {
+						index = hashedId;
+						while (true) {
+							index &= mainHash->capacity - 1;
+							probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed);
+							auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+							auto reusable = details::invalid_thread_id2;
+							if ((probedKey == empty    && mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_relaxed)) ||
+								(probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire))) {
+#else
+							if ((probedKey == empty    && mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_relaxed))) {
+#endif
+								mainHash->entries[index].value = value;
+								break;
+							}
+							++index;
+						}
+					}
+
+					return value;
+				}
+				if (probedKey == details::invalid_thread_id) {
+					break;		// Not in this hash table
+				}
+				++index;
+			}
+		}
+
+		// Insert!
+		auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed);
+		while (true) {
+			if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) {
+				// We've acquired the resize lock, try to allocate a bigger hash table.
+				// Note the acquire fence synchronizes with the release fence at the end of this block, and hence when
+				// we reload implicitProducerHash it must be the most recent version (it only gets changed within this
+				// locked block).
+				mainHash = implicitProducerHash.load(std::memory_order_acquire);
+				auto newCapacity = mainHash->capacity << 1;
+				while (newCount >= (newCapacity >> 1)) {
+					newCapacity <<= 1;
+				}
+				auto raw = static_cast<char*>(Traits::malloc(sizeof(ImplicitProducerHash) + std::alignment_of<ImplicitProducerKVP>::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity));
+				if (raw == nullptr) {
+					// Allocation failed
+					implicitProducerHashCount.fetch_add(-1, std::memory_order_relaxed);
+					implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+					return nullptr;
+				}
+
+				auto newHash = new (raw) ImplicitProducerHash;
+				newHash->capacity = newCapacity;
+				newHash->entries = reinterpret_cast<ImplicitProducerKVP*>(details::align_for<ImplicitProducerKVP>(raw + sizeof(ImplicitProducerHash)));
+				for (size_t i = 0; i != newCapacity; ++i) {
+					new (newHash->entries + i) ImplicitProducerKVP;
+					newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed);
+				}
+				newHash->prev = mainHash;
+				implicitProducerHash.store(newHash, std::memory_order_release);
+				implicitProducerHashResizeInProgress.clear(std::memory_order_release);
+				mainHash = newHash;
+			}
+
+			// If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table
+			// to finish being allocated by another thread (and if we just finished allocating above, the condition will
+			// always be true)
+			if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) {
+				auto producer = static_cast<ImplicitProducer*>(recycle_or_create_producer(false));
+				if (producer == nullptr) {
+					return nullptr;
+				}
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+				producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback;
+				producer->threadExitListener.userData = producer;
+				details::ThreadExitNotifier::subscribe(&producer->threadExitListener);
+#endif
+
+				auto index = hashedId;
+				while (true) {
+					index &= mainHash->capacity - 1;
+					auto probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed);
+					auto empty = details::invalid_thread_id;
+					if (probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed)) {
+						mainHash->entries[index].value = producer;
+						break;
+					}
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+					auto reusable = details::invalid_thread_id2;
+					if (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire)) {
+						implicitProducerHashCount.fetch_add(-1, std::memory_order_relaxed);
+						mainHash->entries[index].value = producer;
+						break;
+					}
+#endif
+					++index;
+				}
+				return producer;
+			}
+
+			// Hmm, the old hash is quite full and somebody else is busy allocating a new one.
+			// We need to wait for the allocating thread to finish (if it succeeds, we add, if not,
+			// we try to allocate ourselves).
+			mainHash = implicitProducerHash.load(std::memory_order_acquire);
+		}
+	}
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+	void implicit_producer_thread_exited(ImplicitProducer* producer)
+	{
+		// Remove from thread exit listeners
+		details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener);
+
+		// Remove from hash
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+		// Only need to remove from main table (as that's the only one that new inserts could go into)
+		auto hash = implicitProducerHash.load(std::memory_order_acquire);
+		assert(hash != nullptr);		// The thread exit listener is only registered if we were added to a hash in the first place
+		auto id = details::thread_id();
+		auto index = details::hash_thread_id(id);
+		details::thread_id_t probedKey;
+		do {
+			index &= hash->capacity - 1;
+			probedKey = hash->entries[index].key.load(std::memory_order_relaxed);
+			if (probedKey == id) {
+				hash->entries[index].key.store(details::invalid_thread_id2, std::memory_order_release);
+				break;
+			}
+			++index;
+		} while (probedKey != details::invalid_thread_id);		// Can happen if the hash has changed but we weren't put back in it yet
+
+		// Mark the queue as being recyclable
+		producer->inactive.store(true, std::memory_order_release);
+	}
+
+	static void implicit_producer_thread_exited_callback(void* userData)
+	{
+		auto producer = static_cast<ImplicitProducer*>(userData);
+		auto queue = producer->parent;
+		queue->implicit_producer_thread_exited(producer);
+	}
+#endif
+
+	//////////////////////////////////
+	// Utility functions
+	//////////////////////////////////
+
+	template<typename U>
+	static inline U* create_array(size_t count)
+	{
+		assert(count > 0);
+		auto p = static_cast<U*>(Traits::malloc(sizeof(U) * count));
+		if (p == nullptr) {
+			return nullptr;
+		}
+
+		for (size_t i = 0; i != count; ++i) {
+			new (p + i) U();
+		}
+		return p;
+	}
+
+	template<typename U>
+	static inline void destroy_array(U* p, size_t count)
+	{
+		if (p != nullptr) {
+			assert(count > 0);
+			for (size_t i = count; i != 0; ) {
+				(p + --i)->~U();
+			}
+			Traits::free(p);
+		}
+	}
+
+	template<typename U>
+	static inline U* create()
+	{
+		auto p = Traits::malloc(sizeof(U));
+		return p != nullptr ? new (p) U : nullptr;
+	}
+
+	template<typename U, typename A1>
+	static inline U* create(A1&& a1)
+	{
+		auto p = Traits::malloc(sizeof(U));
+		return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr;
+	}
+
+	template<typename U>
+	static inline void destroy(U* p)
+	{
+		if (p != nullptr) {
+			p->~U();
+		}
+		Traits::free(p);
+	}
+
+private:
+	std::atomic<ProducerBase*> producerListTail;
+	std::atomic<std::uint32_t> producerCount;
+
+	std::atomic<size_t> initialBlockPoolIndex;
+	Block* initialBlockPool;
+	size_t initialBlockPoolSize;
+
+#if !MCDBGQ_USEDEBUGFREELIST
+	FreeList<Block> freeList;
+#else
+	debug::DebugFreeList<Block> freeList;
+#endif
+
+	std::atomic<ImplicitProducerHash*> implicitProducerHash;
+	std::atomic<size_t> implicitProducerHashCount;		// Number of slots logically used
+	ImplicitProducerHash initialImplicitProducerHash;
+	std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries;
+	std::atomic_flag implicitProducerHashResizeInProgress;
+
+	std::atomic<std::uint32_t> nextExplicitConsumerId;
+	std::atomic<std::uint32_t> globalExplicitConsumerOffset;
+
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+	debug::DebugMutex implicitProdMutex;
+#endif
+};
+
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
+	: producer(queue.recycle_or_create_producer(true))
+{
+	if (producer != nullptr) {
+		producer->token = this;
+	}
+}
+
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
+	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+	initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+	lastKnownGlobalOffset = -1;
+}
+
+template<typename T, typename Traits>
+inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+}
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
diff --git a/include/count_main_cmdline.hpp b/include/count_main_cmdline.hpp
new file mode 100644
index 0000000..004c886
--- /dev/null
+++ b/include/count_main_cmdline.hpp
@@ -0,0 +1,711 @@
+/***** This code was generated by Yaggo. Do not edit ******/
+
+/*  This file is part of Jellyfish.
+
+    Jellyfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Jellyfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Jellyfish.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __COUNT_MAIN_CMDLINE_HPP__
+#define __COUNT_MAIN_CMDLINE_HPP__
+
+#include <stdint.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <errno.h>
+#include <string.h>
+#include <stdexcept>
+#include <string>
+#include <limits>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include <memory>
+
+class count_main_cmdline {
+ // Boiler plate stuff. Conversion from string to other formats
+  static bool adjust_double_si_suffix(double &res, const char *suffix) {
+    if(*suffix == '\0')
+      return true;
+    if(*(suffix + 1) != '\0')
+      return false;
+
+    switch(*suffix) {
+    case 'a': res *= 1e-18; break;
+    case 'f': res *= 1e-15; break;
+    case 'p': res *= 1e-12; break;
+    case 'n': res *= 1e-9;  break;
+    case 'u': res *= 1e-6;  break;
+    case 'm': res *= 1e-3;  break;
+    case 'k': res *= 1e3;   break;
+    case 'M': res *= 1e6;   break;
+    case 'G': res *= 1e9;   break;
+    case 'T': res *= 1e12;  break;
+    case 'P': res *= 1e15;  break;
+    case 'E': res *= 1e18;  break;
+    default: return false;
+    }
+    return true;
+  }
+
+  static double conv_double(const char *str, ::std::string &err, bool si_suffix) {
+    char *endptr = 0;
+    errno = 0;
+    double res = strtod(str, &endptr);
+    if(errno) {
+      err.assign(strerror(errno));
+      return (double)0.0;
+    }
+    bool invalid =
+      si_suffix ? !adjust_double_si_suffix(res, endptr) : *endptr != '\0';
+    if(invalid) {
+      err.assign("Invalid character");
+      return (double)0.0;
+    }
+    return res;
+  }
+
+  static int conv_enum(const char* str, ::std::string& err, const char* const strs[]) {
+    int res = 0;
+    for(const char* const* cstr = strs; *cstr; ++cstr, ++res)
+      if(!strcmp(*cstr, str))
+        return res;
+    err += "Invalid constant '";
+    err += str;
+    err += "'. Expected one of { ";
+    for(const char* const* cstr = strs; *cstr; ++cstr) {
+      if(cstr != strs)
+        err += ", ";
+      err += *cstr;
+    }
+    err += " }";
+    return -1;
+  }
+
+  template<typename T>
+  static bool adjust_int_si_suffix(T &res, const char *suffix) {
+    if(*suffix == '\0')
+      return true;
+    if(*(suffix + 1) != '\0')
+      return false;
+
+    switch(*suffix) {
+    case 'k': res *= (T)1000; break;
+    case 'M': res *= (T)1000000; break;
+    case 'G': res *= (T)1000000000; break;
+    case 'T': res *= (T)1000000000000; break;
+    case 'P': res *= (T)1000000000000000; break;
+    case 'E': res *= (T)1000000000000000000; break;
+    default: return false;
+    }
+    return true;
+  }
+
+  template<typename T>
+  static T conv_int(const char *str, ::std::string &err, bool si_suffix) {
+    char *endptr = 0;
+    errno = 0;
+    long long int res = strtoll(str, &endptr, 0);
+    if(errno) {
+      err.assign(strerror(errno));
+      return (T)0;
+    }
+    bool invalid =
+      si_suffix ? !adjust_int_si_suffix(res, endptr) : *endptr != '\0';
+    if(invalid) {
+      err.assign("Invalid character");
+      return (T)0;
+    }
+    if(res > ::std::numeric_limits<T>::max() ||
+       res < ::std::numeric_limits<T>::min()) {
+      err.assign("Value out of range");
+      return (T)0;
+    }
+    return (T)res;
+  }
+
+  template<typename T>
+  static T conv_uint(const char *str, ::std::string &err, bool si_suffix) {
+    char *endptr = 0;
+    errno = 0;
+    while(isspace(*str)) { ++str; }
+    if(*str == '-') {
+      err.assign("Negative value");
+      return (T)0;
+    }
+    unsigned long long int res = strtoull(str, &endptr, 0);
+    if(errno) {
+      err.assign(strerror(errno));
+      return (T)0;
+    }
+    bool invalid =
+      si_suffix ? !adjust_int_si_suffix(res, endptr) : *endptr != '\0';
+    if(invalid) {
+      err.assign("Invalid character");
+      return (T)0;
+    }
+    if(res > ::std::numeric_limits<T>::max()) {
+      err.assign("Value out of range");
+      return (T)0;
+    }
+    return (T)res;
+  }
+
+  template<typename T>
+  static ::std::string vec_str(const std::vector<T> &vec) {
+    ::std::ostringstream os;
+    for(typename ::std::vector<T>::const_iterator it = vec.begin();
+        it != vec.end(); ++it) {
+      if(it != vec.begin())
+        os << ",";
+      os << *it;
+    }
+    return os.str();
+  }
+
+  class string : public ::std::string {
+  public:
+    string() : ::std::string() {}
+    explicit string(const ::std::string &s) : std::string(s) {}
+    explicit string(const char *s) : ::std::string(s) {}
+    int as_enum(const char* const strs[]) {
+      ::std::string err;
+      int res = conv_enum((const char*)this->c_str(), err, strs);
+      if(!err.empty())
+        throw ::std::runtime_error(err);
+      return res;
+    }
+
+
+    uint32_t as_uint32_suffix() const { return as_uint32(true); }
+    uint32_t as_uint32(bool si_suffix = false) const {
+      ::std::string err;
+      uint32_t res = conv_uint<uint32_t>((const char*)this->c_str(), err, si_suffix);
+      if(!err.empty()) {
+        ::std::string msg("Invalid conversion of '");
+        msg += *this;
+        msg += "' to uint32_t: ";
+        msg += err;
+        throw ::std::runtime_error(msg);
+      }
+      return res;
+    }
+    uint64_t as_uint64_suffix() const { return as_uint64(true); }
+    uint64_t as_uint64(bool si_suffix = false) const {
+      ::std::string err;
+      uint64_t res = conv_uint<uint64_t>((const char*)this->c_str(), err, si_suffix);
+      if(!err.empty()) {
+        ::std::string msg("Invalid conversion of '");
+        msg += *this;
+        msg += "' to uint64_t: ";
+        msg += err;
+        throw ::std::runtime_error(msg);
+      }
+      return res;
+    }
+    int32_t as_int32_suffix() const { return as_int32(true); }
+    int32_t as_int32(bool si_suffix = false) const {
+      ::std::string err;
+      int32_t res = conv_int<int32_t>((const char*)this->c_str(), err, si_suffix);
+      if(!err.empty()) {
+        ::std::string msg("Invalid conversion of '");
+        msg += *this;
+        msg += "' to int32_t: ";
+        msg += err;
+        throw ::std::runtime_error(msg);
+      }
+      return res;
+    }
+    int64_t as_int64_suffix() const { return as_int64(true); }
+    int64_t as_int64(bool si_suffix = false) const {
+      ::std::string err;
+      int64_t res = conv_int<int64_t>((const char*)this->c_str(), err, si_suffix);
+      if(!err.empty()) {
+        ::std::string msg("Invalid conversion of '");
+        msg += *this;
+        msg += "' to int64_t: ";
+        msg += err;
+        throw ::std::runtime_error(msg);
+      }
+      return res;
+    }
+    int as_int_suffix() const { return as_int(true); }
+    int as_int(bool si_suffix = false) const {
+      ::std::string err;
+      int res = conv_int<int>((const char*)this->c_str(), err, si_suffix);
+      if(!err.empty()) {
+        ::std::string msg("Invalid conversion of '");
+        msg += *this;
+        msg += "' to int_t: ";
+        msg += err;
+        throw ::std::runtime_error(msg);
+      }
+      return res;
+    }
+    long as_long_suffix() const { return as_long(true); }
+    long as_long(bool si_suffix = false) const {
+      ::std::string err;
+      long res = conv_int<long>((const char*)this->c_str(), err, si_suffix);
+      if(!err.empty()) {
+        ::std::string msg("Invalid conversion of '");
+        msg += *this;
+        msg += "' to long_t: ";
+        msg += err;
+        throw ::std::runtime_error(msg);
+      }
+      return res;
+    }
+    double as_double_suffix() const { return as_double(true); }
+    double as_double(bool si_suffix = false) const {
+      ::std::string err;
+      double res = conv_double((const char*)this->c_str(), err, si_suffix);
+      if(!err.empty()) {
+        ::std::string msg("Invalid conversion of '");
+        msg += *this;
+        msg += "' to double_t: ";
+        msg += err;
+        throw ::std::runtime_error(msg);
+      }
+      return res;
+    }
+  };
+
+public:
+  uint32_t                       mer_len_arg;
+  bool                           mer_len_given;
+  uint64_t                       size_arg;
+  bool                           size_given;
+  uint32_t                       threads_arg;
+  bool                           threads_given;
+  uint32_t                       Files_arg;
+  bool                           Files_given;
+  const char *                   generator_arg;
+  bool                           generator_given;
+  uint32_t                       Generators_arg;
+  bool                           Generators_given;
+  const char *                   shell_arg;
+  bool                           shell_given;
+  const char *                   output_arg;
+  bool                           output_given;
+  uint32_t                       counter_len_arg;
+  bool                           counter_len_given;
+  uint32_t                       out_counter_len_arg;
+  bool                           out_counter_len_given;
+  bool                           canonical_flag;
+  const char *                   bc_arg;
+  bool                           bc_given;
+  uint64_t                       bf_size_arg;
+  bool                           bf_size_given;
+  double                         bf_fp_arg;
+  bool                           bf_fp_given;
+  ::std::vector<const char *>    if_arg;
+  typedef ::std::vector<const char *>::iterator if_arg_it;
+  typedef ::std::vector<const char *>::const_iterator if_arg_const_it;
+  bool                           if_given;
+  string                         min_qual_char_arg;
+  bool                           min_qual_char_given;
+  uint32_t                       reprobes_arg;
+  bool                           reprobes_given;
+  bool                           text_flag;
+  bool                           disk_flag;
+  bool                           no_merge_flag;
+  bool                           no_unlink_flag;
+  uint64_t                       lower_count_arg;
+  bool                           lower_count_given;
+  uint64_t                       upper_count_arg;
+  bool                           upper_count_given;
+  const char *                   timing_arg;
+  bool                           timing_given;
+  bool                           no_write_flag;
+  ::std::vector<const char *>    file_arg;
+  typedef ::std::vector<const char *>::iterator file_arg_it;
+  typedef ::std::vector<const char *>::const_iterator file_arg_const_it;
+
+  enum {
+    START_OPT = 1000,
+    FULL_HELP_OPT,
+    USAGE_OPT,
+    OUT_COUNTER_LEN_OPT,
+    BC_OPT,
+    BF_SIZE_OPT,
+    BF_FP_OPT,
+    IF_OPT,
+    TEXT_OPT,
+    DISK_OPT,
+    NO_MERGE_OPT,
+    NO_UNLINK_OPT,
+    TIMING_OPT,
+    NO_WRITE_OPT
+  };
+
+  count_main_cmdline() :
+    mer_len_arg(), mer_len_given(false),
+    size_arg(), size_given(false),
+    threads_arg(1), threads_given(false),
+    Files_arg(1), Files_given(false),
+    generator_arg(""), generator_given(false),
+    Generators_arg(1), Generators_given(false),
+    shell_arg(""), shell_given(false),
+    output_arg("mer_counts.jf"), output_given(false),
+    counter_len_arg(7), counter_len_given(false),
+    out_counter_len_arg(4), out_counter_len_given(false),
+    canonical_flag(false),
+    bc_arg(""), bc_given(false),
+    bf_size_arg(), bf_size_given(false),
+    bf_fp_arg(0.01), bf_fp_given(false),
+    if_arg(), if_given(false),
+    min_qual_char_arg(""), min_qual_char_given(false),
+    reprobes_arg(126), reprobes_given(false),
+    text_flag(false),
+    disk_flag(false),
+    no_merge_flag(false),
+    no_unlink_flag(false),
+    lower_count_arg(), lower_count_given(false),
+    upper_count_arg(), upper_count_given(false),
+    timing_arg(""), timing_given(false),
+    no_write_flag(false),
+    file_arg()
+  { }
+
+  count_main_cmdline(int argc, char* argv[]) :
+    mer_len_arg(), mer_len_given(false),
+    size_arg(), size_given(false),
+    threads_arg(1), threads_given(false),
+    Files_arg(1), Files_given(false),
+    generator_arg(""), generator_given(false),
+    Generators_arg(1), Generators_given(false),
+    shell_arg(""), shell_given(false),
+    output_arg("mer_counts.jf"), output_given(false),
+    counter_len_arg(7), counter_len_given(false),
+    out_counter_len_arg(4), out_counter_len_given(false),
+    canonical_flag(false),
+    bc_arg(""), bc_given(false),
+    bf_size_arg(), bf_size_given(false),
+    bf_fp_arg(0.01), bf_fp_given(false),
+    if_arg(), if_given(false),
+    min_qual_char_arg(""), min_qual_char_given(false),
+    reprobes_arg(126), reprobes_given(false),
+    text_flag(false),
+    disk_flag(false),
+    no_merge_flag(false),
+    no_unlink_flag(false),
+    lower_count_arg(), lower_count_given(false),
+    upper_count_arg(), upper_count_given(false),
+    timing_arg(""), timing_given(false),
+    no_write_flag(false),
+    file_arg()
+  { parse(argc, argv); }
+
+  void parse(int argc, char* argv[]) {
+    static struct option long_options[] = {
+      {"mer-len", 1, 0, 'm'},
+      {"size", 1, 0, 's'},
+      {"threads", 1, 0, 't'},
+      {"Files", 1, 0, 'F'},
+      {"generator", 1, 0, 'g'},
+      {"Generators", 1, 0, 'G'},
+      {"shell", 1, 0, 'S'},
+      {"output", 1, 0, 'o'},
+      {"counter-len", 1, 0, 'c'},
+      {"out-counter-len", 1, 0, OUT_COUNTER_LEN_OPT},
+      {"canonical", 0, 0, 'C'},
+      {"bc", 1, 0, BC_OPT},
+      {"bf-size", 1, 0, BF_SIZE_OPT},
+      {"bf-fp", 1, 0, BF_FP_OPT},
+      {"if", 1, 0, IF_OPT},
+      {"min-qual-char", 1, 0, 'Q'},
+      {"reprobes", 1, 0, 'p'},
+      {"text", 0, 0, TEXT_OPT},
+      {"disk", 0, 0, DISK_OPT},
+      {"no-merge", 0, 0, NO_MERGE_OPT},
+      {"no-unlink", 0, 0, NO_UNLINK_OPT},
+      {"lower-count", 1, 0, 'L'},
+      {"upper-count", 1, 0, 'U'},
+      {"timing", 1, 0, TIMING_OPT},
+      {"no-write", 0, 0, NO_WRITE_OPT},
+      {"help", 0, 0, 'h'},
+      {"full-help", 0, 0, FULL_HELP_OPT},
+      {"usage", 0, 0, USAGE_OPT},
+      {"version", 0, 0, 'V'},
+      {0, 0, 0, 0}
+    };
+    static const char *short_options = "hVm:s:t:F:g:G:S:o:c:CQ:p:L:U:";
+
+    ::std::string err;
+#define CHECK_ERR(type,val,which) if(!err.empty()) { ::std::cerr << "Invalid " #type " '" << val << "' for [" which "]: " << err << "\n"; exit(1); }
+    while(true) {
+      int index = -1;
+      int c = getopt_long(argc, argv, short_options, long_options, &index);
+      if(c == -1) break;
+      switch(c) {
+      case ':':
+        ::std::cerr << "Missing required argument for "
+                  << (index == -1 ? ::std::string(1, (char)optopt) : std::string(long_options[index].name))
+                  << ::std::endl;
+        exit(1);
+      case 'h':
+        ::std::cout << usage() << "\n\n" << help() << std::endl;
+        exit(0);
+      case USAGE_OPT:
+        ::std::cout << usage() << "\nUse --help for more information." << std::endl;
+        exit(0);
+      case 'V':
+        print_version();
+        exit(0);
+      case '?':
+        ::std::cerr << "Use --usage or --help for some help\n";
+        exit(1);
+      case FULL_HELP_OPT:
+        ::std::cout << usage() << "\n\n" << help() << "\n\n" << hidden() << std::flush;
+        exit(0);
+      case 'm':
+        mer_len_given = true;
+        mer_len_arg = conv_uint<uint32_t>((const char*)optarg, err, false);
+        CHECK_ERR(uint32_t, optarg, "-m, --mer-len=uint32")
+        break;
+      case 's':
+        size_given = true;
+        size_arg = conv_uint<uint64_t>((const char*)optarg, err, true);
+        CHECK_ERR(uint64_t, optarg, "-s, --size=uint64")
+        break;
+      case 't':
+        threads_given = true;
+        threads_arg = conv_uint<uint32_t>((const char*)optarg, err, false);
+        CHECK_ERR(uint32_t, optarg, "-t, --threads=uint32")
+        break;
+      case 'F':
+        Files_given = true;
+        Files_arg = conv_uint<uint32_t>((const char*)optarg, err, false);
+        CHECK_ERR(uint32_t, optarg, "-F, --Files=uint32")
+        break;
+      case 'g':
+        generator_given = true;
+        generator_arg = optarg;
+        break;
+      case 'G':
+        Generators_given = true;
+        Generators_arg = conv_uint<uint32_t>((const char*)optarg, err, false);
+        CHECK_ERR(uint32_t, optarg, "-G, --Generators=uint32")
+        break;
+      case 'S':
+        shell_given = true;
+        shell_arg = optarg;
+        break;
+      case 'o':
+        output_given = true;
+        output_arg = optarg;
+        break;
+      case 'c':
+        counter_len_given = true;
+        counter_len_arg = conv_uint<uint32_t>((const char*)optarg, err, false);
+        CHECK_ERR(uint32_t, optarg, "-c, --counter-len=Length in bits")
+        break;
+      case OUT_COUNTER_LEN_OPT:
+        out_counter_len_given = true;
+        out_counter_len_arg = conv_uint<uint32_t>((const char*)optarg, err, false);
+        CHECK_ERR(uint32_t, optarg, "    --out-counter-len=Length in bytes")
+        break;
+      case 'C':
+        canonical_flag = true;
+        break;
+      case BC_OPT:
+        bc_given = true;
+        bc_arg = optarg;
+        break;
+      case BF_SIZE_OPT:
+        bf_size_given = true;
+        bf_size_arg = conv_uint<uint64_t>((const char*)optarg, err, true);
+        CHECK_ERR(uint64_t, optarg, "    --bf-size=uint64")
+        break;
+      case BF_FP_OPT:
+        bf_fp_given = true;
+        bf_fp_arg = conv_double((const char*)optarg, err, false);
+        CHECK_ERR(double_t, optarg, "    --bf-fp=double")
+        break;
+      case IF_OPT:
+        if_given = true;
+        if_arg.push_back(optarg);
+        break;
+      case 'Q':
+        min_qual_char_given = true;
+        min_qual_char_arg.assign(optarg);
+        break;
+      case 'p':
+        reprobes_given = true;
+        reprobes_arg = conv_uint<uint32_t>((const char*)optarg, err, false);
+        CHECK_ERR(uint32_t, optarg, "-p, --reprobes=uint32")
+        break;
+      case TEXT_OPT:
+        text_flag = true;
+        break;
+      case DISK_OPT:
+        disk_flag = true;
+        break;
+      case NO_MERGE_OPT:
+        no_merge_flag = true;
+        break;
+      case NO_UNLINK_OPT:
+        no_unlink_flag = true;
+        break;
+      case 'L':
+        lower_count_given = true;
+        lower_count_arg = conv_uint<uint64_t>((const char*)optarg, err, false);
+        CHECK_ERR(uint64_t, optarg, "-L, --lower-count=uint64")
+        break;
+      case 'U':
+        upper_count_given = true;
+        upper_count_arg = conv_uint<uint64_t>((const char*)optarg, err, false);
+        CHECK_ERR(uint64_t, optarg, "-U, --upper-count=uint64")
+        break;
+      case TIMING_OPT:
+        timing_given = true;
+        timing_arg = optarg;
+        break;
+      case NO_WRITE_OPT:
+        no_write_flag = true;
+        break;
+      }
+    }
+
+    // Check that required switches are present
+    if(!mer_len_given)
+      error("[-m, --mer-len=uint32] required switch");
+    if(!size_given)
+      error("[-s, --size=uint64] required switch");
+
+    // Check mutually exlusive switches
+    if(bf_size_given && bc_given)
+      error("Switches [    --bf-size=uint64] and [    --bc=peath] are mutually exclusive");
+
+    // Parse arguments
+    if(argc - optind < 0)
+      error("Requires at least 0 argument.");
+    for( ; optind < argc; ++optind) {
+      file_arg.push_back(argv[optind]);
+    }
+  }
+  static const char * usage() { return "Usage: jellyfish count [options] file:path+"; }
+  class error {
+    int code_;
+    std::ostringstream msg_;
+
+    // Select the correct version (GNU or XSI) version of
+    // strerror_r. strerror_ behaves like the GNU version of strerror_r,
+    // regardless of which version is provided by the system.
+    static const char* strerror__(char* buf, int res) {
+      return res != -1 ? buf : "Invalid error";
+    }
+    static const char* strerror__(char* buf, char* res) {
+      return res;
+    }
+    static const char* strerror_(int err, char* buf, size_t buflen) {
+      return strerror__(buf, strerror_r(err, buf, buflen));
+    }
+    struct no_t { };
+
+  public:
+    static no_t no;
+    error(int code = EXIT_FAILURE) : code_(code) { }
+    explicit error(const char* msg, int code = EXIT_FAILURE) : code_(code)
+      { msg_ << msg; }
+    error(const std::string& msg, int code = EXIT_FAILURE) : code_(code)
+      { msg_ << msg; }
+    error& operator<<(no_t) {
+      char buf[1024];
+      msg_ << ": " << strerror_(errno, buf, sizeof(buf));
+      return *this;
+    }
+    template<typename T>
+    error& operator<<(const T& x) { msg_ << x; return (*this); }
+    ~error() {
+      ::std::cerr << "Error: " << msg_.str() << "\n"
+                  << usage() << "\n"
+                  << "Use --help for more information"
+                  << ::std::endl;
+      exit(code_);
+    }
+  };
+  static const char * help() { return
+    "Count k-mers in fasta or fastq files\n\n"
+    "Options (default value in (), *required):\n"
+    " -m, --mer-len=uint32                    *Length of mer\n"
+    " -s, --size=uint64                       *Initial hash size\n"
+    " -t, --threads=uint32                     Number of threads (1)\n"
+    " -F, --Files=uint32                       Number files open simultaneously (1)\n"
+    " -g, --generator=path                     File of commands generating fast[aq]\n"
+    " -G, --Generators=uint32                  Number of generators run simultaneously (1)\n"
+    " -S, --shell=string                       Shell used to run generator commands ($SHELL or /bin/sh)\n"
+    " -o, --output=string                      Output file (mer_counts.jf)\n"
+    " -c, --counter-len=Length in bits         Length bits of counting field (7)\n"
+    "     --out-counter-len=Length in bytes    Length in bytes of counter field in output (4)\n"
+    " -C, --canonical                          Count both strand, canonical representation (false)\n"
+    "     --bc=peath                           Bloom counter to filter out singleton mers\n"
+    "     --bf-size=uint64                     Use bloom filter to count high-frequency mers\n"
+    "     --bf-fp=double                       False positive rate of bloom filter (0.01)\n"
+    "     --if=path                            Count only k-mers in this files\n"
+    " -Q, --min-qual-char=string               Any base with quality below this character is changed to N\n"
+    " -p, --reprobes=uint32                    Maximum number of reprobes (126)\n"
+    "     --text                               Dump in text format (false)\n"
+    "     --disk                               Disk operation. Do not do size doubling (false)\n"
+    " -L, --lower-count=uint64                 Don't output k-mer with count < lower-count\n"
+    " -U, --upper-count=uint64                 Don't output k-mer with count > upper-count\n"
+    "     --timing=Timing file                 Print timing information\n"
+    "     --usage                              Usage\n"
+    " -h, --help                               This message\n"
+    "     --full-help                          Detailed help\n"
+    " -V, --version                            Version";
+  }
+  static const char* hidden() { return
+    "Hidden options:\n"
+    "     --no-merge                           Do not merge files intermediary files (false)\n"
+    "     --no-unlink                          Do not unlink intermediary files after automatic merging (false)\n"
+    "     --no-write                           Don't write database (false)\n"
+    "";
+  }
+  void print_version(::std::ostream &os = std::cout) const {
+#ifndef PACKAGE_VERSION
+#define PACKAGE_VERSION "0.0.0"
+#endif
+    os << PACKAGE_VERSION << "\n";
+  }
+  void dump(::std::ostream &os = std::cout) {
+    os << "mer_len_given:" << mer_len_given << " mer_len_arg:" << mer_len_arg << "\n";
+    os << "size_given:" << size_given << " size_arg:" << size_arg << "\n";
+    os << "threads_given:" << threads_given << " threads_arg:" << threads_arg << "\n";
+    os << "Files_given:" << Files_given << " Files_arg:" << Files_arg << "\n";
+    os << "generator_given:" << generator_given << " generator_arg:" << generator_arg << "\n";
+    os << "Generators_given:" << Generators_given << " Generators_arg:" << Generators_arg << "\n";
+    os << "shell_given:" << shell_given << " shell_arg:" << shell_arg << "\n";
+    os << "output_given:" << output_given << " output_arg:" << output_arg << "\n";
+    os << "counter_len_given:" << counter_len_given << " counter_len_arg:" << counter_len_arg << "\n";
+    os << "out_counter_len_given:" << out_counter_len_given << " out_counter_len_arg:" << out_counter_len_arg << "\n";
+    os << "canonical_flag:" << canonical_flag << "\n";
+    os << "bc_given:" << bc_given << " bc_arg:" << bc_arg << "\n";
+    os << "bf_size_given:" << bf_size_given << " bf_size_arg:" << bf_size_arg << "\n";
+    os << "bf_fp_given:" << bf_fp_given << " bf_fp_arg:" << bf_fp_arg << "\n";
+    os << "if_given:" << if_given << " if_arg:" << vec_str(if_arg) << "\n";
+    os << "min_qual_char_given:" << min_qual_char_given << " min_qual_char_arg:" << min_qual_char_arg << "\n";
+    os << "reprobes_given:" << reprobes_given << " reprobes_arg:" << reprobes_arg << "\n";
+    os << "text_flag:" << text_flag << "\n";
+    os << "disk_flag:" << disk_flag << "\n";
+    os << "no_merge_flag:" << no_merge_flag << "\n";
+    os << "no_unlink_flag:" << no_unlink_flag << "\n";
+    os << "lower_count_given:" << lower_count_given << " lower_count_arg:" << lower_count_arg << "\n";
+    os << "upper_count_given:" << upper_count_given << " upper_count_arg:" << upper_count_arg << "\n";
+    os << "timing_given:" << timing_given << " timing_arg:" << timing_arg << "\n";
+    os << "no_write_flag:" << no_write_flag << "\n";
+    os << "file_arg:" << vec_str(file_arg) << "\n";
+  }
+};
+#endif // __COUNT_MAIN_CMDLINE_HPP__"
diff --git a/include/cuckoohash_config.h b/include/cuckoohash_config.h
new file mode 100644
index 0000000..6c9fe0c
--- /dev/null
+++ b/include/cuckoohash_config.h
@@ -0,0 +1,16 @@
+/*! \file */
+
+#ifndef _CUCKOOHASH_CONFIG_H
+#define _CUCKOOHASH_CONFIG_H
+
+//! SLOT_PER_BUCKET is the maximum number of keys per bucket
+const size_t SLOT_PER_BUCKET = 4;
+
+//! DEFAULT_SIZE is the default number of elements in an empty hash
+//! table
+const size_t DEFAULT_SIZE = (1U << 16) * SLOT_PER_BUCKET;
+
+//! set LIBCUCKOO_DEBUG to 1 to enable debug output
+#define LIBCUCKOO_DEBUG 0
+
+#endif // _CUCKOOHASH_CONFIG_H
diff --git a/include/cuckoohash_map.hh b/include/cuckoohash_map.hh
new file mode 100644
index 0000000..f60262a
--- /dev/null
+++ b/include/cuckoohash_map.hh
@@ -0,0 +1,2145 @@
+#ifndef _CUCKOOHASH_MAP_HH
+#define _CUCKOOHASH_MAP_HH
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <bitset>
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <functional>
+#include <limits>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <thread>
+#include <tuple>
+#include <type_traits>
+#include <unistd.h>
+#include <utility>
+#include <vector>
+
+#include "cuckoohash_config.h"
+#include "cuckoohash_util.h"
+
+//! cuckoohash_map is the hash table class.
+template <class Key, class T, class Hash = std::hash<Key>,
+          class Pred = std::equal_to<Key> >
+class cuckoohash_map {
+public:
+    //! key_type is the type of keys.
+    typedef Key               key_type;
+    //! value_type is the type of key-value pairs.
+    typedef std::pair<const Key, T> value_type;
+    //! mapped_type is the type of values.
+    typedef T                 mapped_type;
+    //! hasher is the type of the hash function.
+    typedef Hash              hasher;
+    //! key_equal is the type of the equality predicate.
+    typedef Pred              key_equal;
+
+    //! Class returned by operator[] which wraps an entry in the hash table.
+    //! Note that this reference type behave somewhat differently from an STL
+    //! map reference. Most importantly, running this operator will not insert a
+    //! default key-value pair into the map if the given key is not already in
+    //! the map.
+    class reference {
+        // Note that this implementation here is not exactly STL compliant. To
+        // maintain performance and avoid hitting the hash table too many times,
+        // The reference object is *lazy*. In other words,
+        //
+        //  - operator[] does not actually perform an insert. It returns a
+        //    reference object pointing to the requested key.
+        //  - On table[i] = val // reference::operator=(mapped_type)
+        //    an update / insert is called
+        //  - On table[i] = table[j] // reference::operator=(const reference&)
+        //    an update / insert is called with the value of table[j]
+        //  - On val = table[i] // operator mapped_type()
+        //    a find is called
+        //  - On table[i] (i.e. no operation performed)
+        //    the destructor is called immediately (reference::~reference())
+        //    and nothing happens.
+    public:
+        //! Delete the default constructor, which should never be used
+        reference() = delete;
+
+        //! Casting to \p mapped_type runs a find for the stored key. If the
+        //! find fails, it will thrown an exception.
+        operator mapped_type() const {
+            return owner_.find(key_);
+        }
+
+        //! The assignment operator will first try to update the value at the
+        //! reference's key. If the key isn't in the table, it will insert the
+        //! key with \p val.
+        reference& operator=(const mapped_type& val) {
+            owner_.upsert(
+                key_, [&val](mapped_type& v) { v = val; }, val);
+            return *this;
+        }
+
+        //! The copy assignment operator doesn't actually copy the passed-in
+        //! reference. Instead, it has the same behavior as operator=(const
+        //! mapped_type& val).
+        reference& operator=(const reference& ref) {
+            *this = (mapped_type) ref;
+            return *this;
+        }
+
+    private:
+        // private constructor which initializes the owner and key
+        reference(cuckoohash_map& owner, const key_type& key)
+            : owner_(owner), key_(key) {}
+
+        // reference to the hash map instance
+        cuckoohash_map& owner_;
+        // the referenced key
+        const key_type& key_;
+
+        // cuckoohash_map needs to call the private constructor
+        friend class cuckoohash_map;
+    };
+
+    typedef const mapped_type const_reference;
+
+private:
+    // Constants used internally
+
+    // true if the key is small and simple, which means using partial keys would
+    // probably slow us down
+    static const bool is_simple =
+        std::is_pod<key_type>::value && sizeof(key_type) <= 8;
+
+    static const bool value_copy_assignable = std::is_copy_assignable<
+        mapped_type>::value;
+
+    // number of locks in the locks_ array
+    static const size_t kNumLocks = 1 << 16;
+
+    // number of cores on the machine
+    static size_t kNumCores() {
+        static size_t cores = std::thread::hardware_concurrency() == 0 ?
+            sysconf(_SC_NPROCESSORS_ONLN) : std::thread::hardware_concurrency();
+        return cores;
+    }
+
+    // A fast, lightweight spinlock
+    class spinlock {
+        std::atomic_flag lock_;
+    public:
+        spinlock() {
+            lock_.clear();
+        }
+
+        inline void lock() {
+            while (lock_.test_and_set(std::memory_order_acquire));
+        }
+
+        inline void unlock() {
+            lock_.clear(std::memory_order_release);
+        }
+
+        inline bool try_lock() {
+            return !lock_.test_and_set(std::memory_order_acquire);
+        }
+
+    } __attribute__((aligned(64)));
+
+    typedef enum {
+        ok = 0,
+        failure = 1,
+        failure_key_not_found = 2,
+        failure_key_duplicated = 3,
+        failure_space_not_enough = 4,
+        failure_function_not_supported = 5,
+        failure_table_full = 6,
+        failure_under_expansion = 7,
+    } cuckoo_status;
+
+    typedef char partial_t;
+    // Two partial key containers. One for when we're actually using partial
+    // keys and another that mocks partial keys for when the type is simple. The
+    // bucket will derive the correct class depending on whether the type is
+    // simple or not.
+    class RealPartialContainer {
+        std::array<partial_t, SLOT_PER_BUCKET> partials_;
+    public:
+        const partial_t& partial(int ind) const {
+            return partials_[ind];
+        }
+        partial_t& partial(int ind) {
+            return partials_[ind];
+        }
+    };
+
+    class FakePartialContainer {
+    public:
+        // These methods should never be called, so we raise an exception if
+        // they are.
+        const partial_t& partial(int) const {
+            throw std::logic_error(
+                "FakePartialContainer::partial should never be called");
+        }
+        partial_t& partial(int) {
+            throw std::logic_error(
+                "FakePartialContainer::partial should never be called");
+        }
+    };
+
+    // The Bucket type holds SLOT_PER_BUCKET keys and values, and a occupied
+    // bitset, which indicates whether the slot at the given bit index is in
+    // the table or not. It uses aligned_storage arrays to store the keys and
+    // values to allow constructing and destroying key-value pairs in place.
+    class Bucket : public std::conditional<is_simple, FakePartialContainer,
+                                           RealPartialContainer>::type {
+    private:
+        std::array<typename std::aligned_storage<
+                       sizeof(key_type), alignof(key_type)>::type,
+                   SLOT_PER_BUCKET> keys_;
+        std::array<typename std::aligned_storage<
+                       sizeof(mapped_type), alignof(mapped_type)>::type,
+                   SLOT_PER_BUCKET> vals_;
+        std::bitset<SLOT_PER_BUCKET> occupied_;
+
+        // key_allocator is the allocator used to construct keys
+        static std::allocator<key_type> key_allocator;
+
+        // value_allocator is the allocator to construct values
+        static std::allocator<mapped_type> value_allocator;
+
+    public:
+        bool occupied(int ind) const {
+            return occupied_.test(ind);
+        }
+
+        const key_type& key(int ind) const {
+            return *static_cast<const key_type*>(
+                static_cast<const void*>(&keys_[ind]));
+        }
+
+        key_type& key(int ind) {
+            return *static_cast<key_type*>(static_cast<void*>(&keys_[ind]));
+        }
+
+        const mapped_type& val(int ind) const {
+            return *static_cast<const mapped_type*>(
+                static_cast<const void*>(&vals_[ind]));
+        }
+
+        mapped_type& val(int ind) {
+            return *static_cast<mapped_type*>(static_cast<void*>(&vals_[ind]));
+        }
+
+        template <class V>
+        void setKV(size_t ind, const key_type& k, V v) {
+            occupied_.set(ind);
+            key_allocator.construct(&key(ind), k);
+            value_allocator.construct(&val(ind), std::forward<V>(v));
+        }
+
+        void eraseKV(size_t ind) {
+            occupied_.reset(ind);
+            key_allocator.destroy(&key(ind));
+            value_allocator.destroy(&val(ind));
+        }
+
+        Bucket() {
+            occupied_.reset();
+        }
+
+        ~Bucket() {
+            for (size_t i = 0; i < SLOT_PER_BUCKET; ++i) {
+                if (occupied(i)) {
+                    eraseKV(i);
+                }
+            }
+        }
+    };
+
+    // cacheint is a cache-aligned atomic integer type.
+    struct cacheint {
+        std::atomic<size_t> num;
+        cacheint(): num(0) {}
+        cacheint(size_t x): num(x) {}
+        cacheint(cacheint&& x): num(x.num.load()) {}
+    } __attribute__((aligned(64)));
+
+    // An alias for the type of lock we are using
+    typedef spinlock locktype;
+
+    // TableInfo contains the entire state of the hashtable. We allocate one
+    // TableInfo pointer per hash table and store all of the table memory in it,
+    // so that all the data can be atomically swapped during expansion.
+    struct TableInfo {
+        // 2**hashpower is the number of buckets
+        size_t hashpower_;
+
+        // vector of buckets
+        std::vector<Bucket> buckets_;
+
+        // array of locks
+        std::array<locktype, kNumLocks> locks_;
+
+        // per-core counters for the number of inserts and deletes
+        std::vector<cacheint> num_inserts, num_deletes;
+
+        // The constructor allocates the memory for the table. It allocates one
+        // cacheint for each core in num_inserts and num_deletes.
+        TableInfo(const size_t hashpower)
+            : hashpower_(hashpower), buckets_(hashsize(hashpower_)),
+              num_inserts(kNumCores()), num_deletes(kNumCores()) {}
+
+        ~TableInfo() {}
+    };
+
+    // This is a hazard pointer, used to indicate which version of the TableInfo
+    // is currently being used in the thread. Since cuckoohash_map operations
+    // can run simultaneously in different threads, this variable is thread
+    // local. Note that this variable can be safely shared between different
+    // cuckoohash_map instances, since multiple operations cannot occur
+    // simultaneously in one thread. The hazard pointer variable points to a
+    // pointer inside a global list of pointers, that each map checks before
+    // deleting any old TableInfo pointers.
+    static __thread TableInfo** hazard_pointer;
+
+    // A GlobalHazardPointerList stores a list of pointers that cannot be
+    // deleted by an expansion thread. Each thread gets its own node in the
+    // list, whose data pointer it can modify without contention.
+    class GlobalHazardPointerList {
+        std::list<TableInfo*> hp_;
+        std::mutex lock_;
+    public:
+        // new_hazard_pointer creates and returns a new hazard pointer for a
+        // thread.
+        TableInfo** new_hazard_pointer() {
+            std::unique_lock<std::mutex> ul(lock_);
+            hp_.emplace_back(nullptr);
+            return &hp_.back();
+        }
+
+        // delete_unused scans the list of hazard pointers, deleting any
+        // pointers in old_pointers that aren't in this list. If it does delete
+        // a pointer in old_pointers, it deletes that node from the list.
+        void delete_unused(std::list<std::unique_ptr<TableInfo>>&
+                           old_pointers) {
+            std::unique_lock<std::mutex> ul(lock_);
+            old_pointers.remove_if(
+                [this](const std::unique_ptr<TableInfo>& ptr) {
+                    return std::find(hp_.begin(), hp_.end(), ptr.get()) ==
+                        hp_.end();
+                });
+        }
+    };
+
+    // As long as the thread_local hazard_pointer is static, which means each
+    // template instantiation of a cuckoohash_map class gets its own per-thread
+    // hazard pointer, then each template instantiation of a cuckoohash_map
+    // class can get its own global_hazard_pointers list, since different
+    // template instantiations won't interfere with each other.
+    static GlobalHazardPointerList global_hazard_pointers;
+
+    // check_hazard_pointer should be called before any public method that loads
+    // a table snapshot. It checks that the thread local hazard pointer pointer
+    // is not null, and gets a new pointer if it is null.
+    static inline void check_hazard_pointer() {
+        if (hazard_pointer == nullptr) {
+            hazard_pointer = global_hazard_pointers.new_hazard_pointer();
+        }
+    }
+
+    // Once a function is finished with a version of the table, it will want to
+    // unset the hazard pointer it set so that it can be freed if it needs to.
+    // This is an object which, upon destruction, will unset the hazard pointer.
+    class HazardPointerUnsetter {
+    public:
+        ~HazardPointerUnsetter() {
+            *hazard_pointer = nullptr;
+        }
+    };
+
+    // counterid stores the per-thread counter index of each thread.
+    static __thread int counterid;
+
+    // check_counterid checks if the counterid has already been determined. If
+    // not, it assigns a counterid to the current thread by picking a random
+    // core. This should be called at the beginning of any function that changes
+    // the number of elements in the table.
+    static inline void check_counterid() {
+        if (counterid < 0) {
+            counterid = rand() % kNumCores();
+        }
+    }
+
+    // reserve_calc takes in a parameter specifying a certain number of slots
+    // for a table and returns the smallest hashpower that will hold n elements.
+    static size_t reserve_calc(size_t n) {
+        double nhd = ceil(log2((double)n / (double)SLOT_PER_BUCKET));
+        size_t new_hashpower = (size_t) (nhd <= 0 ? 1.0 : nhd);
+        assert(n <= hashsize(new_hashpower) * SLOT_PER_BUCKET);
+        return new_hashpower;
+    }
+
+    // hashfn returns an instance of the hash function
+    static hasher hashfn() {
+        static hasher hash;
+        return hash;
+    }
+
+    // eqfn returns an instance of the equality predicate
+    static key_equal eqfn() {
+        static key_equal eq;
+        return eq;
+    }
+
+public:
+    //! The constructor creates a new hash table with enough space for \p n
+    //! elements. If the constructor fails, it will throw an exception.
+    explicit cuckoohash_map(size_t n = DEFAULT_SIZE) {
+        cuckoo_init(reserve_calc(n));
+    }
+
+    //! The destructor explicitly deletes the current table info.
+    ~cuckoohash_map() {
+        TableInfo* ti = table_info.load();
+        if (ti != nullptr) {
+            delete ti;
+        }
+    }
+
+    //! clear removes all the elements in the hash table, calling their
+    //! destructors.
+    void clear() {
+        check_hazard_pointer();
+        TableInfo* ti = snapshot_and_lock_all();
+        assert(ti == table_info.load());
+        AllUnlocker au(ti);
+        HazardPointerUnsetter hpu;
+        cuckoo_clear(ti);
+    }
+
+    //! size returns the number of items currently in the hash table. Since it
+    //! doesn't lock the table, elements can be inserted during the computation,
+    //! so the result may not necessarily be exact.
+    size_t size() const {
+        check_hazard_pointer();
+        const TableInfo* ti = snapshot_table_nolock();
+        HazardPointerUnsetter hpu;
+        const size_t s = cuckoo_size(ti);
+        return s;
+    }
+
+    //! empty returns true if the table is empty.
+    bool empty() const {
+        return size() == 0;
+    }
+
+    //! hashpower returns the hashpower of the table, which is
+    //! log<SUB>2</SUB>(the number of buckets).
+    size_t hashpower() const {
+        check_hazard_pointer();
+        TableInfo* ti = snapshot_table_nolock();
+        HazardPointerUnsetter hpu;
+        const size_t hashpower = ti->hashpower_;
+        return hashpower;
+    }
+
+    //! bucket_count returns the number of buckets in the table.
+    size_t bucket_count() const {
+        check_hazard_pointer();
+        TableInfo* ti = snapshot_table_nolock();
+        HazardPointerUnsetter hpu;
+        size_t buckets = hashsize(ti->hashpower_);
+        return buckets;
+    }
+
+    //! load_factor returns the ratio of the number of items in the table to the
+    //! total number of available slots in the table.
+    double load_factor() const {
+        check_hazard_pointer();
+        const TableInfo* ti = snapshot_table_nolock();
+        HazardPointerUnsetter hpu;
+        return cuckoo_loadfactor(ti);
+    }
+
+    //! find searches through the table for \p key, and stores the associated
+    //! value it finds in \p val.
+    ENABLE_IF(, value_copy_assignable, bool)
+    find(const key_type& key, mapped_type& val) const {
+        check_hazard_pointer();
+        size_t hv = hashed_key(key);
+        TableInfo* ti;
+        size_t i1, i2;
+        std::tie(ti, i1, i2) = snapshot_and_lock_two(hv);
+        HazardPointerUnsetter hpu;
+
+        const cuckoo_status st = cuckoo_find(key, val, hv, ti, i1, i2);
+        unlock_two(ti, i1, i2);
+        return (st == ok);
+    }
+
+    //! This version of find does the same thing as the two-argument version,
+    //! except it returns the value it finds, throwing an \p std::out_of_range
+    //! exception if the key isn't in the table.
+    ENABLE_IF(, value_copy_assignable, mapped_type)
+    find(const key_type& key) const {
+        mapped_type val;
+        bool done = find(key, val);
+        if (done) {
+            return val;
+        } else {
+            throw std::out_of_range("key not found in table");
+        }
+    }
+
+    //! contains searches through the table for \p key, and returns true if it
+    //! finds it in the table, and false otherwise.
+    bool contains(const key_type& key) const {
+        check_hazard_pointer();
+        size_t hv = hashed_key(key);
+        TableInfo* ti;
+        size_t i1, i2;
+        std::tie(ti, i1, i2) = snapshot_and_lock_two(hv);
+        HazardPointerUnsetter hpu;
+
+        const bool result = cuckoo_contains(key, hv, ti, i1, i2);
+        unlock_two(ti, i1, i2);
+        return result;
+    }
+
+    //! insert puts the given key-value pair into the table. It first checks
+    //! that \p key isn't already in the table, since the table doesn't support
+    //! duplicate keys. If the table is out of space, insert will automatically
+    //! expand until it can succeed. Note that expansion can throw an exception,
+    //! which insert will propagate. If \p key is already in the table, it
+    //! returns false, otherwise it returns true.
+    template <class V>
+    typename std::enable_if<std::is_convertible<V, const mapped_type&>::value,
+                            bool>::type
+    insert(const key_type& key, V val) {
+        check_hazard_pointer();
+        check_counterid();
+        size_t hv = hashed_key(key);
+        TableInfo* ti;
+        size_t i1, i2;
+        std::tie(ti, i1, i2) = snapshot_and_lock_two(hv);
+        HazardPointerUnsetter hpu;
+        return cuckoo_insert_loop(key, std::forward<V>(val),
+                                  hv, ti, i1, i2);
+    }
+
+    //! erase removes \p key and it's associated value from the table, calling
+    //! their destructors. If \p key is not there, it returns false, otherwise
+    //! it returns true.
+    bool erase(const key_type& key) {
+        check_hazard_pointer();
+        check_counterid();
+        size_t hv = hashed_key(key);
+        TableInfo* ti;
+        size_t i1, i2;
+        std::tie(ti, i1, i2) = snapshot_and_lock_two(hv);
+        HazardPointerUnsetter hpu;
+
+        const cuckoo_status st = cuckoo_delete(key, hv, ti, i1, i2);
+        unlock_two(ti, i1, i2);
+        return (st == ok);
+    }
+
+    //! update changes the value associated with \p key to \p val. If \p key is
+    //! not there, it returns false, otherwise it returns true.
+    ENABLE_IF(, value_copy_assignable, bool)
+    update(const key_type& key, const mapped_type& val) {
+        check_hazard_pointer();
+        size_t hv = hashed_key(key);
+        TableInfo* ti;
+        size_t i1, i2;
+        std::tie(ti, i1, i2) = snapshot_and_lock_two(hv);
+        HazardPointerUnsetter hpu;
+
+        const cuckoo_status st = cuckoo_update(key, val, hv, ti, i1, i2);
+        unlock_two(ti, i1, i2);
+        return (st == ok);
+    }
+
+    //! update_fn changes the value associated with \p key with the function \p
+    //! fn. \p fn will be passed one argument of type \p mapped_type& and can
+    //! modify the argument as desired, returning nothing. If \p key is not
+    //! there, it returns false, otherwise it returns true.
+    template <typename Updater>
+    bool update_fn(const key_type& key, Updater fn) {
+        check_hazard_pointer();
+        size_t hv = hashed_key(key);
+        TableInfo* ti;
+        size_t i1, i2;
+        std::tie(ti, i1, i2) = snapshot_and_lock_two(hv);
+        HazardPointerUnsetter hpu;
+
+        const cuckoo_status st = cuckoo_update_fn(key, fn, hv, ti, i1, i2);
+        unlock_two(ti, i1, i2);
+        return (st == ok);
+    }
+
+    //! upsert is a combination of update_fn and insert. It first tries updating
+    //! the value associated with \p key using \p fn. If \p key is not in the
+    //! table, then it runs an insert with \p key and \p val. It will always
+    //! succeed, since if the update fails and the insert finds the key already
+    //! inserted, it can retry the update.
+    template <typename Updater>
+    void upsert(const key_type& key, Updater fn, const mapped_type& val) {
+        check_hazard_pointer();
+        check_counterid();
+        size_t hv = hashed_key(key);
+        TableInfo* ti;
+        size_t i1, i2;
+
+        bool res;
+        do {
+            std::tie(ti, i1, i2) = snapshot_and_lock_two(hv);
+            HazardPointerUnsetter hpu;
+            const cuckoo_status st = cuckoo_update_fn(key, fn, hv, ti, i1, i2);
+            if (st == ok) {
+                unlock_two(ti, i1, i2);
+                return;
+            }
+
+            // We run an insert, since the update failed
+            res = cuckoo_insert_loop(key, val, hv, ti, i1, i2);
+
+            // The only valid reason for res being false is if insert
+            // encountered a duplicate key after releasing the locks and
+            // performing cuckoo hashing. In this case, we retry the entire
+            // upsert operation.
+        } while (!res);
+        return;
+    }
+
+    //! rehash will size the table using a hashpower of \p n. Note that the
+    //! number of buckets in the table will be 2<SUP>\p n</SUP> after expansion,
+    //! so the table will have 2<SUP>\p n</SUP> × \ref SLOT_PER_BUCKET
+    //! slots to store items in. If \p n is not larger than the current
+    //! hashpower, then the function does nothing. It returns true if the table
+    //! expansion succeeded, and false otherwise. rehash can throw an exception
+    //! if the expansion fails to allocate enough memory for the larger table.
+    bool rehash(size_t n) {
+        check_hazard_pointer();
+        TableInfo* ti = snapshot_table_nolock();
+        HazardPointerUnsetter hpu;
+        if (n <= ti->hashpower_) {
+            return false;
+        }
+        const cuckoo_status st = cuckoo_expand_simple(n);
+        return (st == ok);
+    }
+
+    //! reserve will size the table to have enough slots for at least \p n
+    //! elements. If the table can already hold that many elements, the function
+    //! has no effect. Otherwise, the function will expand the table to a
+    //! hashpower sufficient to hold \p n elements. It will return true if there
+    //! was an expansion, and false otherwise. reserve can throw an exception if
+    //! the expansion fails to allocate enough memory for the larger table.
+    bool reserve(size_t n) {
+        check_hazard_pointer();
+        TableInfo* ti = snapshot_table_nolock();
+        HazardPointerUnsetter hpu;
+        if (n <= hashsize(ti->hashpower_) * SLOT_PER_BUCKET) {
+            return false;
+        }
+        const cuckoo_status st = cuckoo_expand_simple(reserve_calc(n));
+        return (st == ok);
+    }
+
+    //! hash_function returns the hash function object used by the table.
+    hasher hash_function() const {
+        return hashfn();
+    }
+
+    //! key_eq returns the equality predicate object used by the table.
+    key_equal key_eq() const {
+        return eqfn();
+    }
+
+    //! Returns a \ref reference to the mapped value stored at the given key.
+    //! Note that the reference behaves somewhat differently from an STL map
+    //! reference (see the \ref reference documentation for details).
+    reference operator[](const key_type& key) {
+        return (reference(*this, key));
+    }
+
+    //! Returns a \ref const_reference to the mapped value stored at the given
+    //! key. This is equivalent to running the overloaded \ref find function
+    //! with no value parameter.
+    const_reference operator[](const key_type& key) const {
+        return find(key);
+    }
+
+private:
+    std::atomic<TableInfo*> table_info;
+
+    // old_table_infos holds pointers to old TableInfos that were replaced
+    // during expansion. This keeps the memory alive for any leftover
+    // operations, until they are deleted by the global hazard pointer manager.
+    std::list<std::unique_ptr<TableInfo>> old_table_infos;
+
+    // lock locks the given bucket index.
+    static inline void lock(TableInfo* ti, const size_t i) {
+        ti->locks_[lock_ind(i)].lock();
+    }
+
+    // unlock unlocks the given bucket index.
+    static inline void unlock(TableInfo* ti, const size_t i) {
+        ti->locks_[lock_ind(i)].unlock();
+    }
+
+    // lock_two locks the two bucket indexes, always locking the earlier index
+    // first to avoid deadlock. If the two indexes are the same, it just locks
+    // one.
+    static void lock_two(TableInfo* ti, size_t i1, size_t i2) {
+        i1 = lock_ind(i1);
+        i2 = lock_ind(i2);
+        if (i1 < i2) {
+            ti->locks_[i1].lock();
+            ti->locks_[i2].lock();
+        } else if (i2 < i1) {
+            ti->locks_[i2].lock();
+            ti->locks_[i1].lock();
+        } else {
+            ti->locks_[i1].lock();
+        }
+    }
+
+    // unlock_two unlocks both of the given bucket indexes, or only one if they
+    // are equal. Order doesn't matter here.
+    static void unlock_two(TableInfo* ti, size_t i1, size_t i2) {
+        i1 = lock_ind(i1);
+        i2 = lock_ind(i2);
+        ti->locks_[i1].unlock();
+        if (i1 != i2) {
+            ti->locks_[i2].unlock();
+        }
+    }
+
+    // lock_three locks the three bucket indexes in numerical order.
+    static void lock_three(TableInfo* ti, size_t i1,
+                           size_t i2, size_t i3) {
+        i1 = lock_ind(i1);
+        i2 = lock_ind(i2);
+        i3 = lock_ind(i3);
+        // If any are the same, we just run lock_two
+        if (i1 == i2) {
+            lock_two(ti, i1, i3);
+        } else if (i2 == i3) {
+            lock_two(ti, i1, i3);
+        } else if (i1 == i3) {
+            lock_two(ti, i1, i2);
+        } else {
+            if (i1 < i2) {
+                if (i2 < i3) {
+                    ti->locks_[i1].lock();
+                    ti->locks_[i2].lock();
+                    ti->locks_[i3].lock();
+                } else if (i1 < i3) {
+                    ti->locks_[i1].lock();
+                    ti->locks_[i3].lock();
+                    ti->locks_[i2].lock();
+                } else {
+                    ti->locks_[i3].lock();
+                    ti->locks_[i1].lock();
+                    ti->locks_[i2].lock();
+                }
+            } else if (i2 < i3) {
+                if (i1 < i3) {
+                    ti->locks_[i2].lock();
+                    ti->locks_[i1].lock();
+                    ti->locks_[i3].lock();
+                } else {
+                    ti->locks_[i2].lock();
+                    ti->locks_[i3].lock();
+                    ti->locks_[i1].lock();
+                }
+            } else {
+                ti->locks_[i3].lock();
+                ti->locks_[i2].lock();
+                ti->locks_[i1].lock();
+            }
+        }
+    }
+
+    // unlock_three unlocks the three given buckets
+    static void unlock_three(TableInfo* ti, size_t i1,
+                             size_t i2, size_t i3) {
+        i1 = lock_ind(i1);
+        i2 = lock_ind(i2);
+        i3 = lock_ind(i3);
+        ti->locks_[i1].unlock();
+        if (i2 != i1) {
+            ti->locks_[i2].unlock();
+        }
+        if (i3 != i1 && i3 != i2) {
+            ti->locks_[i3].unlock();
+        }
+    }
+
+    // snapshot_table_nolock loads the table info pointer and sets the hazard
+    // pointer, whithout locking anything. There is a possibility that after
+    // loading a snapshot and setting the hazard pointer, an expansion runs and
+    // create a new version of the table, leaving the old one for deletion. To
+    // deal with that, we check that the table_info we loaded is the same as the
+    // current one, and if it isn't, we try again. Whenever we check if (ti !=
+    // table_info.load()) after setting the hazard pointer, there is an ABA
+    // issue, where the address of the new table_info equals the address of a
+    // previously deleted one, however it doesn't matter, since we would still
+    // be looking at the most recent table_info in that case.
+    TableInfo* snapshot_table_nolock() const {
+        while (true) {
+            TableInfo* ti = table_info.load();
+            *hazard_pointer = ti;
+            // If the table info has changed in the time we set the hazard
+            // pointer, ti could have been deleted, so try again.
+            if (ti != table_info.load()) {
+                continue;
+            }
+            return ti;
+        }
+    }
+
+    // snapshot_and_lock_two loads the table_info pointer and locks the buckets
+    // associated with the given hash value. It returns the table_info and the
+    // two locked buckets as a tuple. Since the positions of the bucket locks
+    // depends on the number of buckets in the table, the table_info pointer
+    // needs to be grabbed first.
+    std::tuple<TableInfo*, size_t, size_t>
+    snapshot_and_lock_two(const size_t hv) const {
+        TableInfo* ti;
+        size_t i1, i2;
+        while (true) {
+            ti = table_info.load();
+            *hazard_pointer = ti;
+            // If the table info has changed in the time we set the hazard
+            // pointer, ti could have been deleted, so try again.
+            if (ti != table_info.load()) {
+                continue;
+            }
+            i1 = index_hash(ti, hv);
+            i2 = alt_index(ti, hv, i1);
+            lock_two(ti, i1, i2);
+            // Check the table info again
+            if (ti != table_info.load()) {
+                unlock_two(ti, i1, i2);
+                continue;
+            }
+            return std::make_tuple(ti, i1, i2);
+        }
+    }
+
+    // AllUnlocker is an object which releases all the locks on the given table
+    // info when it's destructor is called.
+    class AllUnlocker {
+        TableInfo* ti_;
+    public:
+        AllUnlocker(TableInfo* ti): ti_(ti) {}
+        ~AllUnlocker() {
+            if (ti_ != nullptr) {
+                for (size_t i = 0; i < kNumLocks; ++i) {
+                    ti_->locks_[i].unlock();
+                }
+            }
+        }
+    };
+
+    // snapshot_and_lock_all is similar to snapshot_and_lock_two, except that it
+    // takes all the locks in the table.
+    TableInfo* snapshot_and_lock_all() const {
+        while (true) {
+            TableInfo* ti = table_info.load();
+            *hazard_pointer = ti;
+            // If the table info has changed, ti could have been deleted, so try
+            // again
+            if (ti != table_info.load()) {
+                continue;
+            }
+            for (size_t i = 0; i < kNumLocks; ++i) {
+                ti->locks_[i].lock();
+            }
+            // If the table info has changed, unlock the locks and try again.
+            if (ti != table_info.load()) {
+                AllUnlocker au(ti);
+                continue;
+            }
+            return ti;
+        }
+    }
+
+    // lock_ind converts an index into buckets_ to an index into locks_.
+    static inline size_t lock_ind(const size_t bucket_ind) {
+        return bucket_ind & (kNumLocks - 1);
+    }
+
+    // hashsize returns the number of buckets corresponding to a given
+    // hashpower.
+    static inline size_t hashsize(const size_t hashpower) {
+        return 1U << hashpower;
+    }
+
+    // hashmask returns the bitmask for the buckets array corresponding to a
+    // given hashpower.
+    static inline size_t hashmask(const size_t hashpower) {
+        return hashsize(hashpower) - 1;
+    }
+
+    // hashed_key hashes the given key.
+    static inline size_t hashed_key(const key_type &key) {
+        return hashfn()(key);
+    }
+
+    // index_hash returns the first possible bucket that the given hashed key
+    // could be.
+    static inline size_t index_hash(const TableInfo* ti, const size_t hv) {
+        return hv & hashmask(ti->hashpower_);
+    }
+
+    // alt_index returns the other possible bucket that the given hashed key
+    // could be. It takes the first possible bucket as a parameter. Note that
+    // this function will return the first possible bucket if index is the
+    // second possible bucket, so alt_index(ti, hv, alt_index(ti, hv,
+    // index_hash(ti, hv))) == index_hash(ti, hv).
+    static inline size_t alt_index(
+        const TableInfo* ti, const size_t hv, const size_t index) {
+        // ensure tag is nonzero for the multiply
+        const size_t tag = (hv >> ti->hashpower_) + 1;
+        // 0x5bd1e995 is the hash constant from MurmurHash2
+        return (index ^ (tag * 0x5bd1e995)) & hashmask(ti->hashpower_);
+    }
+
+    // partial_key returns a partial_t representing the upper sizeof(partial_t)
+    // bytes of the hashed key. This is used for partial-key cuckoohashing. If
+    // the key type is POD and small, we don't use partial keys, so we just
+    // return 0.
+    ENABLE_IF(static inline, is_simple, partial_t)
+        partial_key(const size_t hv) {
+        return (partial_t)(hv >> ((sizeof(size_t)-sizeof(partial_t)) * 8));
+    }
+
+    ENABLE_IF(static inline, !is_simple, partial_t) partial_key(const size_t&) {
+        return 0;
+    }
+
+    // A constexpr version of pow that we can use for static_asserts
+    static constexpr size_t const_pow(size_t a, size_t b) {
+        return (b == 0) ? 1 : a * const_pow(a, b - 1);
+    }
+
+    // The maximum number of items in a BFS path.
+    static const uint8_t MAX_BFS_PATH_LEN = 5;
+
+    // CuckooRecord holds one position in a cuckoo path.
+    typedef struct  {
+        size_t bucket;
+        size_t slot;
+        key_type key;
+    }  CuckooRecord;
+
+    // b_slot holds the information for a BFS path through the table
+    struct b_slot {
+        // The bucket of the last item in the path
+        size_t bucket;
+        // a compressed representation of the slots for each of the buckets in
+        // the path. pathcode is sort of like a base-SLOT_PER_BUCKET number, and
+        // we need to hold at most MAX_BFS_PATH_LEN slots. Thus we need the
+        // maximum pathcode to be at least SLOT_PER_BUCKET^(MAX_BFS_PATH_LEN)
+        size_t pathcode;
+        static_assert(const_pow(SLOT_PER_BUCKET, MAX_BFS_PATH_LEN) <
+                      std::numeric_limits<decltype(pathcode)>::max(),
+                      "pathcode may not be large enough to encode a cuckoo"
+                      " path");
+        // The 0-indexed position in the cuckoo path this slot occupies. It must
+        // be less than MAX_BFS_PATH_LEN, and also able to hold negative values.
+        int_fast8_t depth;
+        static_assert(MAX_BFS_PATH_LEN - 1 <=
+                      std::numeric_limits<decltype(depth)>::max(),
+                      "The depth type must able to hold a value of"
+                      " MAX_BFS_PATH_LEN - 1");
+        static_assert(-1 >= std::numeric_limits<decltype(depth)>::min(),
+                      "The depth type must be able to hold a value of -1");
+        b_slot() {}
+        b_slot(const size_t b, const size_t p, const decltype(depth) d)
+            : bucket(b), pathcode(p), depth(d) {
+            assert(d < MAX_BFS_PATH_LEN);
+        }
+    } __attribute__((__packed__));
+
+    // b_queue is the queue used to store b_slots for BFS cuckoo hashing.
+    class b_queue {
+        // The maximum size of the BFS queue. Unless it's less than
+        // SLOT_PER_BUCKET^MAX_BFS_PATH_LEN, it won't really mean anything. If
+        // it's a power of 2, then we can quickly wrap around to the beginning
+        // of the array, so we do that.
+        static const size_t MAX_CUCKOO_COUNT = 512;
+        static_assert(const_pow(SLOT_PER_BUCKET, MAX_BFS_PATH_LEN) >=
+                      MAX_CUCKOO_COUNT, "MAX_CUCKOO_COUNT value is too large"
+                      " to be useful");
+        static_assert((MAX_CUCKOO_COUNT & (MAX_CUCKOO_COUNT - 1)) == 0,
+                      "MAX_CUCKOO_COUNT should be a power of 2");
+        // A circular array of b_slots
+        b_slot slots[MAX_CUCKOO_COUNT];
+        // The index of the head of the queue in the array
+        size_t first;
+        // One past the index of the last item of the queue in the array.
+        size_t last;
+
+        // returns the index in the queue after ind, wrapping around if
+        // necessary.
+        size_t increment(size_t ind) {
+            return (ind + 1) & (MAX_CUCKOO_COUNT - 1);
+        }
+
+    public:
+        b_queue() : first(0), last(0) {}
+
+        void enqueue(b_slot x) {
+            assert(!full());
+            slots[last] = x;
+            last = increment(last);
+        }
+
+        b_slot dequeue() {
+            assert(!empty());
+            b_slot& x = slots[first];
+            first = increment(first);
+            return x;
+        }
+
+        bool empty() {
+            return first == last;
+        }
+
+        bool full() {
+            return increment(last) == first;
+        }
+    } __attribute__((__packed__));
+
+    // slot_search searches for a cuckoo path using breadth-first search. It
+    // starts with the i1 and i2 buckets, and, until it finds a bucket with an
+    // empty slot, adds each slot of the bucket in the b_slot. If the queue runs
+    // out of space, it fails.
+    static b_slot slot_search(TableInfo* ti, const size_t i1, const size_t i2) {
+        b_queue q;
+        // The initial pathcode informs cuckoopath_search which bucket the path
+        // starts on
+        q.enqueue(b_slot(i1, 0, 0));
+        q.enqueue(b_slot(i2, 1, 0));
+        while (!q.full() && !q.empty()) {
+            b_slot x = q.dequeue();
+            // Picks a (sort-of) random slot to start from
+            size_t starting_slot = x.pathcode % SLOT_PER_BUCKET;
+            for (size_t i = 0; i < SLOT_PER_BUCKET && !q.full();
+                 ++i) {
+                size_t slot = (starting_slot + i) % SLOT_PER_BUCKET;
+                lock(ti, x.bucket);
+                if (!ti->buckets_[x.bucket].occupied(slot)) {
+                    // We can terminate the search here
+                    x.pathcode = x.pathcode * SLOT_PER_BUCKET + slot;
+                    unlock(ti, x.bucket);
+                    return x;
+                }
+
+                // If x has less than the maximum number of path components,
+                // create a new b_slot item, that represents the bucket we would
+                // have come from if we kicked out the item at this slot.
+                if (x.depth < MAX_BFS_PATH_LEN - 1) {
+                    const size_t hv = hashed_key(
+                        ti->buckets_[x.bucket].key(slot));
+                    unlock(ti, x.bucket);
+                    b_slot y(alt_index(ti, hv, x.bucket),
+                             x.pathcode * SLOT_PER_BUCKET + slot, x.depth+1);
+                    q.enqueue(y);
+                }
+            }
+        }
+        // We didn't find a short-enough cuckoo path, so the queue ran out of
+        // space. Return a failure value.
+        return b_slot(0, 0, -1);
+    }
+
+    // cuckoopath_search finds a cuckoo path from one of the starting buckets to
+    // an empty slot in another bucket. It returns the depth of the discovered
+    // cuckoo path on success, and -1 on failure. Since it doesn't take locks on
+    // the buckets it searches, the data can change between this function and
+    // cuckoopath_move. Thus cuckoopath_move checks that the data matches the
+    // cuckoo path before changing it.
+    static int cuckoopath_search(TableInfo* ti, CuckooRecord* cuckoo_path,
+                                 const size_t i1, const size_t i2) {
+        b_slot x = slot_search(ti, i1, i2);
+        if (x.depth == -1) {
+            return -1;
+        }
+        // Fill in the cuckoo path slots from the end to the beginning
+        for (int i = x.depth; i >= 0; i--) {
+            cuckoo_path[i].slot = x.pathcode % SLOT_PER_BUCKET;
+            x.pathcode /= SLOT_PER_BUCKET;
+        }
+        // Fill in the cuckoo_path buckets and keys from the beginning to the
+        // end, using the final pathcode to figure out which bucket the path
+        // starts on. Since data could have been modified between slot_search
+        // and the computation of the cuckoo path, this could be an invalid
+        // cuckoo_path.
+        CuckooRecord* curr = cuckoo_path;
+        if (x.pathcode == 0) {
+            curr->bucket = i1;
+            lock(ti, curr->bucket);
+            if (!ti->buckets_[curr->bucket].occupied(curr->slot)) {
+                // We can terminate here
+                unlock(ti, curr->bucket);
+                return 0;
+            }
+            curr->key = ti->buckets_[curr->bucket].key(curr->slot);
+            unlock(ti, curr->bucket);
+        } else {
+            assert(x.pathcode == 1);
+            curr->bucket = i2;
+            lock(ti, curr->bucket);
+            if (!ti->buckets_[curr->bucket].occupied(curr->slot)) {
+                // We can terminate here
+                unlock(ti, curr->bucket);
+                return 0;
+            }
+            curr->key = ti->buckets_[curr->bucket].key(curr->slot);
+            unlock(ti, curr->bucket);
+        }
+        for (int i = 1; i <= x.depth; ++i) {
+            CuckooRecord* prev = curr++;
+            const size_t prevhv = hashed_key(prev->key);
+            assert(prev->bucket == index_hash(ti, prevhv) ||
+                   prev->bucket == alt_index(ti, prevhv, index_hash(ti,
+                                                                    prevhv)));
+            // We get the bucket that this slot is on by computing the alternate
+            // index of the previous bucket
+            curr->bucket = alt_index(ti, prevhv, prev->bucket);
+            lock(ti, curr->bucket);
+            if (!ti->buckets_[curr->bucket].occupied(curr->slot)) {
+                // We can terminate here
+                unlock(ti, curr->bucket);
+                return i;
+            }
+            curr->key = ti->buckets_[curr->bucket].key(curr->slot);
+            unlock(ti, curr->bucket);
+        }
+        return x.depth;
+    }
+
+
+    // cuckoopath_move moves keys along the given cuckoo path in order to make
+    // an empty slot in one of the buckets in cuckoo_insert. Before the start of
+    // this function, the two insert-locked buckets were unlocked in run_cuckoo.
+    // At the end of the function, if the function returns true (success), then
+    // the last bucket it looks at (which is either i1 or i2 in run_cuckoo)
+    // remains locked. If the function is unsuccessful, then both insert-locked
+    // buckets will be unlocked.
+    static bool cuckoopath_move(
+        TableInfo* ti, CuckooRecord* cuckoo_path, size_t depth,
+        const size_t i1, const size_t i2) {
+        if (depth == 0) {
+            // There is a chance that depth == 0, when try_add_to_bucket sees i1
+            // and i2 as full and cuckoopath_search finds one empty. In this
+            // case, we lock both buckets. If the bucket that cuckoopath_search
+            // found empty isn't empty anymore, we unlock them and return false.
+            // Otherwise, the bucket is empty and insertable, so we hold the
+            // locks and return true.
+            const size_t bucket = cuckoo_path[0].bucket;
+            assert(bucket == i1 || bucket == i2);
+            lock_two(ti, i1, i2);
+            if (!ti->buckets_[bucket].occupied(cuckoo_path[0].slot)) {
+                return true;
+            } else {
+                unlock_two(ti, i1, i2);
+                return false;
+            }
+        }
+
+        while (depth > 0) {
+            CuckooRecord* from = cuckoo_path + depth - 1;
+            CuckooRecord* to   = cuckoo_path + depth;
+            size_t fb = from->bucket;
+            size_t fs = from->slot;
+            size_t tb = to->bucket;
+            size_t ts = to->slot;
+
+            size_t ob = 0;
+            if (depth == 1) {
+                // Even though we are only swapping out of i1 or i2, we have to
+                // lock both of them along with the slot we are swapping to,
+                // since at the end of this function, i1 and i2 must be locked.
+                ob = (fb == i1) ? i2 : i1;
+                lock_three(ti, fb, tb, ob);
+            } else {
+                lock_two(ti, fb, tb);
+            }
+
+            // We plan to kick out fs, but let's check if it is still there;
+            // there's a small chance we've gotten scooped by a later cuckoo. If
+            // that happened, just... try again. Also the slot we are filling in
+            // may have already been filled in by another thread, or the slot we
+            // are moving from may be empty, both of which invalidate the swap.
+            if (!eqfn()(ti->buckets_[fb].key(fs), from->key) ||
+                ti->buckets_[tb].occupied(ts) ||
+                !ti->buckets_[fb].occupied(fs)) {
+                if (depth == 1) {
+                    unlock_three(ti, fb, tb, ob);
+                } else {
+                    unlock_two(ti, fb, tb);
+                }
+                return false;
+            }
+
+            if (!is_simple) {
+                ti->buckets_[tb].partial(ts) = ti->buckets_[fb].partial(fs);
+            }
+            ti->buckets_[tb].setKV(ts, ti->buckets_[fb].key(fs),
+                                   std::move(ti->buckets_[fb].val(fs)));
+            ti->buckets_[fb].eraseKV(fs);
+            if (depth == 1) {
+                // Don't unlock fb or ob, since they are needed in
+                // cuckoo_insert. Only unlock tb if it doesn't unlock the same
+                // bucket as fb or ob.
+                if (lock_ind(tb) != lock_ind(fb) &&
+                    lock_ind(tb) != lock_ind(ob)) {
+                    unlock(ti, tb);
+                }
+            } else {
+                unlock_two(ti, fb, tb);
+            }
+            depth--;
+        }
+        return true;
+    }
+
+    // run_cuckoo performs cuckoo hashing on the table in an attempt to free up
+    // a slot on either i1 or i2. On success, the bucket and slot that was freed
+    // up is stored in insert_bucket and insert_slot. In order to perform the
+    // search and the swaps, it has to unlock both i1 and i2, which can lead to
+    // certain concurrency issues, the details of which are explained in the
+    // function. If run_cuckoo returns ok (success), then the slot it freed up
+    // is still locked. Otherwise it is unlocked.
+    cuckoo_status run_cuckoo(TableInfo* ti, const size_t i1, const size_t i2,
+                             size_t &insert_bucket, size_t &insert_slot) {
+
+        CuckooRecord cuckoo_path[MAX_BFS_PATH_LEN];
+
+        // We must unlock i1 and i2 here, so that cuckoopath_search and
+        // cuckoopath_move can lock buckets as desired without deadlock.
+        // cuckoopath_move has to look at either i1 or i2 as its last slot, and
+        // it will lock both buckets and leave them locked after finishing. This
+        // way, we know that if cuckoopath_move succeeds, then the buckets
+        // needed for insertion are still locked. If cuckoopath_move fails, the
+        // buckets are unlocked and we try again. This unlocking does present
+        // two problems. The first is that another insert on the same key runs
+        // and, finding that the key isn't in the table, inserts the key into
+        // the table. Then we insert the key into the table, causing a
+        // duplication. To check for this, we search i1 and i2 for the key we
+        // are trying to insert before doing so (this is done in cuckoo_insert,
+        // and requires that both i1 and i2 are locked). Another problem is that
+        // an expansion runs and changes table_info, meaning the cuckoopath_move
+        // and cuckoo_insert would have operated on an old version of the table,
+        // so the insert would be invalid. For this, we check that ti ==
+        // table_info.load() after cuckoopath_move, signaling to the outer
+        // insert to try again if the comparison fails.
+        unlock_two(ti, i1, i2);
+
+        bool done = false;
+        while (!done) {
+            int depth = cuckoopath_search(ti, cuckoo_path, i1, i2);
+            if (depth < 0) {
+                break;
+            }
+
+            if (cuckoopath_move(ti, cuckoo_path, depth, i1, i2)) {
+                insert_bucket = cuckoo_path[0].bucket;
+                insert_slot = cuckoo_path[0].slot;
+                assert(insert_bucket == i1 || insert_bucket == i2);
+                assert(!ti->locks_[lock_ind(i1)].try_lock());
+                assert(!ti->locks_[lock_ind(i2)].try_lock());
+                assert(!ti->buckets_[insert_bucket].occupied(insert_slot));
+                done = true;
+                break;
+            }
+        }
+
+        if (!done) {
+            return failure;
+        } else if (ti != table_info.load()) {
+            // Unlock i1 and i2 and signal to cuckoo_insert to try again. Since
+            // we set the hazard pointer to be ti, this check isn't susceptible
+            // to an ABA issue, since a new pointer can't have the same address
+            // as ti.
+            unlock_two(ti, i1, i2);
+            return failure_under_expansion;
+        }
+        return ok;
+    }
+
+    // try_read_from_bucket will search the bucket for the given key and store
+    // the associated value if it finds it.
+    ENABLE_IF(static, value_copy_assignable, bool) try_read_from_bucket(
+        const TableInfo* ti, const partial_t partial,
+        const key_type &key, mapped_type &val, const size_t i) {
+        for (size_t j = 0; j < SLOT_PER_BUCKET; ++j) {
+            if (!ti->buckets_[i].occupied(j)) {
+                continue;
+            }
+            if (!is_simple && partial != ti->buckets_[i].partial(j)) {
+                continue;
+            }
+            if (eqfn()(key, ti->buckets_[i].key(j))) {
+                val = ti->buckets_[i].val(j);
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // check_in_bucket will search the bucket for the given key and return true
+    // if the key is in the bucket, and false if it isn't.
+    static bool check_in_bucket(
+        const TableInfo* ti, const partial_t partial,
+        const key_type &key, const size_t i) {
+        for (size_t j = 0; j < SLOT_PER_BUCKET; ++j) {
+            if (!ti->buckets_[i].occupied(j)) {
+                continue;
+            }
+            if (!is_simple && partial != ti->buckets_[i].partial(j)) {
+                continue;
+            }
+            if (eqfn()(key, ti->buckets_[i].key(j))) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // add_to_bucket will insert the given key-value pair into the slot.
+    template <class V>
+    static void add_to_bucket(TableInfo* ti, const partial_t partial,
+                              const key_type &key, V val,
+                              const size_t i, const size_t j) {
+        assert(!ti->buckets_[i].occupied(j));
+        if (!is_simple) {
+            ti->buckets_[i].partial(j) = partial;
+        }
+        ti->buckets_[i].setKV(j, key, std::forward<V>(val));
+        ti->num_inserts[counterid].num.fetch_add(1, std::memory_order_relaxed);
+    }
+
+    // try_find_insert_bucket will search the bucket and store the index of an
+    // empty slot if it finds one, or -1 if it doesn't. Regardless, it will
+    // search the entire bucket and return false if it finds the key already in
+    // the table (duplicate key error) and true otherwise.
+    static bool try_find_insert_bucket(
+        TableInfo* ti, const partial_t partial,
+        const key_type &key, const size_t i, int& j) {
+        j = -1;
+        bool found_empty = false;
+        for (size_t k = 0; k < SLOT_PER_BUCKET; ++k) {
+            if (ti->buckets_[i].occupied(k)) {
+                if (!is_simple && partial != ti->buckets_[i].partial(k)) {
+                    continue;
+                }
+                if (eqfn()(key, ti->buckets_[i].key(k))) {
+                    return false;
+                }
+            } else {
+                if (!found_empty) {
+                    found_empty = true;
+                    j = k;
+                }
+            }
+        }
+        return true;
+    }
+
+    // try_del_from_bucket will search the bucket for the given key, and set the
+    // slot of the key to empty if it finds it.
+    static bool try_del_from_bucket(TableInfo* ti, const partial_t partial,
+                                    const key_type &key, const size_t i) {
+        for (size_t j = 0; j < SLOT_PER_BUCKET; ++j) {
+            if (!ti->buckets_[i].occupied(j)) {
+                continue;
+            }
+            if (!is_simple && ti->buckets_[i].partial(j) != partial) {
+                continue;
+            }
+            if (eqfn()(ti->buckets_[i].key(j), key)) {
+                ti->buckets_[i].eraseKV(j);
+                ti->num_deletes[counterid].num.fetch_add(
+                    1, std::memory_order_relaxed);
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // try_update_bucket will search the bucket for the given key and change its
+    // associated value if it finds it.
+    ENABLE_IF(static, value_copy_assignable, bool) try_update_bucket(
+        TableInfo* ti, const partial_t partial,
+        const key_type &key, const mapped_type &value, const size_t i) {
+        for (size_t j = 0; j < SLOT_PER_BUCKET; ++j) {
+            if (!ti->buckets_[i].occupied(j)) {
+                continue;
+            }
+            if (!is_simple && ti->buckets_[i].partial(j) != partial) {
+                continue;
+            }
+            if (eqfn()(ti->buckets_[i].key(j), key)) {
+                ti->buckets_[i].val(j) = value;
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // try_update_bucket_fn will search the bucket for the given key and change
+    // its associated value with the given function if it finds it.
+    template <typename Updater>
+    static bool try_update_bucket_fn(
+        TableInfo* ti, const partial_t partial,
+        const key_type &key, Updater fn, const size_t i) {
+        for (size_t j = 0; j < SLOT_PER_BUCKET; ++j) {
+            if (!ti->buckets_[i].occupied(j)) {
+                continue;
+            }
+            if (!is_simple && ti->buckets_[i].partial(j) != partial) {
+                continue;
+            }
+            if (eqfn()(ti->buckets_[i].key(j), key)) {
+                fn(ti->buckets_[i].val(j));
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // cuckoo_find searches the table for the given key and value, storing the
+    // value in the val if it finds the key. It expects the locks to be taken
+    // and released outside the function.
+    ENABLE_IF(static, value_copy_assignable, cuckoo_status)
+        cuckoo_find(const key_type& key, mapped_type& val,
+                    const size_t hv, const TableInfo* ti,
+                    const size_t i1, const size_t i2) {
+        const partial_t partial = partial_key(hv);
+        if (try_read_from_bucket(ti, partial, key, val, i1)) {
+            return ok;
+        }
+        if (try_read_from_bucket(ti, partial, key, val, i2)) {
+            return ok;
+        }
+        return failure_key_not_found;
+    }
+
+    // cuckoo_contains searches the table for the given key, returning true if
+    // it's in the table and false otherwise. It expects the locks to be taken
+    // and released outside the function.
+    static bool cuckoo_contains(const key_type& key,
+                                const size_t hv, const TableInfo* ti,
+                                const size_t i1, const size_t i2) {
+        const partial_t partial = partial_key(hv);
+        if (check_in_bucket(ti, partial, key, i1)) {
+            return true;
+        }
+        if (check_in_bucket(ti, partial, key, i2)) {
+            return true;
+        }
+        return false;
+    }
+
+    // cuckoo_insert tries to insert the given key-value pair into an empty slot
+    // in i1 or i2, performing cuckoo hashing if necessary. It expects the locks
+    // to be taken outside the function, but they are released here, since
+    // different scenarios require different handling of the locks. Before
+    // inserting, it checks that the key isn't already in the table. cuckoo
+    // hashing presents multiple concurrency issues, which are explained in the
+    // function.
+    template <class V>
+    cuckoo_status cuckoo_insert(const key_type &key, V val,
+                                const size_t hv, TableInfo* ti,
+                                const size_t i1, const size_t i2) {
+        int res1, res2;
+        const partial_t partial = partial_key(hv);
+        if (!try_find_insert_bucket(ti, partial, key, i1, res1)) {
+            unlock_two(ti, i1, i2);
+            return failure_key_duplicated;
+        }
+        if (!try_find_insert_bucket(ti, partial, key, i2, res2)) {
+            unlock_two(ti, i1, i2);
+            return failure_key_duplicated;
+        }
+        if (res1 != -1) {
+            add_to_bucket(ti, partial, key, std::forward<V>(val), i1, res1);
+            unlock_two(ti, i1, i2);
+            return ok;
+        }
+        if (res2 != -1) {
+            add_to_bucket(ti, partial, key, std::forward<V>(val), i2, res2);
+            unlock_two(ti, i1, i2);
+            return ok;
+        }
+
+        // we are unlucky, so let's perform cuckoo hashing
+        size_t insert_bucket = 0;
+        size_t insert_slot = 0;
+        cuckoo_status st = run_cuckoo(ti, i1, i2, insert_bucket, insert_slot);
+        if (st == failure_under_expansion) {
+            // The run_cuckoo operation operated on an old version of the table,
+            // so we have to try again. We signal to the calling insert method
+            // to try again by returning failure_under_expansion.
+            return failure_under_expansion;
+        } else if (st == ok) {
+            assert(!ti->locks_[lock_ind(i1)].try_lock());
+            assert(!ti->locks_[lock_ind(i2)].try_lock());
+            assert(!ti->buckets_[insert_bucket].occupied(insert_slot));
+            assert(insert_bucket == index_hash(ti, hv) ||
+                   insert_bucket == alt_index(ti, hv, index_hash(ti, hv)));
+            // Since we unlocked the buckets during run_cuckoo, another insert
+            // could have inserted the same key into either i1 or i2, so we
+            // check for that before doing the insert.
+            if (cuckoo_contains(key, hv, ti, i1, i2)) {
+                unlock_two(ti, i1, i2);
+                return failure_key_duplicated;
+            }
+            add_to_bucket(ti, partial, key, std::forward<V>(val),
+                          insert_bucket, insert_slot);
+            unlock_two(ti, i1, i2);
+            return ok;
+        }
+        assert(st == failure);
+        LIBCUCKOO_DBG("hash table is full (hashpower = %zu, hash_items = %zu,"
+                      "load factor = %.2f), need to increase hashpower\n",
+                      ti->hashpower_, cuckoo_size(ti), cuckoo_loadfactor(ti));
+        return failure_table_full;
+    }
+
+    // We run cuckoo_insert in a loop until it succeeds in insert and upsert, so
+    // we pulled out the loop to avoid duplicating it. This should be called
+    // directly after snapshot_and_lock_two, and by the end of the function, the
+    // hazard pointer will have been unset.
+    template <class V>
+    bool cuckoo_insert_loop(const key_type& key, V val,
+                            size_t hv, TableInfo* ti, size_t i1, size_t i2) {
+        cuckoo_status st = cuckoo_insert(key, std::forward<V>(val),
+                                         hv, ti, i1, i2);
+        while (st != ok) {
+            // If the insert failed with failure_key_duplicated, it returns here
+            if (st == failure_key_duplicated) {
+                return false;
+            }
+            // If it failed with failure_under_expansion, the insert operated on
+            // an old version of the table, so we just try again. If it's
+            // failure_table_full, we have to expand the table before trying
+            // again.
+            if (st == failure_table_full) {
+                if (cuckoo_expand_simple(ti->hashpower_+1) ==
+                    failure_under_expansion) {
+                    LIBCUCKOO_DBG("expansion is on-going\n");
+                }
+            }
+            std::tie(ti, i1, i2) = snapshot_and_lock_two(hv);
+            st = cuckoo_insert(key, std::forward<V>(val), hv, ti, i1, i2);
+        }
+        return true;
+    }
+
+    // cuckoo_delete searches the table for the given key and sets the slot with
+    // that key to empty if it finds it. It expects the locks to be taken and
+    // released outside the function.
+    cuckoo_status cuckoo_delete(const key_type &key, const size_t hv,
+                                TableInfo* ti, const size_t i1,
+                                const size_t i2) {
+        const partial_t partial = partial_key(hv);
+        if (try_del_from_bucket(ti, partial, key, i1)) {
+            return ok;
+        }
+        if (try_del_from_bucket(ti, partial, key, i2)) {
+            return ok;
+        }
+        return failure_key_not_found;
+    }
+
+    // cuckoo_update searches the table for the given key and updates its value
+    // if it finds it. It expects the locks to be taken and released outside the
+    // function.
+    ENABLE_IF(, value_copy_assignable, cuckoo_status)
+    cuckoo_update(const key_type &key, const mapped_type &val,
+                                const size_t hv, TableInfo* ti,
+                                const size_t i1, const size_t i2) {
+        const partial_t partial = partial_key(hv);
+        if (try_update_bucket(ti, partial, key, val, i1)) {
+            return ok;
+        }
+        if (try_update_bucket(ti, partial, key, val, i2)) {
+            return ok;
+        }
+        return failure_key_not_found;
+    }
+
+    // cuckoo_update_fn searches the table for the given key and runs the given
+    // function on its value if it finds it, assigning the result of the
+    // function to the value. It expects the locks to be taken and released
+    // outside the function.
+    template <typename Updater>
+    cuckoo_status cuckoo_update_fn(const key_type &key, Updater fn,
+                                   const size_t hv, TableInfo* ti,
+                                   const size_t i1, const size_t i2) {
+        const partial_t partial = partial_key(hv);
+        if (try_update_bucket_fn(ti, partial, key, fn, i1)) {
+            return ok;
+        }
+        if (try_update_bucket_fn(ti, partial, key, fn, i2)) {
+            return ok;
+        }
+        return failure_key_not_found;
+    }
+
+    // cuckoo_init initializes the hashtable, given an initial hashpower as the
+    // argument.
+    cuckoo_status cuckoo_init(const size_t hashpower) {
+        table_info.store(new TableInfo(hashpower));
+        cuckoo_clear(table_info.load());
+        return ok;
+    }
+
+    // cuckoo_clear empties the table, calling the destructors of all the
+    // elements it removes from the table. It assumes the locks are taken as
+    // necessary.
+    cuckoo_status cuckoo_clear(TableInfo* ti) {
+        const size_t num_buckets = ti->buckets_.size();
+        ti->buckets_.clear();
+        ti->buckets_.resize(num_buckets);
+        for (size_t i = 0; i < ti->num_inserts.size(); ++i) {
+            ti->num_inserts[i].num.store(0);
+            ti->num_deletes[i].num.store(0);
+        }
+        return ok;
+    }
+
+    // cuckoo_size returns the number of elements in the given table.
+    size_t cuckoo_size(const TableInfo* ti) const {
+        size_t inserts = 0;
+        size_t deletes = 0;
+        for (size_t i = 0; i < ti->num_inserts.size(); ++i) {
+            inserts += ti->num_inserts[i].num.load();
+            deletes += ti->num_deletes[i].num.load();
+        }
+        return inserts-deletes;
+    }
+
+    // cuckoo_loadfactor returns the load factor of the given table.
+    double cuckoo_loadfactor(const TableInfo* ti) const {
+        return static_cast<double>(cuckoo_size(ti)) / SLOT_PER_BUCKET /
+            hashsize(ti->hashpower_);
+    }
+
+    // insert_into_table is a helper function used by cuckoo_expand_simple to
+    // fill up the new table.
+    static void insert_into_table(
+        cuckoohash_map<Key, T, Hash, Pred>& new_map, const TableInfo* old_ti,
+        size_t i, size_t end) {
+        for (;i < end; ++i) {
+            for (size_t j = 0; j < SLOT_PER_BUCKET; ++j) {
+                if (old_ti->buckets_[i].occupied(j)) {
+                    new_map.insert(
+                        old_ti->buckets_[i].key(j),
+                        std::move((mapped_type&)old_ti->buckets_[i].val(j)));
+                }
+            }
+        }
+    }
+
+    // cuckoo_expand_simple is a simpler version of expansion than
+    // cuckoo_expand, which will double the size of the existing hash table. It
+    // needs to take all the bucket locks, since no other operations can change
+    // the table during expansion. If some other thread is holding the expansion
+    // thread at the time, then it will return failure_under_expansion.
+    cuckoo_status cuckoo_expand_simple(size_t n) {
+        TableInfo* ti = snapshot_and_lock_all();
+        assert(ti == table_info.load());
+        AllUnlocker au(ti);
+        HazardPointerUnsetter hpu;
+        if (n <= ti->hashpower_) {
+            // Most likely another expansion ran before this one could grab the
+            // locks
+            return failure_under_expansion;
+        }
+
+        // Creates a new hash table with hashpower n and adds all the
+        // elements from the old buckets
+        cuckoohash_map<Key, T, Hash, Pred> new_map(hashsize(n) * SLOT_PER_BUCKET);
+        const size_t threadnum = kNumCores();
+        const size_t buckets_per_thread =
+            hashsize(ti->hashpower_) / threadnum;
+        std::vector<std::thread> insertion_threads(threadnum);
+        for (size_t i = 0; i < threadnum-1; ++i) {
+            insertion_threads[i] = std::thread(
+                insert_into_table, std::ref(new_map),
+                ti, i*buckets_per_thread, (i+1)*buckets_per_thread);
+        }
+        insertion_threads[threadnum-1] = std::thread(
+            insert_into_table, std::ref(new_map), ti,
+            (threadnum-1)*buckets_per_thread, hashsize(ti->hashpower_));
+        for (size_t i = 0; i < threadnum; ++i) {
+            insertion_threads[i].join();
+        }
+        // Sets this table_info to new_map's. It then sets new_map's
+        // table_info to nullptr, so that it doesn't get deleted when
+        // new_map goes out of scope
+        table_info.store(new_map.table_info.load());
+        new_map.table_info.store(nullptr);
+
+        // Rather than deleting ti now, we store it in old_table_infos. We then
+        // run a delete_unused routine to delete all the old table pointers.
+        old_table_infos.push_back(std::move(std::unique_ptr<TableInfo>(ti)));
+        global_hazard_pointers.delete_unused(old_table_infos);
+        return ok;
+    }
+
+    // Iterator definitions
+    friend class const_iterator;
+    friend class iterator;
+
+public:
+    //! A const_iterator is an iterator through the table that is thread safe.
+    //! For the duration of its existence, it takes all the locks on the table
+    //! it is given, thereby ensuring that no other threads can modify the table
+    //! while the iterator is in use. Note that this also means that only one
+    //! iterator can be active on a table at one time and furthermore that all
+    //! operations on the table, except the \ref size, \ref empty, \ref
+    //! hashpower, \ref bucket_count, and \ref load_factor methods, will stall
+    //! until the iterator loses its lock. For this reason, we suggest using the
+    //! \ref snapshot_table method if possible, since it is less error-prone.
+    //! The iterator allows movement forward and backward through the table as
+    //! well as dereferencing items in the table. It maintains the invariant
+    //! that the iterator is either an end iterator (which points past the end
+    //! of the table), or points to a filled slot. As soon as the iterator
+    //! looses its lock on the table, all dereference and movement operations
+    //! will throw an exception.
+    class const_iterator {
+        // The constructor locks the entire table, retrying until
+        // snapshot_and_lock_all succeeds. Then it calculates end_pos and
+        // begin_pos and sets index and slot to the beginning or end of the
+        // table, based on the boolean argument. We keep this constructor
+        // private (but expose it to the cuckoohash_map class), since we don't
+        // want users calling it.
+        const_iterator(const cuckoohash_map<Key, T, Hash, Pred>& hm,
+                       bool is_end) : hm_(hm) {
+            cuckoohash_map<Key, T, Hash, Pred>::check_hazard_pointer();
+            ti_ = hm_.snapshot_and_lock_all();
+            assert(ti_ == hm_.table_info.load());
+
+            has_table_lock = true;
+
+            index_ = slot_ = 0;
+
+            set_end(end_pos.first, end_pos.second);
+            set_begin(begin_pos.first, begin_pos.second);
+            if (is_end) {
+                index_ = end_pos.first;
+                slot_ = end_pos.second;
+            } else {
+                index_ = begin_pos.first;
+                slot_ = begin_pos.second;
+            }
+        }
+
+        friend class cuckoohash_map<Key, T, Hash, Pred>;
+
+    public:
+        //! This is an rvalue-reference constructor that takes the lock from \p
+        //! it and copies its state. To create an iterator from scratch, call
+        //! the \ref cbegin or \ref cend methods of cuckoohash_map.
+        const_iterator(const_iterator&& it)
+            : hm_(it.hm_) {
+            if (this == &it) {
+                return;
+            }
+            memcpy(this, &it, sizeof(const_iterator));
+            it.has_table_lock = false;
+        }
+
+        //! The assignment operator behaves identically to the rvalue-reference
+        //! constructor.
+        const_iterator* operator=(const_iterator&& it) {
+            if (this == &it) {
+                return this;
+            }
+            memcpy(this, &it, sizeof(const_iterator));
+            it.has_table_lock = false;
+            return this;
+        }
+
+        //! release unlocks the table, thereby freeing it up for other
+        //! operations, but also invalidating all future operations with this
+        //! iterator.
+        void release() {
+            if (has_table_lock) {
+                AllUnlocker au(ti_);
+                cuckoohash_map<Key, T, Hash, Pred>::HazardPointerUnsetter hpu;
+                has_table_lock = false;
+            }
+        }
+
+        //! The destructor simply calls \ref release.
+        ~const_iterator() {
+            release();
+        }
+
+        //! is_end returns true if the iterator is at end_pos, which means it is
+        //! past the end of the table.
+        bool is_end() const {
+            return (index_ == end_pos.first && slot_ == end_pos.second);
+        }
+
+        //! is_begin returns true if the iterator is at begin_pos, which means
+        //! it is at the first item in the table.
+        bool is_begin() const {
+            return (index_ == begin_pos.first && slot_ == begin_pos.second);
+        }
+
+    protected:
+        // For the arrow dereference operator, we return a pointer to a
+        // lightweight pair consisting of const references to the key and value
+        // under the iterator.
+        typedef std::pair<const Key&, const T&> ref_pair;
+        // Since we can't initialize a ref_pair before knowing what it points
+        // to, we use std::aligned_storage to reserve unititialized space for
+        // the object, which we then construct with placement new. Since this
+        // isn't really part of the logical iterator state, we make it mutable.
+        mutable typename std::aligned_storage<sizeof(ref_pair),
+                                              alignof(ref_pair)>::type data;
+
+    public:
+        //! The dereference operator returns a value_type copied from the
+        //! key-value pair under the iterator.
+        value_type operator*() const {
+            check_lock();
+            if (is_end()) {
+                throw end_dereference;
+            }
+            assert(ti_->buckets_[index_].occupied(slot_));
+            return {ti_->buckets_[index_].key(slot_),
+                    ti_->buckets_[index_].val(slot_)};
+        }
+
+        //! The arrow dereference operator returns a pointer to an internal
+        //! std::pair which contains const references to the key and value
+        //! under the iterator.
+        ref_pair* operator->() const {
+            check_lock();
+            if (is_end()) {
+                throw end_dereference;
+            }
+            assert(ti_->buckets_[index_].occupied(slot_));
+            ref_pair* data_ptr =
+                static_cast<ref_pair*>(static_cast<void*>(&data));
+            new (data_ptr) ref_pair(ti_->buckets_[index_].key(slot_),
+                                    ti_->buckets_[index_].val(slot_));
+            return data_ptr;
+        }
+
+        //! The prefix increment operator moves the iterator forwards to the
+        //! next nonempty slot. If it reaches the end of the table, it becomes
+        //! an end iterator. It throws an exception if the iterator is already
+        //! at the end of the table.
+        const_iterator* operator++() {
+            check_lock();
+            if (is_end()) {
+                throw end_increment;
+            }
+            forward_filled_slot(index_, slot_);
+            return this;
+        }
+
+        //! The postfix increment operator behaves identically to the prefix
+        //! increment operator.
+        const_iterator* operator++(int) {
+            check_lock();
+            if (is_end()) {
+                throw end_increment;
+            }
+            forward_filled_slot(index_, slot_);
+            return this;
+        }
+
+        //! The prefix decrement operator moves the iterator backwards to the
+        //! previous nonempty slot. If we aren't at the beginning, then the
+        //! backward_filled_slot operation should not fail. If we are, it throws
+        //! an exception.
+        const_iterator* operator--() {
+            check_lock();
+            if (is_begin()) {
+                throw begin_decrement;
+            }
+            backward_filled_slot(index_, slot_);
+            return this;
+        }
+
+        //! The postfix decrement operator behaves identically to the prefix
+        //! decrement operator.
+        const_iterator* operator--(int) {
+            check_lock();
+            if (is_begin()) {
+                throw begin_decrement;
+            }
+            backward_filled_slot(index_, slot_);
+            return this;
+        }
+
+    protected:
+        // A pointer to the associated hashmap
+        const cuckoohash_map<Key, T, Hash, Pred>& hm_;
+
+        // The hashmap's table info
+        typename cuckoohash_map<Key, T, Hash, Pred>::TableInfo* ti_;
+
+        // Indicates whether the iterator has the table lock
+        bool has_table_lock;
+
+        // Stores the bucket and slot of the end iterator, which is one past the
+        // end of the table. It is initialized during the iterator's
+        // constructor.
+        std::pair<size_t, size_t> end_pos;
+
+        // Stotres the bucket and slot of the begin iterator, which is the first
+        // filled position in the table. It is initialized during the iterator's
+        // constructor. If the table is empty, it points past the end of the
+        // table, to the same position as end_pos.
+        std::pair<size_t, size_t> begin_pos;
+
+        // The bucket index of the item being pointed to
+        size_t index_;
+
+        // The slot in the bucket of the item being pointed to
+        size_t slot_;
+
+        // set_end sets the given index and slot to one past the last position
+        // in the table.
+        void set_end(size_t& index, size_t& slot) {
+            index = hm_.bucket_count();
+            slot = 0;
+        }
+
+        // set_begin sets the given pair to the position of the first element in
+        // the table.
+        void set_begin(size_t& index, size_t& slot) {
+            if (hm_.empty()) {
+                set_end(index, slot);
+            } else {
+                index = slot = 0;
+                // There must be a filled slot somewhere in the table
+                if (!ti_->buckets_[index].occupied(slot)) {
+                    forward_filled_slot(index, slot);
+                    assert(!is_end());
+                }
+            }
+        }
+
+        // forward_slot moves the given index and slot to the next available
+        // slot in the forwards direction. It returns true if it successfully
+        // advances, and false if it has reached the end of the table, in which
+        // case it sets index and slot to end_pos.
+        bool forward_slot(size_t& index, size_t& slot) {
+            if (slot < SLOT_PER_BUCKET-1) {
+                ++slot;
+                return true;
+            } else if (index < hm_.bucket_count()-1) {
+                ++index;
+                slot = 0;
+                return true;
+            } else {
+                set_end(index, slot);
+                return false;
+            }
+        }
+
+        // backward_slot moves index and slot to the next available slot in the
+        // backwards direction. It returns true if it successfully advances, and
+        // false if it has reached the beginning of the table, setting the index
+        // and slot back to begin_pos.
+        bool backward_slot(size_t& index, size_t& slot) {
+            if (slot > 0) {
+                --slot;
+                return true;
+            } else if (index > 0) {
+                --index;
+                slot = SLOT_PER_BUCKET-1;
+                return true;
+            } else {
+                set_begin(index, slot);
+                return false;
+            }
+        }
+
+        // forward_filled_slot moves index and slot to the next filled slot.
+        bool forward_filled_slot(size_t& index, size_t& slot) {
+            bool res = forward_slot(index, slot);
+            if (!res) {
+                return false;
+            }
+            while (!ti_->buckets_[index].occupied(slot)) {
+                res = forward_slot(index, slot);
+                if (!res) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        // backward_filled_slot moves index and slot to the previous filled
+        // slot.
+        bool backward_filled_slot(size_t& index, size_t& slot) {
+            bool res = backward_slot(index, slot);
+            if (!res) {
+                return false;
+            }
+            while (!ti_->buckets_[index].occupied(slot)) {
+                res = backward_slot(index, slot);
+                if (!res) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+
+        // check_lock throws an exception if the iterator doesn't have a
+        // lock.
+        void check_lock() const {
+            if (!has_table_lock) {
+                throw std::runtime_error(
+                    "Iterator does not have a lock on the table");
+            }
+        }
+
+        // Other error messages
+        static const std::out_of_range end_dereference;
+        static const std::out_of_range end_increment;
+        static const std::out_of_range begin_decrement;
+    };
+
+
+    //! An iterator supports the same operations as the const_iterator and
+    //! provides an additional \ref set_value method to allow changing values in
+    //! the table.
+    class iterator : public const_iterator {
+        // This constructor does the same thing as the private const_iterator
+        // one.
+        iterator(cuckoohash_map<Key, T, Hash, Pred>& hm, bool is_end)
+            : const_iterator(hm, is_end) {}
+
+        friend class cuckoohash_map<Key, T, Hash, Pred>;
+
+    public:
+        //! This constructor is identical to the rvalue-reference constructor of
+        //! const_iterator.
+        iterator(iterator&& it)
+            : const_iterator(std::move(it)) {}
+
+        //! This constructor allows converting from a const_iterator to an
+        //! iterator.
+        iterator(const_iterator&& it)
+            : const_iterator(std::move(it)) {}
+
+        // The assignment operator behaves identically to the rvalue-reference
+        // constructor.
+        iterator* operator=(iterator&& it) {
+            if (this == &it) {
+                return this;
+            }
+            memcpy(this, &it, sizeof(iterator));
+            it.has_table_lock = false;
+            return this;
+        }
+
+        //! set_value sets the value pointed to by the iterator to \p val. This
+        //! involves modifying the hash table itself, but since we have a lock
+        //! on the table, we are okay. We are only changing the value in the
+        //! bucket, so the element will retain it's position in the table.
+        void set_value(const mapped_type val) {
+            this->check_lock();
+            if (this->is_end()) {
+                throw this->end_dereference;
+            }
+            assert(this->ti_->buckets_[this->index_].occupied(this->slot_));
+            this->ti_->buckets_[this->index_].val(this->slot_) = val;
+        }
+    };
+
+// Public iterator functions
+public:
+    //! cbegin returns a const_iterator to the first filled slot in the
+    //! table.
+    const_iterator cbegin() const {
+        return const_iterator(*this, false);
+    }
+
+    //! cend returns a const_iterator set past the end of the table.
+    const_iterator cend() const {
+        return const_iterator(*this, true);
+    }
+
+    //! begin returns an iterator to the first filled slot in the table.
+    iterator begin() {
+        return iterator(*this, false);
+    }
+
+    //! end returns an iterator set past the end of the table.
+    iterator end() {
+        return iterator(*this, true);
+    }
+
+    //! snapshot_table allocates a vector and, using a const_iterator stores all
+    //! the elements currently in the table.
+    std::vector<value_type> snapshot_table() const {
+        std::vector<value_type> items;
+        items.reserve(size());
+        for (auto it = cbegin(); !it.is_end(); ++it) {
+            items.push_back(*it);
+        }
+        return items;
+    }
+};
+
+// Initializing the static members
+template <class Key, class T, class Hash, class Pred>
+    __thread typename cuckoohash_map<Key, T, Hash, Pred>::TableInfo**
+    cuckoohash_map<Key, T, Hash, Pred>::hazard_pointer = nullptr;
+
+template <class Key, class T, class Hash, class Pred>
+    __thread int cuckoohash_map<Key, T, Hash, Pred>::counterid = -1;
+
+template <class Key, class T, class Hash, class Pred>
+    typename cuckoohash_map<Key, T, Hash, Pred>::GlobalHazardPointerList
+    cuckoohash_map<Key, T, Hash, Pred>::global_hazard_pointers;
+
+template <class Key, class T, class Hash, class Pred>
+    const std::out_of_range
+    cuckoohash_map<Key, T, Hash, Pred>::const_iterator::end_dereference(
+        "Cannot dereference: iterator points past the end of the table");
+
+template <class Key, class T, class Hash, class Pred>
+    const std::out_of_range
+    cuckoohash_map<Key, T, Hash, Pred>::const_iterator::end_increment(
+        "Cannot increment: iterator points past the end of the table");
+
+template <class Key, class T, class Hash, class Pred>
+    const std::out_of_range
+    cuckoohash_map<Key, T, Hash, Pred>::const_iterator::begin_decrement(
+        "Cannot decrement: iterator points to the beginning of the table");
+
+template <class Key, class T, class Hash, class Pred>
+std::allocator<Key> cuckoohash_map<Key, T, Hash, Pred>::Bucket::key_allocator;
+
+template <class Key, class T, class Hash, class Pred>
+std::allocator<T> cuckoohash_map<Key, T, Hash, Pred>::Bucket::value_allocator;
+
+#endif // _CUCKOOHASH_MAP_HH
diff --git a/include/cuckoohash_util.h b/include/cuckoohash_util.h
new file mode 100644
index 0000000..79a6343
--- /dev/null
+++ b/include/cuckoohash_util.h
@@ -0,0 +1,23 @@
+#ifndef _CUCKOOHASH_UTIL_H
+#define _CUCKOOHASH_UTIL_H
+
+#include <pthread.h>
+#include "cuckoohash_config.h" // for LIBCUCKOO_DEBUG
+
+#if LIBCUCKOO_DEBUG
+#  define LIBCUCKOO_DBG(fmt, args...)                                   \
+     fprintf(stderr, "\x1b[32m""[libcuckoo:%s:%d:%lu] " fmt"" "\x1b[0m", \
+             __FILE__,__LINE__, (unsigned long)pthread_self(), ##args)
+#else
+#  define LIBCUCKOO_DBG(fmt, args...)  do {} while (0)
+#endif
+
+// For enabling certain methods based on a condition. Here's an example.
+// ENABLE_IF(some_cond, type, static, inline) method() {
+//     ...
+// }
+#define ENABLE_IF(preamble, condition, return_type)                     \
+     template <class Bogus=void*>                                       \
+          preamble typename std::enable_if<sizeof(Bogus) &&             \
+          condition, return_type>::type
+#endif // _CUCKOOHASH_UTIL_H
diff --git a/include/eigen3/Eigen/Array b/include/eigen3/Eigen/Array
new file mode 100644
index 0000000..3d004fb
--- /dev/null
+++ b/include/eigen3/Eigen/Array
@@ -0,0 +1,11 @@
+#ifndef EIGEN_ARRAY_MODULE_H
+#define EIGEN_ARRAY_MODULE_H
+
+// include Core first to handle Eigen2 support macros
+#include "Core"
+
+#ifndef EIGEN2_SUPPORT
+  #error The Eigen/Array header does no longer exist in Eigen3. All that functionality has moved to Eigen/Core.
+#endif
+
+#endif // EIGEN_ARRAY_MODULE_H
diff --git a/include/eigen3/Eigen/Cholesky b/include/eigen3/Eigen/Cholesky
new file mode 100644
index 0000000..f727f5d
--- /dev/null
+++ b/include/eigen3/Eigen/Cholesky
@@ -0,0 +1,32 @@
+#ifndef EIGEN_CHOLESKY_MODULE_H
+#define EIGEN_CHOLESKY_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup Cholesky_Module Cholesky module
+  *
+  *
+  *
+  * This module provides two variants of the Cholesky decomposition for selfadjoint (hermitian) matrices.
+  * Those decompositions are accessible via the following MatrixBase methods:
+  *  - MatrixBase::llt(),
+  *  - MatrixBase::ldlt()
+  *
+  * \code
+  * #include <Eigen/Cholesky>
+  * \endcode
+  */
+
+#include "src/misc/Solve.h"
+#include "src/Cholesky/LLT.h"
+#include "src/Cholesky/LDLT.h"
+#ifdef EIGEN_USE_LAPACKE
+#include "src/Cholesky/LLT_MKL.h"
+#endif
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_CHOLESKY_MODULE_H
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/include/eigen3/Eigen/CholmodSupport b/include/eigen3/Eigen/CholmodSupport
new file mode 100644
index 0000000..745b884
--- /dev/null
+++ b/include/eigen3/Eigen/CholmodSupport
@@ -0,0 +1,45 @@
+#ifndef EIGEN_CHOLMODSUPPORT_MODULE_H
+#define EIGEN_CHOLMODSUPPORT_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+extern "C" {
+  #include <cholmod.h>
+}
+
+/** \ingroup Support_modules
+  * \defgroup CholmodSupport_Module CholmodSupport module
+  *
+  * This module provides an interface to the Cholmod library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  * It provides the two following main factorization classes:
+  * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization.
+  * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial).
+  *
+  * For the sake of completeness, this module also propose the two following classes:
+  * - class CholmodSimplicialLLT
+  * - class CholmodSimplicialLDLT
+  * Note that these classes does not bring any particular advantage compared to the built-in
+  * SimplicialLLT and SimplicialLDLT factorization classes.
+  *
+  * \code
+  * #include <Eigen/CholmodSupport>
+  * \endcode
+  *
+  * In order to use this module, the cholmod headers must be accessible from the include paths, and your binary must be linked to the cholmod library and its dependencies.
+  * The dependencies depend on how cholmod has been compiled.
+  * For a cmake based project, you can use our FindCholmod.cmake module to help you in this task.
+  *
+  */
+
+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+
+#include "src/CholmodSupport/CholmodSupport.h"
+
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_CHOLMODSUPPORT_MODULE_H
+
diff --git a/include/eigen3/Eigen/Core b/include/eigen3/Eigen/Core
new file mode 100644
index 0000000..c87f99d
--- /dev/null
+++ b/include/eigen3/Eigen/Core
@@ -0,0 +1,376 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2007-2011 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CORE_H
+#define EIGEN_CORE_H
+
+// first thing Eigen does: stop the compiler from committing suicide
+#include "src/Core/util/DisableStupidWarnings.h"
+
+// then include this file where all our macros are defined. It's really important to do it first because
+// it's where we do all the alignment settings (platform detection and honoring the user's will if he
+// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
+#include "src/Core/util/Macros.h"
+
+// Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
+// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
+#if defined(__MINGW32__) && EIGEN_GNUC_AT_LEAST(4,6)
+  #pragma GCC optimize ("-fno-ipa-cp-clone")
+#endif
+
+#include <complex>
+
+// this include file manages BLAS and MKL related macros
+// and inclusion of their respective header files
+#include "src/Core/util/MKL_support.h"
+
+// if alignment is disabled, then disable vectorization. Note: EIGEN_ALIGN is the proper check, it takes into
+// account both the user's will (EIGEN_DONT_ALIGN) and our own platform checks
+#if !EIGEN_ALIGN
+  #ifndef EIGEN_DONT_VECTORIZE
+    #define EIGEN_DONT_VECTORIZE
+  #endif
+#endif
+
+#ifdef _MSC_VER
+  #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
+  #if (_MSC_VER >= 1500) // 2008 or later
+    // Remember that usage of defined() in a #define is undefined by the standard.
+    // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
+    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || defined(_M_X64)
+      #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
+    #endif
+  #endif
+#else
+  // Remember that usage of defined() in a #define is undefined by the standard
+  #if (defined __SSE2__) && ( (!defined __GNUC__) || (defined __INTEL_COMPILER) || EIGEN_GNUC_AT_LEAST(4,2) )
+    #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
+  #endif
+#endif
+
+#ifndef EIGEN_DONT_VECTORIZE
+
+  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
+
+    // Defines symbols for compile-time detection of which instructions are
+    // used.
+    // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_SSE
+    #define EIGEN_VECTORIZE_SSE2
+
+    // Detect sse3/ssse3/sse4:
+    // gcc and icc defines __SSE3__, ...
+    // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
+    // want to force the use of those instructions with msvc.
+    #ifdef __SSE3__
+      #define EIGEN_VECTORIZE_SSE3
+    #endif
+    #ifdef __SSSE3__
+      #define EIGEN_VECTORIZE_SSSE3
+    #endif
+    #ifdef __SSE4_1__
+      #define EIGEN_VECTORIZE_SSE4_1
+    #endif
+    #ifdef __SSE4_2__
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+
+    // include files
+
+    // This extern "C" works around a MINGW-w64 compilation issue
+    // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
+    // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
+    // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
+    // with conflicting linkage.  The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
+    // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
+    // notice that since these are C headers, the extern "C" is theoretically needed anyways.
+    extern "C" {
+      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
+      // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
+      #if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1110
+        #include <immintrin.h>
+      #else
+        #include <emmintrin.h>
+        #include <xmmintrin.h>
+        #ifdef  EIGEN_VECTORIZE_SSE3
+        #include <pmmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSSE3
+        #include <tmmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSE4_1
+        #include <smmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSE4_2
+        #include <nmmintrin.h>
+        #endif
+      #endif
+    } // end extern "C"
+  #elif defined __ALTIVEC__
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_ALTIVEC
+    #include <altivec.h>
+    // We need to #undef all these ugly tokens defined in <altivec.h>
+    // => use __vector instead of vector
+    #undef bool
+    #undef vector
+    #undef pixel
+  #elif defined  __ARM_NEON__
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_NEON
+    #include <arm_neon.h>
+  #endif
+#endif
+
+#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
+  #define EIGEN_HAS_OPENMP
+#endif
+
+#ifdef EIGEN_HAS_OPENMP
+#include <omp.h>
+#endif
+
+// MSVC for windows mobile does not have the errno.h file
+#if !(defined(_MSC_VER) && defined(_WIN32_WCE)) && !defined(__ARMCC_VERSION)
+#define EIGEN_HAS_ERRNO
+#endif
+
+#ifdef EIGEN_HAS_ERRNO
+#include <cerrno>
+#endif
+#include <cstddef>
+#include <cstdlib>
+#include <cmath>
+#include <cassert>
+#include <functional>
+#include <iosfwd>
+#include <cstring>
+#include <string>
+#include <limits>
+#include <climits> // for CHAR_BIT
+// for min/max:
+#include <algorithm>
+
+// for outputting debug info
+#ifdef EIGEN_DEBUG_ASSIGN
+#include <iostream>
+#endif
+
+// required for __cpuid, needs to be included after cmath
+#if defined(_MSC_VER) && (defined(_M_IX86)||defined(_M_X64)) && (!defined(_WIN32_WCE))
+  #include <intrin.h>
+#endif
+
+#if defined(_CPPUNWIND) || defined(__EXCEPTIONS)
+  #define EIGEN_EXCEPTIONS
+#endif
+
+#ifdef EIGEN_EXCEPTIONS
+  #include <new>
+#endif
+
+/** \brief Namespace containing all symbols from the %Eigen library. */
+namespace Eigen {
+
+inline static const char *SimdInstructionSetsInUse(void) {
+#if defined(EIGEN_VECTORIZE_SSE4_2)
+  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_1)
+  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
+#elif defined(EIGEN_VECTORIZE_SSSE3)
+  return "SSE, SSE2, SSE3, SSSE3";
+#elif defined(EIGEN_VECTORIZE_SSE3)
+  return "SSE, SSE2, SSE3";
+#elif defined(EIGEN_VECTORIZE_SSE2)
+  return "SSE, SSE2";
+#elif defined(EIGEN_VECTORIZE_ALTIVEC)
+  return "AltiVec";
+#elif defined(EIGEN_VECTORIZE_NEON)
+  return "ARM NEON";
+#else
+  return "None";
+#endif
+}
+
+} // end namespace Eigen
+
+#define STAGE10_FULL_EIGEN2_API             10
+#define STAGE20_RESOLVE_API_CONFLICTS       20
+#define STAGE30_FULL_EIGEN3_API             30
+#define STAGE40_FULL_EIGEN3_STRICTNESS      40
+#define STAGE99_NO_EIGEN2_SUPPORT           99
+
+#if   defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS
+  #define EIGEN2_SUPPORT
+  #define EIGEN2_SUPPORT_STAGE STAGE40_FULL_EIGEN3_STRICTNESS
+#elif defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
+  #define EIGEN2_SUPPORT
+  #define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
+#elif defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS
+  #define EIGEN2_SUPPORT
+  #define EIGEN2_SUPPORT_STAGE STAGE20_RESOLVE_API_CONFLICTS
+#elif defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API
+  #define EIGEN2_SUPPORT
+  #define EIGEN2_SUPPORT_STAGE STAGE10_FULL_EIGEN2_API
+#elif defined EIGEN2_SUPPORT
+  // default to stage 3, that's what it's always meant
+  #define EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
+  #define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
+#else
+  #define EIGEN2_SUPPORT_STAGE STAGE99_NO_EIGEN2_SUPPORT
+#endif
+
+#ifdef EIGEN2_SUPPORT
+#undef minor
+#endif
+
+// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
+// ensure QNX/QCC support
+using std::size_t;
+// gcc 4.6.0 wants std:: for ptrdiff_t 
+using std::ptrdiff_t;
+
+/** \defgroup Core_Module Core module
+  * This is the main module of Eigen providing dense matrix and vector support
+  * (both fixed and dynamic size) with all the features corresponding to a BLAS library
+  * and much more...
+  *
+  * \code
+  * #include <Eigen/Core>
+  * \endcode
+  */
+
+#include "src/Core/util/Constants.h"
+#include "src/Core/util/ForwardDeclarations.h"
+#include "src/Core/util/Meta.h"
+#include "src/Core/util/StaticAssert.h"
+#include "src/Core/util/XprHelper.h"
+#include "src/Core/util/Memory.h"
+
+#include "src/Core/NumTraits.h"
+#include "src/Core/MathFunctions.h"
+#include "src/Core/GenericPacketMath.h"
+
+#if defined EIGEN_VECTORIZE_SSE
+  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/SSE/Complex.h"
+#elif defined EIGEN_VECTORIZE_ALTIVEC
+  #include "src/Core/arch/AltiVec/PacketMath.h"
+  #include "src/Core/arch/AltiVec/Complex.h"
+#elif defined EIGEN_VECTORIZE_NEON
+  #include "src/Core/arch/NEON/PacketMath.h"
+  #include "src/Core/arch/NEON/Complex.h"
+#endif
+
+#include "src/Core/arch/Default/Settings.h"
+
+#include "src/Core/Functors.h"
+#include "src/Core/DenseCoeffsBase.h"
+#include "src/Core/DenseBase.h"
+#include "src/Core/MatrixBase.h"
+#include "src/Core/EigenBase.h"
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN // work around Doxygen bug triggered by Assign.h r814874
+                                // at least confirmed with Doxygen 1.5.5 and 1.5.6
+  #include "src/Core/Assign.h"
+#endif
+
+#include "src/Core/util/BlasUtil.h"
+#include "src/Core/DenseStorage.h"
+#include "src/Core/NestByValue.h"
+#include "src/Core/ForceAlignedAccess.h"
+#include "src/Core/ReturnByValue.h"
+#include "src/Core/NoAlias.h"
+#include "src/Core/PlainObjectBase.h"
+#include "src/Core/Matrix.h"
+#include "src/Core/Array.h"
+#include "src/Core/CwiseBinaryOp.h"
+#include "src/Core/CwiseUnaryOp.h"
+#include "src/Core/CwiseNullaryOp.h"
+#include "src/Core/CwiseUnaryView.h"
+#include "src/Core/SelfCwiseBinaryOp.h"
+#include "src/Core/Dot.h"
+#include "src/Core/StableNorm.h"
+#include "src/Core/MapBase.h"
+#include "src/Core/Stride.h"
+#include "src/Core/Map.h"
+#include "src/Core/Block.h"
+#include "src/Core/VectorBlock.h"
+#include "src/Core/Ref.h"
+#include "src/Core/Transpose.h"
+#include "src/Core/DiagonalMatrix.h"
+#include "src/Core/Diagonal.h"
+#include "src/Core/DiagonalProduct.h"
+#include "src/Core/PermutationMatrix.h"
+#include "src/Core/Transpositions.h"
+#include "src/Core/Redux.h"
+#include "src/Core/Visitor.h"
+#include "src/Core/Fuzzy.h"
+#include "src/Core/IO.h"
+#include "src/Core/Swap.h"
+#include "src/Core/CommaInitializer.h"
+#include "src/Core/Flagged.h"
+#include "src/Core/ProductBase.h"
+#include "src/Core/GeneralProduct.h"
+#include "src/Core/TriangularMatrix.h"
+#include "src/Core/SelfAdjointView.h"
+#include "src/Core/products/GeneralBlockPanelKernel.h"
+#include "src/Core/products/Parallelizer.h"
+#include "src/Core/products/CoeffBasedProduct.h"
+#include "src/Core/products/GeneralMatrixVector.h"
+#include "src/Core/products/GeneralMatrixMatrix.h"
+#include "src/Core/SolveTriangular.h"
+#include "src/Core/products/GeneralMatrixMatrixTriangular.h"
+#include "src/Core/products/SelfadjointMatrixVector.h"
+#include "src/Core/products/SelfadjointMatrixMatrix.h"
+#include "src/Core/products/SelfadjointProduct.h"
+#include "src/Core/products/SelfadjointRank2Update.h"
+#include "src/Core/products/TriangularMatrixVector.h"
+#include "src/Core/products/TriangularMatrixMatrix.h"
+#include "src/Core/products/TriangularSolverMatrix.h"
+#include "src/Core/products/TriangularSolverVector.h"
+#include "src/Core/BandMatrix.h"
+#include "src/Core/CoreIterators.h"
+
+#include "src/Core/BooleanRedux.h"
+#include "src/Core/Select.h"
+#include "src/Core/VectorwiseOp.h"
+#include "src/Core/Random.h"
+#include "src/Core/Replicate.h"
+#include "src/Core/Reverse.h"
+#include "src/Core/ArrayBase.h"
+#include "src/Core/ArrayWrapper.h"
+
+#ifdef EIGEN_USE_BLAS
+#include "src/Core/products/GeneralMatrixMatrix_MKL.h"
+#include "src/Core/products/GeneralMatrixVector_MKL.h"
+#include "src/Core/products/GeneralMatrixMatrixTriangular_MKL.h"
+#include "src/Core/products/SelfadjointMatrixMatrix_MKL.h"
+#include "src/Core/products/SelfadjointMatrixVector_MKL.h"
+#include "src/Core/products/TriangularMatrixMatrix_MKL.h"
+#include "src/Core/products/TriangularMatrixVector_MKL.h"
+#include "src/Core/products/TriangularSolverMatrix_MKL.h"
+#endif // EIGEN_USE_BLAS
+
+#ifdef EIGEN_USE_MKL_VML
+#include "src/Core/Assign_MKL.h"
+#endif
+
+#include "src/Core/GlobalFunctions.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#ifdef EIGEN2_SUPPORT
+#include "Eigen2Support"
+#endif
+
+#endif // EIGEN_CORE_H
diff --git a/include/eigen3/Eigen/Dense b/include/eigen3/Eigen/Dense
new file mode 100644
index 0000000..5768910
--- /dev/null
+++ b/include/eigen3/Eigen/Dense
@@ -0,0 +1,7 @@
+#include "Core"
+#include "LU"
+#include "Cholesky"
+#include "QR"
+#include "SVD"
+#include "Geometry"
+#include "Eigenvalues"
diff --git a/include/eigen3/Eigen/Eigen b/include/eigen3/Eigen/Eigen
new file mode 100644
index 0000000..19b40ea
--- /dev/null
+++ b/include/eigen3/Eigen/Eigen
@@ -0,0 +1,2 @@
+#include "Dense"
+//#include "Sparse"
diff --git a/include/eigen3/Eigen/Eigen2Support b/include/eigen3/Eigen/Eigen2Support
new file mode 100644
index 0000000..6aa009d
--- /dev/null
+++ b/include/eigen3/Eigen/Eigen2Support
@@ -0,0 +1,95 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN2SUPPORT_H
+#define EIGEN2SUPPORT_H
+
+#if (!defined(EIGEN2_SUPPORT)) || (!defined(EIGEN_CORE_H))
+#error Eigen2 support must be enabled by defining EIGEN2_SUPPORT before including any Eigen header
+#endif
+
+#ifndef EIGEN_NO_EIGEN2_DEPRECATED_WARNING
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+#warning "Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3. (Define EIGEN_NO_EIGEN2_DEPRECATED_WARNING to disable this warning)"
+#else
+#pragma message ("Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3. (Define EIGEN_NO_EIGEN2_DEPRECATED_WARNING to disable this warning)")
+#endif
+
+#endif // EIGEN_NO_EIGEN2_DEPRECATED_WARNING
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \ingroup Support_modules
+  * \defgroup Eigen2Support_Module Eigen2 support module
+  *
+  * \warning Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3.
+  *
+  * This module provides a couple of deprecated functions improving the compatibility with Eigen2.
+  * 
+  * To use it, define EIGEN2_SUPPORT before including any Eigen header
+  * \code
+  * #define EIGEN2_SUPPORT
+  * \endcode
+  *
+  */
+
+#include "src/Eigen2Support/Macros.h"
+#include "src/Eigen2Support/Memory.h"
+#include "src/Eigen2Support/Meta.h"
+#include "src/Eigen2Support/Lazy.h"
+#include "src/Eigen2Support/Cwise.h"
+#include "src/Eigen2Support/CwiseOperators.h"
+#include "src/Eigen2Support/TriangularSolver.h"
+#include "src/Eigen2Support/Block.h"
+#include "src/Eigen2Support/VectorBlock.h"
+#include "src/Eigen2Support/Minor.h"
+#include "src/Eigen2Support/MathFunctions.h"
+
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+// Eigen2 used to include iostream
+#include<iostream>
+
+#define EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
+using Eigen::Matrix##SizeSuffix##TypeSuffix; \
+using Eigen::Vector##SizeSuffix##TypeSuffix; \
+using Eigen::RowVector##SizeSuffix##TypeSuffix;
+
+#define EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(TypeSuffix) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 2) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 3) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 4) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, X) \
+
+#define EIGEN_USING_MATRIX_TYPEDEFS \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(i) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(f) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(d) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(cf) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(cd)
+
+#define USING_PART_OF_NAMESPACE_EIGEN \
+EIGEN_USING_MATRIX_TYPEDEFS \
+using Eigen::Matrix; \
+using Eigen::MatrixBase; \
+using Eigen::ei_random; \
+using Eigen::ei_real; \
+using Eigen::ei_imag; \
+using Eigen::ei_conj; \
+using Eigen::ei_abs; \
+using Eigen::ei_abs2; \
+using Eigen::ei_sqrt; \
+using Eigen::ei_exp; \
+using Eigen::ei_log; \
+using Eigen::ei_sin; \
+using Eigen::ei_cos;
+
+#endif // EIGEN2SUPPORT_H
diff --git a/include/eigen3/Eigen/Eigenvalues b/include/eigen3/Eigen/Eigenvalues
new file mode 100644
index 0000000..53c5a73
--- /dev/null
+++ b/include/eigen3/Eigen/Eigenvalues
@@ -0,0 +1,48 @@
+#ifndef EIGEN_EIGENVALUES_MODULE_H
+#define EIGEN_EIGENVALUES_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+#include "Cholesky"
+#include "Jacobi"
+#include "Householder"
+#include "LU"
+#include "Geometry"
+
+/** \defgroup Eigenvalues_Module Eigenvalues module
+  *
+  *
+  *
+  * This module mainly provides various eigenvalue solvers.
+  * This module also provides some MatrixBase methods, including:
+  *  - MatrixBase::eigenvalues(),
+  *  - MatrixBase::operatorNorm()
+  *
+  * \code
+  * #include <Eigen/Eigenvalues>
+  * \endcode
+  */
+
+#include "src/Eigenvalues/Tridiagonalization.h"
+#include "src/Eigenvalues/RealSchur.h"
+#include "src/Eigenvalues/EigenSolver.h"
+#include "src/Eigenvalues/SelfAdjointEigenSolver.h"
+#include "src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h"
+#include "src/Eigenvalues/HessenbergDecomposition.h"
+#include "src/Eigenvalues/ComplexSchur.h"
+#include "src/Eigenvalues/ComplexEigenSolver.h"
+#include "src/Eigenvalues/RealQZ.h"
+#include "src/Eigenvalues/GeneralizedEigenSolver.h"
+#include "src/Eigenvalues/MatrixBaseEigenvalues.h"
+#ifdef EIGEN_USE_LAPACKE
+#include "src/Eigenvalues/RealSchur_MKL.h"
+#include "src/Eigenvalues/ComplexSchur_MKL.h"
+#include "src/Eigenvalues/SelfAdjointEigenSolver_MKL.h"
+#endif
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_EIGENVALUES_MODULE_H
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/include/eigen3/Eigen/Geometry b/include/eigen3/Eigen/Geometry
new file mode 100644
index 0000000..efd9d45
--- /dev/null
+++ b/include/eigen3/Eigen/Geometry
@@ -0,0 +1,63 @@
+#ifndef EIGEN_GEOMETRY_MODULE_H
+#define EIGEN_GEOMETRY_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+#include "SVD"
+#include "LU"
+#include <limits>
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+/** \defgroup Geometry_Module Geometry module
+  *
+  *
+  *
+  * This module provides support for:
+  *  - fixed-size homogeneous transformations
+  *  - translation, scaling, 2D and 3D rotations
+  *  - quaternions
+  *  - \ref MatrixBase::cross() "cross product"
+  *  - \ref MatrixBase::unitOrthogonal() "orthognal vector generation"
+  *  - some linear components: parametrized-lines and hyperplanes
+  *
+  * \code
+  * #include <Eigen/Geometry>
+  * \endcode
+  */
+
+#include "src/Geometry/OrthoMethods.h"
+#include "src/Geometry/EulerAngles.h"
+
+#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
+  #include "src/Geometry/Homogeneous.h"
+  #include "src/Geometry/RotationBase.h"
+  #include "src/Geometry/Rotation2D.h"
+  #include "src/Geometry/Quaternion.h"
+  #include "src/Geometry/AngleAxis.h"
+  #include "src/Geometry/Transform.h"
+  #include "src/Geometry/Translation.h"
+  #include "src/Geometry/Scaling.h"
+  #include "src/Geometry/Hyperplane.h"
+  #include "src/Geometry/ParametrizedLine.h"
+  #include "src/Geometry/AlignedBox.h"
+  #include "src/Geometry/Umeyama.h"
+
+  #if defined EIGEN_VECTORIZE_SSE
+    #include "src/Geometry/arch/Geometry_SSE.h"
+  #endif
+#endif
+
+#ifdef EIGEN2_SUPPORT
+#include "src/Eigen2Support/Geometry/All.h"
+#endif
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_GEOMETRY_MODULE_H
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */
+
diff --git a/include/eigen3/Eigen/Householder b/include/eigen3/Eigen/Householder
new file mode 100644
index 0000000..6e348db
--- /dev/null
+++ b/include/eigen3/Eigen/Householder
@@ -0,0 +1,23 @@
+#ifndef EIGEN_HOUSEHOLDER_MODULE_H
+#define EIGEN_HOUSEHOLDER_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup Householder_Module Householder module
+  * This module provides Householder transformations.
+  *
+  * \code
+  * #include <Eigen/Householder>
+  * \endcode
+  */
+
+#include "src/Householder/Householder.h"
+#include "src/Householder/HouseholderSequence.h"
+#include "src/Householder/BlockHouseholder.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_HOUSEHOLDER_MODULE_H
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/include/eigen3/Eigen/IterativeLinearSolvers b/include/eigen3/Eigen/IterativeLinearSolvers
new file mode 100644
index 0000000..0f4159d
--- /dev/null
+++ b/include/eigen3/Eigen/IterativeLinearSolvers
@@ -0,0 +1,40 @@
+#ifndef EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
+#define EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
+
+#include "SparseCore"
+#include "OrderingMethods"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** 
+  * \defgroup IterativeLinearSolvers_Module IterativeLinearSolvers module
+  *
+  * This module currently provides iterative methods to solve problems of the form \c A \c x = \c b, where \c A is a squared matrix, usually very large and sparse.
+  * Those solvers are accessible via the following classes:
+  *  - ConjugateGradient for selfadjoint (hermitian) matrices,
+  *  - BiCGSTAB for general square matrices.
+  *
+  * These iterative solvers are associated with some preconditioners:
+  *  - IdentityPreconditioner - not really useful
+  *  - DiagonalPreconditioner - also called JAcobi preconditioner, work very well on diagonal dominant matrices.
+  *  - IncompleteILUT - incomplete LU factorization with dual thresholding
+  *
+  * Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport.
+  *
+  * \code
+  * #include <Eigen/IterativeLinearSolvers>
+  * \endcode
+  */
+
+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+
+#include "src/IterativeLinearSolvers/IterativeSolverBase.h"
+#include "src/IterativeLinearSolvers/BasicPreconditioners.h"
+#include "src/IterativeLinearSolvers/ConjugateGradient.h"
+#include "src/IterativeLinearSolvers/BiCGSTAB.h"
+#include "src/IterativeLinearSolvers/IncompleteLUT.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
diff --git a/include/eigen3/Eigen/Jacobi b/include/eigen3/Eigen/Jacobi
new file mode 100644
index 0000000..ba8a4dc
--- /dev/null
+++ b/include/eigen3/Eigen/Jacobi
@@ -0,0 +1,26 @@
+#ifndef EIGEN_JACOBI_MODULE_H
+#define EIGEN_JACOBI_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup Jacobi_Module Jacobi module
+  * This module provides Jacobi and Givens rotations.
+  *
+  * \code
+  * #include <Eigen/Jacobi>
+  * \endcode
+  *
+  * In addition to listed classes, it defines the two following MatrixBase methods to apply a Jacobi or Givens rotation:
+  *  - MatrixBase::applyOnTheLeft()
+  *  - MatrixBase::applyOnTheRight().
+  */
+
+#include "src/Jacobi/Jacobi.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_JACOBI_MODULE_H
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */
+
diff --git a/include/eigen3/Eigen/LU b/include/eigen3/Eigen/LU
new file mode 100644
index 0000000..db57955
--- /dev/null
+++ b/include/eigen3/Eigen/LU
@@ -0,0 +1,41 @@
+#ifndef EIGEN_LU_MODULE_H
+#define EIGEN_LU_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup LU_Module LU module
+  * This module includes %LU decomposition and related notions such as matrix inversion and determinant.
+  * This module defines the following MatrixBase methods:
+  *  - MatrixBase::inverse()
+  *  - MatrixBase::determinant()
+  *
+  * \code
+  * #include <Eigen/LU>
+  * \endcode
+  */
+
+#include "src/misc/Solve.h"
+#include "src/misc/Kernel.h"
+#include "src/misc/Image.h"
+#include "src/LU/FullPivLU.h"
+#include "src/LU/PartialPivLU.h"
+#ifdef EIGEN_USE_LAPACKE
+#include "src/LU/PartialPivLU_MKL.h"
+#endif
+#include "src/LU/Determinant.h"
+#include "src/LU/Inverse.h"
+
+#if defined EIGEN_VECTORIZE_SSE
+  #include "src/LU/arch/Inverse_SSE.h"
+#endif
+
+#ifdef EIGEN2_SUPPORT
+  #include "src/Eigen2Support/LU.h"
+#endif
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_LU_MODULE_H
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/include/eigen3/Eigen/LeastSquares b/include/eigen3/Eigen/LeastSquares
new file mode 100644
index 0000000..35137c2
--- /dev/null
+++ b/include/eigen3/Eigen/LeastSquares
@@ -0,0 +1,32 @@
+#ifndef EIGEN_REGRESSION_MODULE_H
+#define EIGEN_REGRESSION_MODULE_H
+
+#ifndef EIGEN2_SUPPORT
+#error LeastSquares is only available in Eigen2 support mode (define EIGEN2_SUPPORT)
+#endif
+
+// exclude from normal eigen3-only documentation
+#ifdef EIGEN2_SUPPORT
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+#include "Eigenvalues"
+#include "Geometry"
+
+/** \defgroup LeastSquares_Module LeastSquares module
+  * This module provides linear regression and related features.
+  *
+  * \code
+  * #include <Eigen/LeastSquares>
+  * \endcode
+  */
+
+#include "src/Eigen2Support/LeastSquares.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN2_SUPPORT
+
+#endif // EIGEN_REGRESSION_MODULE_H
diff --git a/include/eigen3/Eigen/MetisSupport b/include/eigen3/Eigen/MetisSupport
new file mode 100644
index 0000000..6a113f7
--- /dev/null
+++ b/include/eigen3/Eigen/MetisSupport
@@ -0,0 +1,28 @@
+#ifndef EIGEN_METISSUPPORT_MODULE_H
+#define EIGEN_METISSUPPORT_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+extern "C" {
+#include <metis.h>
+}
+
+
+/** \ingroup Support_modules
+  * \defgroup MetisSupport_Module MetisSupport module
+  *
+  * \code
+  * #include <Eigen/MetisSupport>
+  * \endcode
+  * This module defines an interface to the METIS reordering package (http://glaros.dtc.umn.edu/gkhome/views/metis). 
+  * It can be used just as any other built-in method as explained in \link OrderingMethods_Module here. \endlink
+  */
+
+
+#include "src/MetisSupport/MetisSupport.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_METISSUPPORT_MODULE_H
diff --git a/include/eigen3/Eigen/OrderingMethods b/include/eigen3/Eigen/OrderingMethods
new file mode 100644
index 0000000..7c0f1ff
--- /dev/null
+++ b/include/eigen3/Eigen/OrderingMethods
@@ -0,0 +1,66 @@
+#ifndef EIGEN_ORDERINGMETHODS_MODULE_H
+#define EIGEN_ORDERINGMETHODS_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** 
+  * \defgroup OrderingMethods_Module OrderingMethods module
+  *
+  * This module is currently for internal use only
+  * 
+  * It defines various built-in and external ordering methods for sparse matrices. 
+  * They are typically used to reduce the number of elements during 
+  * the sparse matrix decomposition (LLT, LU, QR).
+  * Precisely, in a preprocessing step, a permutation matrix P is computed using 
+  * those ordering methods and applied to the columns of the matrix. 
+  * Using for instance the sparse Cholesky decomposition, it is expected that 
+  * the nonzeros elements in LLT(A*P) will be much smaller than that in LLT(A).
+  * 
+  * 
+  * Usage : 
+  * \code
+  * #include <Eigen/OrderingMethods>
+  * \endcode
+  * 
+  * A simple usage is as a template parameter in the sparse decomposition classes : 
+  * 
+  * \code 
+  * SparseLU<MatrixType, COLAMDOrdering<int> > solver;
+  * \endcode 
+  * 
+  * \code 
+  * SparseQR<MatrixType, COLAMDOrdering<int> > solver;
+  * \endcode
+  * 
+  * It is possible as well to call directly a particular ordering method for your own purpose, 
+  * \code 
+  * AMDOrdering<int> ordering;
+  * PermutationMatrix<Dynamic, Dynamic, int> perm;
+  * SparseMatrix<double> A; 
+  * //Fill the matrix ...
+  * 
+  * ordering(A, perm); // Call AMD
+  * \endcode
+  * 
+  * \note Some of these methods (like AMD or METIS), need the sparsity pattern 
+  * of the input matrix to be symmetric. When the matrix is structurally unsymmetric, 
+  * Eigen computes internally the pattern of \f$A^T*A\f$ before calling the method.
+  * If your matrix is already symmetric (at leat in structure), you can avoid that
+  * by calling the method with a SelfAdjointView type.
+  * 
+  * \code
+  *  // Call the ordering on the pattern of the lower triangular matrix A
+  * ordering(A.selfadjointView<Lower>(), perm);
+  * \endcode
+  */
+
+#ifndef EIGEN_MPL2_ONLY
+#include "src/OrderingMethods/Amd.h"
+#endif
+
+#include "src/OrderingMethods/Ordering.h"
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_ORDERINGMETHODS_MODULE_H
diff --git a/include/eigen3/Eigen/PaStiXSupport b/include/eigen3/Eigen/PaStiXSupport
new file mode 100644
index 0000000..7c616ee
--- /dev/null
+++ b/include/eigen3/Eigen/PaStiXSupport
@@ -0,0 +1,46 @@
+#ifndef EIGEN_PASTIXSUPPORT_MODULE_H
+#define EIGEN_PASTIXSUPPORT_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+#include <complex.h>
+extern "C" {
+#include <pastix_nompi.h>
+#include <pastix.h>
+}
+
+#ifdef complex
+#undef complex
+#endif
+
+/** \ingroup Support_modules
+  * \defgroup PaStiXSupport_Module PaStiXSupport module
+  * 
+  * This module provides an interface to the <a href="http://pastix.gforge.inria.fr/">PaSTiX</a> library.
+  * PaSTiX is a general \b supernodal, \b parallel and \b opensource sparse solver.
+  * It provides the two following main factorization classes:
+  * - class PastixLLT : a supernodal, parallel LLt Cholesky factorization.
+  * - class PastixLDLT: a supernodal, parallel LDLt Cholesky factorization.
+  * - class PastixLU : a supernodal, parallel LU factorization (optimized for a symmetric pattern).
+  * 
+  * \code
+  * #include <Eigen/PaStiXSupport>
+  * \endcode
+  *
+  * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies.
+  * The dependencies depend on how PaSTiX has been compiled.
+  * For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task.
+  *
+  */
+
+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+
+#include "src/PaStiXSupport/PaStiXSupport.h"
+
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_PASTIXSUPPORT_MODULE_H
diff --git a/include/eigen3/Eigen/PardisoSupport b/include/eigen3/Eigen/PardisoSupport
new file mode 100644
index 0000000..99330ce
--- /dev/null
+++ b/include/eigen3/Eigen/PardisoSupport
@@ -0,0 +1,30 @@
+#ifndef EIGEN_PARDISOSUPPORT_MODULE_H
+#define EIGEN_PARDISOSUPPORT_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+#include <mkl_pardiso.h>
+
+#include <unsupported/Eigen/SparseExtra>
+
+/** \ingroup Support_modules
+  * \defgroup PardisoSupport_Module PardisoSupport module
+  *
+  * This module brings support for the Intel(R) MKL PARDISO direct sparse solvers.
+  *
+  * \code
+  * #include <Eigen/PardisoSupport>
+  * \endcode
+  *
+  * In order to use this module, the MKL headers must be accessible from the include paths, and your binary must be linked to the MKL library and its dependencies.
+  * See this \ref TopicUsingIntelMKL "page" for more information on MKL-Eigen integration.
+  * 
+  */
+
+#include "src/PardisoSupport/PardisoSupport.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_PARDISOSUPPORT_MODULE_H
diff --git a/include/eigen3/Eigen/QR b/include/eigen3/Eigen/QR
new file mode 100644
index 0000000..ac5b026
--- /dev/null
+++ b/include/eigen3/Eigen/QR
@@ -0,0 +1,45 @@
+#ifndef EIGEN_QR_MODULE_H
+#define EIGEN_QR_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+#include "Cholesky"
+#include "Jacobi"
+#include "Householder"
+
+/** \defgroup QR_Module QR module
+  *
+  *
+  *
+  * This module provides various QR decompositions
+  * This module also provides some MatrixBase methods, including:
+  *  - MatrixBase::qr(),
+  *
+  * \code
+  * #include <Eigen/QR>
+  * \endcode
+  */
+
+#include "src/misc/Solve.h"
+#include "src/QR/HouseholderQR.h"
+#include "src/QR/FullPivHouseholderQR.h"
+#include "src/QR/ColPivHouseholderQR.h"
+#ifdef EIGEN_USE_LAPACKE
+#include "src/QR/HouseholderQR_MKL.h"
+#include "src/QR/ColPivHouseholderQR_MKL.h"
+#endif
+
+#ifdef EIGEN2_SUPPORT
+#include "src/Eigen2Support/QR.h"
+#endif
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#ifdef EIGEN2_SUPPORT
+#include "Eigenvalues"
+#endif
+
+#endif // EIGEN_QR_MODULE_H
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/include/eigen3/Eigen/QtAlignedMalloc b/include/eigen3/Eigen/QtAlignedMalloc
new file mode 100644
index 0000000..46f7d83
--- /dev/null
+++ b/include/eigen3/Eigen/QtAlignedMalloc
@@ -0,0 +1,34 @@
+
+#ifndef EIGEN_QTMALLOC_MODULE_H
+#define EIGEN_QTMALLOC_MODULE_H
+
+#include "Core"
+
+#if (!EIGEN_MALLOC_ALREADY_ALIGNED)
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+void *qMalloc(size_t size)
+{
+  return Eigen::internal::aligned_malloc(size);
+}
+
+void qFree(void *ptr)
+{
+  Eigen::internal::aligned_free(ptr);
+}
+
+void *qRealloc(void *ptr, size_t size)
+{
+  void* newPtr = Eigen::internal::aligned_malloc(size);
+  memcpy(newPtr, ptr, size);
+  Eigen::internal::aligned_free(ptr);
+  return newPtr;
+}
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif
+
+#endif // EIGEN_QTMALLOC_MODULE_H
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/include/eigen3/Eigen/SPQRSupport b/include/eigen3/Eigen/SPQRSupport
new file mode 100644
index 0000000..7701644
--- /dev/null
+++ b/include/eigen3/Eigen/SPQRSupport
@@ -0,0 +1,29 @@
+#ifndef EIGEN_SPQRSUPPORT_MODULE_H
+#define EIGEN_SPQRSUPPORT_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+#include "SuiteSparseQR.hpp"
+
+/** \ingroup Support_modules
+  * \defgroup SPQRSupport_Module SuiteSparseQR module
+  * 
+  * This module provides an interface to the SPQR library, which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  *
+  * \code
+  * #include <Eigen/SPQRSupport>
+  * \endcode
+  *
+  * In order to use this module, the SPQR headers must be accessible from the include paths, and your binary must be linked to the SPQR library and its dependencies (Cholmod, AMD, COLAMD,...).
+  * For a cmake based project, you can use our FindSPQR.cmake and FindCholmod.Cmake modules
+  *
+  */
+
+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+#include "src/CholmodSupport/CholmodSupport.h"
+#include "src/SPQRSupport/SuiteSparseQRSupport.h"
+
+#endif
diff --git a/include/eigen3/Eigen/SVD b/include/eigen3/Eigen/SVD
new file mode 100644
index 0000000..fd31001
--- /dev/null
+++ b/include/eigen3/Eigen/SVD
@@ -0,0 +1,37 @@
+#ifndef EIGEN_SVD_MODULE_H
+#define EIGEN_SVD_MODULE_H
+
+#include "QR"
+#include "Householder"
+#include "Jacobi"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup SVD_Module SVD module
+  *
+  *
+  *
+  * This module provides SVD decomposition for matrices (both real and complex).
+  * This decomposition is accessible via the following MatrixBase method:
+  *  - MatrixBase::jacobiSvd()
+  *
+  * \code
+  * #include <Eigen/SVD>
+  * \endcode
+  */
+
+#include "src/misc/Solve.h"
+#include "src/SVD/JacobiSVD.h"
+#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
+#include "src/SVD/JacobiSVD_MKL.h"
+#endif
+#include "src/SVD/UpperBidiagonalization.h"
+
+#ifdef EIGEN2_SUPPORT
+#include "src/Eigen2Support/SVD.h"
+#endif
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SVD_MODULE_H
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/include/eigen3/Eigen/Sparse b/include/eigen3/Eigen/Sparse
new file mode 100644
index 0000000..7cc9c09
--- /dev/null
+++ b/include/eigen3/Eigen/Sparse
@@ -0,0 +1,27 @@
+#ifndef EIGEN_SPARSE_MODULE_H
+#define EIGEN_SPARSE_MODULE_H
+
+/** \defgroup Sparse_Module Sparse meta-module
+  *
+  * Meta-module including all related modules:
+  * - \ref SparseCore_Module
+  * - \ref OrderingMethods_Module
+  * - \ref SparseCholesky_Module
+  * - \ref SparseLU_Module
+  * - \ref SparseQR_Module
+  * - \ref IterativeLinearSolvers_Module
+  *
+  * \code
+  * #include <Eigen/Sparse>
+  * \endcode
+  */
+
+#include "SparseCore"
+#include "OrderingMethods"
+#include "SparseCholesky"
+#include "SparseLU"
+#include "SparseQR"
+#include "IterativeLinearSolvers"
+
+#endif // EIGEN_SPARSE_MODULE_H
+
diff --git a/include/eigen3/Eigen/SparseCholesky b/include/eigen3/Eigen/SparseCholesky
new file mode 100644
index 0000000..9f5056a
--- /dev/null
+++ b/include/eigen3/Eigen/SparseCholesky
@@ -0,0 +1,47 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2013 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSECHOLESKY_MODULE_H
+#define EIGEN_SPARSECHOLESKY_MODULE_H
+
+#include "SparseCore"
+#include "OrderingMethods"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** 
+  * \defgroup SparseCholesky_Module SparseCholesky module
+  *
+  * This module currently provides two variants of the direct sparse Cholesky decomposition for selfadjoint (hermitian) matrices.
+  * Those decompositions are accessible via the following classes:
+  *  - SimplicialLLt,
+  *  - SimplicialLDLt
+  *
+  * Such problems can also be solved using the ConjugateGradient solver from the IterativeLinearSolvers module.
+  *
+  * \code
+  * #include <Eigen/SparseCholesky>
+  * \endcode
+  */
+
+#ifdef EIGEN_MPL2_ONLY
+#error The SparseCholesky module has nothing to offer in MPL2 only mode
+#endif
+
+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+#include "src/SparseCholesky/SimplicialCholesky.h"
+
+#ifndef EIGEN_MPL2_ONLY
+#include "src/SparseCholesky/SimplicialCholesky_impl.h"
+#endif
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SPARSECHOLESKY_MODULE_H
diff --git a/include/eigen3/Eigen/SparseCore b/include/eigen3/Eigen/SparseCore
new file mode 100644
index 0000000..9b5be5e
--- /dev/null
+++ b/include/eigen3/Eigen/SparseCore
@@ -0,0 +1,64 @@
+#ifndef EIGEN_SPARSECORE_MODULE_H
+#define EIGEN_SPARSECORE_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+#include <vector>
+#include <map>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+
+/** 
+  * \defgroup SparseCore_Module SparseCore module
+  *
+  * This module provides a sparse matrix representation, and basic associatd matrix manipulations
+  * and operations.
+  *
+  * See the \ref TutorialSparse "Sparse tutorial"
+  *
+  * \code
+  * #include <Eigen/SparseCore>
+  * \endcode
+  *
+  * This module depends on: Core.
+  */
+
+namespace Eigen {
+
+/** The type used to identify a general sparse storage. */
+struct Sparse {};
+
+}
+
+#include "src/SparseCore/SparseUtil.h"
+#include "src/SparseCore/SparseMatrixBase.h"
+#include "src/SparseCore/CompressedStorage.h"
+#include "src/SparseCore/AmbiVector.h"
+#include "src/SparseCore/SparseMatrix.h"
+#include "src/SparseCore/MappedSparseMatrix.h"
+#include "src/SparseCore/SparseVector.h"
+#include "src/SparseCore/SparseBlock.h"
+#include "src/SparseCore/SparseTranspose.h"
+#include "src/SparseCore/SparseCwiseUnaryOp.h"
+#include "src/SparseCore/SparseCwiseBinaryOp.h"
+#include "src/SparseCore/SparseDot.h"
+#include "src/SparseCore/SparsePermutation.h"
+#include "src/SparseCore/SparseRedux.h"
+#include "src/SparseCore/SparseFuzzy.h"
+#include "src/SparseCore/ConservativeSparseSparseProduct.h"
+#include "src/SparseCore/SparseSparseProductWithPruning.h"
+#include "src/SparseCore/SparseProduct.h"
+#include "src/SparseCore/SparseDenseProduct.h"
+#include "src/SparseCore/SparseDiagonalProduct.h"
+#include "src/SparseCore/SparseTriangularView.h"
+#include "src/SparseCore/SparseSelfAdjointView.h"
+#include "src/SparseCore/TriangularSolver.h"
+#include "src/SparseCore/SparseView.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SPARSECORE_MODULE_H
+
diff --git a/include/eigen3/Eigen/SparseLU b/include/eigen3/Eigen/SparseLU
new file mode 100644
index 0000000..8527a49
--- /dev/null
+++ b/include/eigen3/Eigen/SparseLU
@@ -0,0 +1,49 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSELU_MODULE_H
+#define EIGEN_SPARSELU_MODULE_H
+
+#include "SparseCore"
+
+/** 
+  * \defgroup SparseLU_Module SparseLU module
+  * This module defines a supernodal factorization of general sparse matrices.
+  * The code is fully optimized for supernode-panel updates with specialized kernels.
+  * Please, see the documentation of the SparseLU class for more details.
+  */
+
+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+
+// Ordering interface
+#include "OrderingMethods"
+
+#include "src/SparseLU/SparseLU_gemm_kernel.h"
+
+#include "src/SparseLU/SparseLU_Structs.h"
+#include "src/SparseLU/SparseLU_SupernodalMatrix.h"
+#include "src/SparseLU/SparseLUImpl.h"
+#include "src/SparseCore/SparseColEtree.h"
+#include "src/SparseLU/SparseLU_Memory.h"
+#include "src/SparseLU/SparseLU_heap_relax_snode.h"
+#include "src/SparseLU/SparseLU_relax_snode.h"
+#include "src/SparseLU/SparseLU_pivotL.h"
+#include "src/SparseLU/SparseLU_panel_dfs.h"
+#include "src/SparseLU/SparseLU_kernel_bmod.h"
+#include "src/SparseLU/SparseLU_panel_bmod.h"
+#include "src/SparseLU/SparseLU_column_dfs.h"
+#include "src/SparseLU/SparseLU_column_bmod.h"
+#include "src/SparseLU/SparseLU_copy_to_ucol.h"
+#include "src/SparseLU/SparseLU_pruneL.h"
+#include "src/SparseLU/SparseLU_Utils.h"
+#include "src/SparseLU/SparseLU.h"
+
+#endif // EIGEN_SPARSELU_MODULE_H
diff --git a/include/eigen3/Eigen/SparseQR b/include/eigen3/Eigen/SparseQR
new file mode 100644
index 0000000..4ee4206
--- /dev/null
+++ b/include/eigen3/Eigen/SparseQR
@@ -0,0 +1,33 @@
+#ifndef EIGEN_SPARSEQR_MODULE_H
+#define EIGEN_SPARSEQR_MODULE_H
+
+#include "SparseCore"
+#include "OrderingMethods"
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup SparseQR_Module SparseQR module
+  * \brief Provides QR decomposition for sparse matrices
+  * 
+  * This module provides a simplicial version of the left-looking Sparse QR decomposition. 
+  * The columns of the input matrix should be reordered to limit the fill-in during the 
+  * decomposition. Built-in methods (COLAMD, AMD) or external  methods (METIS) can be used to this end.
+  * See the \link OrderingMethods_Module OrderingMethods\endlink module for the list 
+  * of built-in and external ordering methods.
+  * 
+  * \code
+  * #include <Eigen/SparseQR>
+  * \endcode
+  * 
+  * 
+  */
+
+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+
+#include "OrderingMethods"
+#include "src/SparseCore/SparseColEtree.h"
+#include "src/SparseQR/SparseQR.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif
diff --git a/include/eigen3/Eigen/StdDeque b/include/eigen3/Eigen/StdDeque
new file mode 100644
index 0000000..f272347
--- /dev/null
+++ b/include/eigen3/Eigen/StdDeque
@@ -0,0 +1,27 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2009 Hauke Heibel <hauke.heibel at googlemail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STDDEQUE_MODULE_H
+#define EIGEN_STDDEQUE_MODULE_H
+
+#include "Core"
+#include <deque>
+
+#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */
+
+#define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...)
+
+#else
+
+#include "src/StlSupport/StdDeque.h"
+
+#endif
+
+#endif // EIGEN_STDDEQUE_MODULE_H
diff --git a/include/eigen3/Eigen/StdList b/include/eigen3/Eigen/StdList
new file mode 100644
index 0000000..225c1e1
--- /dev/null
+++ b/include/eigen3/Eigen/StdList
@@ -0,0 +1,26 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Hauke Heibel <hauke.heibel at googlemail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STDLIST_MODULE_H
+#define EIGEN_STDLIST_MODULE_H
+
+#include "Core"
+#include <list>
+
+#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */    
+
+#define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...)
+
+#else
+
+#include "src/StlSupport/StdList.h"
+
+#endif
+
+#endif // EIGEN_STDLIST_MODULE_H
diff --git a/include/eigen3/Eigen/StdVector b/include/eigen3/Eigen/StdVector
new file mode 100644
index 0000000..6b22627
--- /dev/null
+++ b/include/eigen3/Eigen/StdVector
@@ -0,0 +1,27 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2009 Hauke Heibel <hauke.heibel at googlemail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STDVECTOR_MODULE_H
+#define EIGEN_STDVECTOR_MODULE_H
+
+#include "Core"
+#include <vector>
+
+#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */
+
+#define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...)
+
+#else
+
+#include "src/StlSupport/StdVector.h"
+
+#endif
+
+#endif // EIGEN_STDVECTOR_MODULE_H
diff --git a/include/eigen3/Eigen/SuperLUSupport b/include/eigen3/Eigen/SuperLUSupport
new file mode 100644
index 0000000..575e14f
--- /dev/null
+++ b/include/eigen3/Eigen/SuperLUSupport
@@ -0,0 +1,59 @@
+#ifndef EIGEN_SUPERLUSUPPORT_MODULE_H
+#define EIGEN_SUPERLUSUPPORT_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+#ifdef EMPTY
+#define EIGEN_EMPTY_WAS_ALREADY_DEFINED
+#endif
+
+typedef int int_t;
+#include <slu_Cnames.h>
+#include <supermatrix.h>
+#include <slu_util.h>
+
+// slu_util.h defines a preprocessor token named EMPTY which is really polluting,
+// so we remove it in favor of a SUPERLU_EMPTY token.
+// If EMPTY was already defined then we don't undef it.
+
+#if defined(EIGEN_EMPTY_WAS_ALREADY_DEFINED)
+# undef EIGEN_EMPTY_WAS_ALREADY_DEFINED
+#elif defined(EMPTY)
+# undef EMPTY
+#endif
+
+#define SUPERLU_EMPTY (-1)
+
+namespace Eigen { struct SluMatrix; }
+
+/** \ingroup Support_modules
+  * \defgroup SuperLUSupport_Module SuperLUSupport module
+  *
+  * This module provides an interface to the <a href="http://crd-legacy.lbl.gov/~xiaoye/SuperLU/">SuperLU</a> library.
+  * It provides the following factorization class:
+  * - class SuperLU: a supernodal sequential LU factorization.
+  * - class SuperILU: a supernodal sequential incomplete LU factorization (to be used as a preconditioner for iterative methods).
+  *
+  * \warning When including this module, you have to use SUPERLU_EMPTY instead of EMPTY which is no longer defined because it is too polluting.
+  *
+  * \code
+  * #include <Eigen/SuperLUSupport>
+  * \endcode
+  *
+  * In order to use this module, the superlu headers must be accessible from the include paths, and your binary must be linked to the superlu library and its dependencies.
+  * The dependencies depend on how superlu has been compiled.
+  * For a cmake based project, you can use our FindSuperLU.cmake module to help you in this task.
+  *
+  */
+
+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+
+#include "src/SuperLUSupport/SuperLUSupport.h"
+
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SUPERLUSUPPORT_MODULE_H
diff --git a/include/eigen3/Eigen/UmfPackSupport b/include/eigen3/Eigen/UmfPackSupport
new file mode 100644
index 0000000..984f64a
--- /dev/null
+++ b/include/eigen3/Eigen/UmfPackSupport
@@ -0,0 +1,36 @@
+#ifndef EIGEN_UMFPACKSUPPORT_MODULE_H
+#define EIGEN_UMFPACKSUPPORT_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+extern "C" {
+#include <umfpack.h>
+}
+
+/** \ingroup Support_modules
+  * \defgroup UmfPackSupport_Module UmfPackSupport module
+  *
+  * This module provides an interface to the UmfPack library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  * It provides the following factorization class:
+  * - class UmfPackLU: a multifrontal sequential LU factorization.
+  *
+  * \code
+  * #include <Eigen/UmfPackSupport>
+  * \endcode
+  *
+  * In order to use this module, the umfpack headers must be accessible from the include paths, and your binary must be linked to the umfpack library and its dependencies.
+  * The dependencies depend on how umfpack has been compiled.
+  * For a cmake based project, you can use our FindUmfPack.cmake module to help you in this task.
+  *
+  */
+
+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+
+#include "src/UmfPackSupport/UmfPackSupport.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_UMFPACKSUPPORT_MODULE_H
diff --git a/include/eigen3/Eigen/src/Cholesky/LDLT.h b/include/eigen3/Eigen/src/Cholesky/LDLT.h
new file mode 100644
index 0000000..02ab938
--- /dev/null
+++ b/include/eigen3/Eigen/src/Cholesky/LDLT.h
@@ -0,0 +1,604 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2009 Keir Mierle <mierle at gmail.com>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2011 Timothy E. Holy <tim.holy at gmail.com >
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LDLT_H
+#define EIGEN_LDLT_H
+
+namespace Eigen { 
+
+namespace internal {
+  template<typename MatrixType, int UpLo> struct LDLT_Traits;
+
+  // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef
+  enum SignMatrix { PositiveSemiDef, NegativeSemiDef, ZeroSign, Indefinite };
+}
+
+/** \ingroup Cholesky_Module
+  *
+  * \class LDLT
+  *
+  * \brief Robust Cholesky decomposition of a matrix with pivoting
+  *
+  * \param MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition
+  * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
+  *             The other triangular part won't be read.
+  *
+  * Perform a robust Cholesky decomposition of a positive semidefinite or negative semidefinite
+  * matrix \f$ A \f$ such that \f$ A =  P^TLDL^*P \f$, where P is a permutation matrix, L
+  * is lower triangular with a unit diagonal and D is a diagonal matrix.
+  *
+  * The decomposition uses pivoting to ensure stability, so that L will have
+  * zeros in the bottom right rank(A) - n submatrix. Avoiding the square root
+  * on D also stabilizes the computation.
+  *
+  * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
+  * decomposition to determine whether a system of equations has a solution.
+  *
+  * \sa MatrixBase::ldlt(), class LLT
+  */
+template<typename _MatrixType, int _UpLo> class LDLT
+{
+  public:
+    typedef _MatrixType MatrixType;
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      Options = MatrixType::Options & ~RowMajorBit, // these are the options for the TmpMatrixType, we need a ColMajor matrix here!
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+      UpLo = _UpLo
+    };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+    typedef typename MatrixType::Index Index;
+    typedef Matrix<Scalar, RowsAtCompileTime, 1, Options, MaxRowsAtCompileTime, 1> TmpMatrixType;
+
+    typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
+    typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationType;
+
+    typedef internal::LDLT_Traits<MatrixType,UpLo> Traits;
+
+    /** \brief Default Constructor.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via LDLT::compute(const MatrixType&).
+      */
+    LDLT() 
+      : m_matrix(), 
+        m_transpositions(), 
+        m_sign(internal::ZeroSign),
+        m_isInitialized(false) 
+    {}
+
+    /** \brief Default Constructor with memory preallocation
+      *
+      * Like the default constructor but with preallocation of the internal data
+      * according to the specified problem \a size.
+      * \sa LDLT()
+      */
+    LDLT(Index size)
+      : m_matrix(size, size),
+        m_transpositions(size),
+        m_temporary(size),
+        m_sign(internal::ZeroSign),
+        m_isInitialized(false)
+    {}
+
+    /** \brief Constructor with decomposition
+      *
+      * This calculates the decomposition for the input \a matrix.
+      * \sa LDLT(Index size)
+      */
+    LDLT(const MatrixType& matrix)
+      : m_matrix(matrix.rows(), matrix.cols()),
+        m_transpositions(matrix.rows()),
+        m_temporary(matrix.rows()),
+        m_sign(internal::ZeroSign),
+        m_isInitialized(false)
+    {
+      compute(matrix);
+    }
+
+    /** Clear any existing decomposition
+     * \sa rankUpdate(w,sigma)
+     */
+    void setZero()
+    {
+      m_isInitialized = false;
+    }
+
+    /** \returns a view of the upper triangular matrix U */
+    inline typename Traits::MatrixU matrixU() const
+    {
+      eigen_assert(m_isInitialized && "LDLT is not initialized.");
+      return Traits::getU(m_matrix);
+    }
+
+    /** \returns a view of the lower triangular matrix L */
+    inline typename Traits::MatrixL matrixL() const
+    {
+      eigen_assert(m_isInitialized && "LDLT is not initialized.");
+      return Traits::getL(m_matrix);
+    }
+
+    /** \returns the permutation matrix P as a transposition sequence.
+      */
+    inline const TranspositionType& transpositionsP() const
+    {
+      eigen_assert(m_isInitialized && "LDLT is not initialized.");
+      return m_transpositions;
+    }
+
+    /** \returns the coefficients of the diagonal matrix D */
+    inline Diagonal<const MatrixType> vectorD() const
+    {
+      eigen_assert(m_isInitialized && "LDLT is not initialized.");
+      return m_matrix.diagonal();
+    }
+
+    /** \returns true if the matrix is positive (semidefinite) */
+    inline bool isPositive() const
+    {
+      eigen_assert(m_isInitialized && "LDLT is not initialized.");
+      return m_sign == internal::PositiveSemiDef || m_sign == internal::ZeroSign;
+    }
+    
+    #ifdef EIGEN2_SUPPORT
+    inline bool isPositiveDefinite() const
+    {
+      return isPositive();
+    }
+    #endif
+
+    /** \returns true if the matrix is negative (semidefinite) */
+    inline bool isNegative(void) const
+    {
+      eigen_assert(m_isInitialized && "LDLT is not initialized.");
+      return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign;
+    }
+
+    /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * This function also supports in-place solves using the syntax <tt>x = decompositionObject.solve(x)</tt> .
+      *
+      * \note_about_checking_solutions
+      *
+      * More precisely, this method solves \f$ A x = b \f$ using the decomposition \f$ A = P^T L D L^* P \f$
+      * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$, 
+      * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then
+      * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the
+      * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
+      * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular.
+      *
+      * \sa MatrixBase::ldlt()
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<LDLT, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "LDLT is not initialized.");
+      eigen_assert(m_matrix.rows()==b.rows()
+                && "LDLT::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::solve_retval<LDLT, Rhs>(*this, b.derived());
+    }
+
+    #ifdef EIGEN2_SUPPORT
+    template<typename OtherDerived, typename ResultType>
+    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
+    {
+      *result = this->solve(b);
+      return true;
+    }
+    #endif
+
+    template<typename Derived>
+    bool solveInPlace(MatrixBase<Derived> &bAndX) const;
+
+    LDLT& compute(const MatrixType& matrix);
+
+    template <typename Derived>
+    LDLT& rankUpdate(const MatrixBase<Derived>& w, const RealScalar& alpha=1);
+
+    /** \returns the internal LDLT decomposition matrix
+      *
+      * TODO: document the storage layout
+      */
+    inline const MatrixType& matrixLDLT() const
+    {
+      eigen_assert(m_isInitialized && "LDLT is not initialized.");
+      return m_matrix;
+    }
+
+    MatrixType reconstructedMatrix() const;
+
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }
+
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful,
+      *          \c NumericalIssue if the matrix.appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "LDLT is not initialized.");
+      return Success;
+    }
+
+  protected:
+
+    /** \internal
+      * Used to compute and store the Cholesky decomposition A = L D L^* = U^* D U.
+      * The strict upper part is used during the decomposition, the strict lower
+      * part correspond to the coefficients of L (its diagonal is equal to 1 and
+      * is not stored), and the diagonal entries correspond to D.
+      */
+    MatrixType m_matrix;
+    TranspositionType m_transpositions;
+    TmpMatrixType m_temporary;
+    internal::SignMatrix m_sign;
+    bool m_isInitialized;
+};
+
+namespace internal {
+
+template<int UpLo> struct ldlt_inplace;
+
+template<> struct ldlt_inplace<Lower>
+{
+  template<typename MatrixType, typename TranspositionType, typename Workspace>
+  static bool unblocked(MatrixType& mat, TranspositionType& transpositions, Workspace& temp, SignMatrix& sign)
+  {
+    using std::abs;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+    eigen_assert(mat.rows()==mat.cols());
+    const Index size = mat.rows();
+
+    if (size <= 1)
+    {
+      transpositions.setIdentity();
+      if (numext::real(mat.coeff(0,0)) > 0) sign = PositiveSemiDef;
+      else if (numext::real(mat.coeff(0,0)) < 0) sign = NegativeSemiDef;
+      else sign = ZeroSign;
+      return true;
+    }
+
+    for (Index k = 0; k < size; ++k)
+    {
+      // Find largest diagonal element
+      Index index_of_biggest_in_corner;
+      mat.diagonal().tail(size-k).cwiseAbs().maxCoeff(&index_of_biggest_in_corner);
+      index_of_biggest_in_corner += k;
+
+      transpositions.coeffRef(k) = index_of_biggest_in_corner;
+      if(k != index_of_biggest_in_corner)
+      {
+        // apply the transposition while taking care to consider only
+        // the lower triangular part
+        Index s = size-index_of_biggest_in_corner-1; // trailing size after the biggest element
+        mat.row(k).head(k).swap(mat.row(index_of_biggest_in_corner).head(k));
+        mat.col(k).tail(s).swap(mat.col(index_of_biggest_in_corner).tail(s));
+        std::swap(mat.coeffRef(k,k),mat.coeffRef(index_of_biggest_in_corner,index_of_biggest_in_corner));
+        for(int i=k+1;i<index_of_biggest_in_corner;++i)
+        {
+          Scalar tmp = mat.coeffRef(i,k);
+          mat.coeffRef(i,k) = numext::conj(mat.coeffRef(index_of_biggest_in_corner,i));
+          mat.coeffRef(index_of_biggest_in_corner,i) = numext::conj(tmp);
+        }
+        if(NumTraits<Scalar>::IsComplex)
+          mat.coeffRef(index_of_biggest_in_corner,k) = numext::conj(mat.coeff(index_of_biggest_in_corner,k));
+      }
+
+      // partition the matrix:
+      //       A00 |  -  |  -
+      // lu  = A10 | A11 |  -
+      //       A20 | A21 | A22
+      Index rs = size - k - 1;
+      Block<MatrixType,Dynamic,1> A21(mat,k+1,k,rs,1);
+      Block<MatrixType,1,Dynamic> A10(mat,k,0,1,k);
+      Block<MatrixType,Dynamic,Dynamic> A20(mat,k+1,0,rs,k);
+
+      if(k>0)
+      {
+        temp.head(k) = mat.diagonal().real().head(k).asDiagonal() * A10.adjoint();
+        mat.coeffRef(k,k) -= (A10 * temp.head(k)).value();
+        if(rs>0)
+          A21.noalias() -= A20 * temp.head(k);
+      }
+      
+      // In some previous versions of Eigen (e.g., 3.2.1), the scaling was omitted if the pivot
+      // was smaller than the cutoff value. However, soince LDLT is not rank-revealing
+      // we should only make sure we do not introduce INF or NaN values.
+      // LAPACK also uses 0 as the cutoff value.
+      RealScalar realAkk = numext::real(mat.coeffRef(k,k));
+      if((rs>0) && (abs(realAkk) > RealScalar(0)))
+        A21 /= realAkk;
+
+      if (sign == PositiveSemiDef) {
+        if (realAkk < 0) sign = Indefinite;
+      } else if (sign == NegativeSemiDef) {
+        if (realAkk > 0) sign = Indefinite;
+      } else if (sign == ZeroSign) {
+        if (realAkk > 0) sign = PositiveSemiDef;
+        else if (realAkk < 0) sign = NegativeSemiDef;
+      }
+    }
+
+    return true;
+  }
+
+  // Reference for the algorithm: Davis and Hager, "Multiple Rank
+  // Modifications of a Sparse Cholesky Factorization" (Algorithm 1)
+  // Trivial rearrangements of their computations (Timothy E. Holy)
+  // allow their algorithm to work for rank-1 updates even if the
+  // original matrix is not of full rank.
+  // Here only rank-1 updates are implemented, to reduce the
+  // requirement for intermediate storage and improve accuracy
+  template<typename MatrixType, typename WDerived>
+  static bool updateInPlace(MatrixType& mat, MatrixBase<WDerived>& w, const typename MatrixType::RealScalar& sigma=1)
+  {
+    using numext::isfinite;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+
+    const Index size = mat.rows();
+    eigen_assert(mat.cols() == size && w.size()==size);
+
+    RealScalar alpha = 1;
+
+    // Apply the update
+    for (Index j = 0; j < size; j++)
+    {
+      // Check for termination due to an original decomposition of low-rank
+      if (!(isfinite)(alpha))
+        break;
+
+      // Update the diagonal terms
+      RealScalar dj = numext::real(mat.coeff(j,j));
+      Scalar wj = w.coeff(j);
+      RealScalar swj2 = sigma*numext::abs2(wj);
+      RealScalar gamma = dj*alpha + swj2;
+
+      mat.coeffRef(j,j) += swj2/alpha;
+      alpha += swj2/dj;
+
+
+      // Update the terms of L
+      Index rs = size-j-1;
+      w.tail(rs) -= wj * mat.col(j).tail(rs);
+      if(gamma != 0)
+        mat.col(j).tail(rs) += (sigma*numext::conj(wj)/gamma)*w.tail(rs);
+    }
+    return true;
+  }
+
+  template<typename MatrixType, typename TranspositionType, typename Workspace, typename WType>
+  static bool update(MatrixType& mat, const TranspositionType& transpositions, Workspace& tmp, const WType& w, const typename MatrixType::RealScalar& sigma=1)
+  {
+    // Apply the permutation to the input w
+    tmp = transpositions * w;
+
+    return ldlt_inplace<Lower>::updateInPlace(mat,tmp,sigma);
+  }
+};
+
+template<> struct ldlt_inplace<Upper>
+{
+  template<typename MatrixType, typename TranspositionType, typename Workspace>
+  static EIGEN_STRONG_INLINE bool unblocked(MatrixType& mat, TranspositionType& transpositions, Workspace& temp, SignMatrix& sign)
+  {
+    Transpose<MatrixType> matt(mat);
+    return ldlt_inplace<Lower>::unblocked(matt, transpositions, temp, sign);
+  }
+
+  template<typename MatrixType, typename TranspositionType, typename Workspace, typename WType>
+  static EIGEN_STRONG_INLINE bool update(MatrixType& mat, TranspositionType& transpositions, Workspace& tmp, WType& w, const typename MatrixType::RealScalar& sigma=1)
+  {
+    Transpose<MatrixType> matt(mat);
+    return ldlt_inplace<Lower>::update(matt, transpositions, tmp, w.conjugate(), sigma);
+  }
+};
+
+template<typename MatrixType> struct LDLT_Traits<MatrixType,Lower>
+{
+  typedef const TriangularView<const MatrixType, UnitLower> MatrixL;
+  typedef const TriangularView<const typename MatrixType::AdjointReturnType, UnitUpper> MatrixU;
+  static inline MatrixL getL(const MatrixType& m) { return m; }
+  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+};
+
+template<typename MatrixType> struct LDLT_Traits<MatrixType,Upper>
+{
+  typedef const TriangularView<const typename MatrixType::AdjointReturnType, UnitLower> MatrixL;
+  typedef const TriangularView<const MatrixType, UnitUpper> MatrixU;
+  static inline MatrixL getL(const MatrixType& m) { return m.adjoint(); }
+  static inline MatrixU getU(const MatrixType& m) { return m; }
+};
+
+} // end namespace internal
+
+/** Compute / recompute the LDLT decomposition A = L D L^* = U^* D U of \a matrix
+  */
+template<typename MatrixType, int _UpLo>
+LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
+{
+  eigen_assert(a.rows()==a.cols());
+  const Index size = a.rows();
+
+  m_matrix = a;
+
+  m_transpositions.resize(size);
+  m_isInitialized = false;
+  m_temporary.resize(size);
+  m_sign = internal::ZeroSign;
+
+  internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign);
+
+  m_isInitialized = true;
+  return *this;
+}
+
+/** Update the LDLT decomposition:  given A = L D L^T, efficiently compute the decomposition of A + sigma w w^T.
+ * \param w a vector to be incorporated into the decomposition.
+ * \param sigma a scalar, +1 for updates and -1 for "downdates," which correspond to removing previously-added column vectors. Optional; default value is +1.
+ * \sa setZero()
+  */
+template<typename MatrixType, int _UpLo>
+template<typename Derived>
+LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename NumTraits<typename MatrixType::Scalar>::Real& sigma)
+{
+  const Index size = w.rows();
+  if (m_isInitialized)
+  {
+    eigen_assert(m_matrix.rows()==size);
+  }
+  else
+  {    
+    m_matrix.resize(size,size);
+    m_matrix.setZero();
+    m_transpositions.resize(size);
+    for (Index i = 0; i < size; i++)
+      m_transpositions.coeffRef(i) = i;
+    m_temporary.resize(size);
+    m_sign = sigma>=0 ? internal::PositiveSemiDef : internal::NegativeSemiDef;
+    m_isInitialized = true;
+  }
+
+  internal::ldlt_inplace<UpLo>::update(m_matrix, m_transpositions, m_temporary, w, sigma);
+
+  return *this;
+}
+
+namespace internal {
+template<typename _MatrixType, int _UpLo, typename Rhs>
+struct solve_retval<LDLT<_MatrixType,_UpLo>, Rhs>
+  : solve_retval_base<LDLT<_MatrixType,_UpLo>, Rhs>
+{
+  typedef LDLT<_MatrixType,_UpLo> LDLTType;
+  EIGEN_MAKE_SOLVE_HELPERS(LDLTType,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    eigen_assert(rhs().rows() == dec().matrixLDLT().rows());
+    // dst = P b
+    dst = dec().transpositionsP() * rhs();
+
+    // dst = L^-1 (P b)
+    dec().matrixL().solveInPlace(dst);
+
+    // dst = D^-1 (L^-1 P b)
+    // more precisely, use pseudo-inverse of D (see bug 241)
+    using std::abs;
+    using std::max;
+    typedef typename LDLTType::MatrixType MatrixType;
+    typedef typename LDLTType::RealScalar RealScalar;
+    const typename Diagonal<const MatrixType>::RealReturnType vectorD(dec().vectorD());
+    // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
+    // as motivated by LAPACK's xGELSS:
+    // RealScalar tolerance = (max)(vectorD.array().abs().maxCoeff() *NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
+    // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
+    // diagonal element is not well justified and to numerical issues in some cases.
+    // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
+    RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
+    
+    for (Index i = 0; i < vectorD.size(); ++i) {
+      if(abs(vectorD(i)) > tolerance)
+        dst.row(i) /= vectorD(i);
+      else
+        dst.row(i).setZero();
+    }
+
+    // dst = L^-T (D^-1 L^-1 P b)
+    dec().matrixU().solveInPlace(dst);
+
+    // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
+    dst = dec().transpositionsP().transpose() * dst;
+  }
+};
+}
+
+/** \internal use x = ldlt_object.solve(x);
+  *
+  * This is the \em in-place version of solve().
+  *
+  * \param bAndX represents both the right-hand side matrix b and result x.
+  *
+  * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD.
+  *
+  * This version avoids a copy when the right hand side matrix b is not
+  * needed anymore.
+  *
+  * \sa LDLT::solve(), MatrixBase::ldlt()
+  */
+template<typename MatrixType,int _UpLo>
+template<typename Derived>
+bool LDLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
+{
+  eigen_assert(m_isInitialized && "LDLT is not initialized.");
+  eigen_assert(m_matrix.rows() == bAndX.rows());
+
+  bAndX = this->solve(bAndX);
+
+  return true;
+}
+
+/** \returns the matrix represented by the decomposition,
+ * i.e., it returns the product: P^T L D L^* P.
+ * This function is provided for debug purpose. */
+template<typename MatrixType, int _UpLo>
+MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
+{
+  eigen_assert(m_isInitialized && "LDLT is not initialized.");
+  const Index size = m_matrix.rows();
+  MatrixType res(size,size);
+
+  // P
+  res.setIdentity();
+  res = transpositionsP() * res;
+  // L^* P
+  res = matrixU() * res;
+  // D(L^*P)
+  res = vectorD().real().asDiagonal() * res;
+  // L(DL^*P)
+  res = matrixL() * res;
+  // P^T (LDL^*P)
+  res = transpositionsP().transpose() * res;
+
+  return res;
+}
+
+/** \cholesky_module
+  * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+  */
+template<typename MatrixType, unsigned int UpLo>
+inline const LDLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
+SelfAdjointView<MatrixType, UpLo>::ldlt() const
+{
+  return LDLT<PlainObject,UpLo>(m_matrix);
+}
+
+/** \cholesky_module
+  * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+  */
+template<typename Derived>
+inline const LDLT<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::ldlt() const
+{
+  return LDLT<PlainObject>(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_LDLT_H
diff --git a/include/eigen3/Eigen/src/Cholesky/LLT.h b/include/eigen3/Eigen/src/Cholesky/LLT.h
new file mode 100644
index 0000000..2e6189f
--- /dev/null
+++ b/include/eigen3/Eigen/src/Cholesky/LLT.h
@@ -0,0 +1,490 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LLT_H
+#define EIGEN_LLT_H
+
+namespace Eigen { 
+
+namespace internal{
+template<typename MatrixType, int UpLo> struct LLT_Traits;
+}
+
+/** \ingroup Cholesky_Module
+  *
+  * \class LLT
+  *
+  * \brief Standard Cholesky decomposition (LL^T) of a matrix and associated features
+  *
+  * \param MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
+  * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
+  *             The other triangular part won't be read.
+  *
+  * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
+  * matrix A such that A = LL^* = U^*U, where L is lower triangular.
+  *
+  * While the Cholesky decomposition is particularly useful to solve selfadjoint problems like  D^*D x = b,
+  * for that purpose, we recommend the Cholesky decomposition without square root which is more stable
+  * and even faster. Nevertheless, this standard Cholesky decomposition remains useful in many other
+  * situations like generalised eigen problems with hermitian matrices.
+  *
+  * Remember that Cholesky decompositions are not rank-revealing. This LLT decomposition is only stable on positive definite matrices,
+  * use LDLT instead for the semidefinite case. Also, do not use a Cholesky decomposition to determine whether a system of equations
+  * has a solution.
+  *
+  * Example: \include LLT_example.cpp
+  * Output: \verbinclude LLT_example.out
+  *    
+  * \sa MatrixBase::llt(), class LDLT
+  */
+ /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
+  * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
+  * the strict lower part does not have to store correct values.
+  */
+template<typename _MatrixType, int _UpLo> class LLT
+{
+  public:
+    typedef _MatrixType MatrixType;
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      Options = MatrixType::Options,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+    typedef typename MatrixType::Index Index;
+
+    enum {
+      PacketSize = internal::packet_traits<Scalar>::size,
+      AlignmentMask = int(PacketSize)-1,
+      UpLo = _UpLo
+    };
+
+    typedef internal::LLT_Traits<MatrixType,UpLo> Traits;
+
+    /**
+      * \brief Default Constructor.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via LLT::compute(const MatrixType&).
+      */
+    LLT() : m_matrix(), m_isInitialized(false) {}
+
+    /** \brief Default Constructor with memory preallocation
+      *
+      * Like the default constructor but with preallocation of the internal data
+      * according to the specified problem \a size.
+      * \sa LLT()
+      */
+    LLT(Index size) : m_matrix(size, size),
+                    m_isInitialized(false) {}
+
+    LLT(const MatrixType& matrix)
+      : m_matrix(matrix.rows(), matrix.cols()),
+        m_isInitialized(false)
+    {
+      compute(matrix);
+    }
+
+    /** \returns a view of the upper triangular matrix U */
+    inline typename Traits::MatrixU matrixU() const
+    {
+      eigen_assert(m_isInitialized && "LLT is not initialized.");
+      return Traits::getU(m_matrix);
+    }
+
+    /** \returns a view of the lower triangular matrix L */
+    inline typename Traits::MatrixL matrixL() const
+    {
+      eigen_assert(m_isInitialized && "LLT is not initialized.");
+      return Traits::getL(m_matrix);
+    }
+
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * Since this LLT class assumes anyway that the matrix A is invertible, the solution
+      * theoretically exists and is unique regardless of b.
+      *
+      * Example: \include LLT_solve.cpp
+      * Output: \verbinclude LLT_solve.out
+      *
+      * \sa solveInPlace(), MatrixBase::llt()
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<LLT, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "LLT is not initialized.");
+      eigen_assert(m_matrix.rows()==b.rows()
+                && "LLT::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::solve_retval<LLT, Rhs>(*this, b.derived());
+    }
+
+    #ifdef EIGEN2_SUPPORT
+    template<typename OtherDerived, typename ResultType>
+    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
+    {
+      *result = this->solve(b);
+      return true;
+    }
+    
+    bool isPositiveDefinite() const { return true; }
+    #endif
+
+    template<typename Derived>
+    void solveInPlace(MatrixBase<Derived> &bAndX) const;
+
+    LLT& compute(const MatrixType& matrix);
+
+    /** \returns the LLT decomposition matrix
+      *
+      * TODO: document the storage layout
+      */
+    inline const MatrixType& matrixLLT() const
+    {
+      eigen_assert(m_isInitialized && "LLT is not initialized.");
+      return m_matrix;
+    }
+
+    MatrixType reconstructedMatrix() const;
+
+
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful,
+      *          \c NumericalIssue if the matrix.appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "LLT is not initialized.");
+      return m_info;
+    }
+
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }
+
+    template<typename VectorType>
+    LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
+
+  protected:
+    /** \internal
+      * Used to compute and store L
+      * The strict upper part is not used and even not initialized.
+      */
+    MatrixType m_matrix;
+    bool m_isInitialized;
+    ComputationInfo m_info;
+};
+
+namespace internal {
+
+template<typename Scalar, int UpLo> struct llt_inplace;
+
+template<typename MatrixType, typename VectorType>
+static typename MatrixType::Index llt_rank_update_lower(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma)
+{
+  using std::sqrt;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::ColXpr ColXpr;
+  typedef typename internal::remove_all<ColXpr>::type ColXprCleaned;
+  typedef typename ColXprCleaned::SegmentReturnType ColXprSegment;
+  typedef Matrix<Scalar,Dynamic,1> TempVectorType;
+  typedef typename TempVectorType::SegmentReturnType TempVecSegment;
+
+  Index n = mat.cols();
+  eigen_assert(mat.rows()==n && vec.size()==n);
+
+  TempVectorType temp;
+
+  if(sigma>0)
+  {
+    // This version is based on Givens rotations.
+    // It is faster than the other one below, but only works for updates,
+    // i.e., for sigma > 0
+    temp = sqrt(sigma) * vec;
+
+    for(Index i=0; i<n; ++i)
+    {
+      JacobiRotation<Scalar> g;
+      g.makeGivens(mat(i,i), -temp(i), &mat(i,i));
+
+      Index rs = n-i-1;
+      if(rs>0)
+      {
+        ColXprSegment x(mat.col(i).tail(rs));
+        TempVecSegment y(temp.tail(rs));
+        apply_rotation_in_the_plane(x, y, g);
+      }
+    }
+  }
+  else
+  {
+    temp = vec;
+    RealScalar beta = 1;
+    for(Index j=0; j<n; ++j)
+    {
+      RealScalar Ljj = numext::real(mat.coeff(j,j));
+      RealScalar dj = numext::abs2(Ljj);
+      Scalar wj = temp.coeff(j);
+      RealScalar swj2 = sigma*numext::abs2(wj);
+      RealScalar gamma = dj*beta + swj2;
+
+      RealScalar x = dj + swj2/beta;
+      if (x<=RealScalar(0))
+        return j;
+      RealScalar nLjj = sqrt(x);
+      mat.coeffRef(j,j) = nLjj;
+      beta += swj2/dj;
+
+      // Update the terms of L
+      Index rs = n-j-1;
+      if(rs)
+      {
+        temp.tail(rs) -= (wj/Ljj) * mat.col(j).tail(rs);
+        if(gamma != 0)
+          mat.col(j).tail(rs) = (nLjj/Ljj) * mat.col(j).tail(rs) + (nLjj * sigma*numext::conj(wj)/gamma)*temp.tail(rs);
+      }
+    }
+  }
+  return -1;
+}
+
+template<typename Scalar> struct llt_inplace<Scalar, Lower>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template<typename MatrixType>
+  static typename MatrixType::Index unblocked(MatrixType& mat)
+  {
+    using std::sqrt;
+    typedef typename MatrixType::Index Index;
+    
+    eigen_assert(mat.rows()==mat.cols());
+    const Index size = mat.rows();
+    for(Index k = 0; k < size; ++k)
+    {
+      Index rs = size-k-1; // remaining size
+
+      Block<MatrixType,Dynamic,1> A21(mat,k+1,k,rs,1);
+      Block<MatrixType,1,Dynamic> A10(mat,k,0,1,k);
+      Block<MatrixType,Dynamic,Dynamic> A20(mat,k+1,0,rs,k);
+
+      RealScalar x = numext::real(mat.coeff(k,k));
+      if (k>0) x -= A10.squaredNorm();
+      if (x<=RealScalar(0))
+        return k;
+      mat.coeffRef(k,k) = x = sqrt(x);
+      if (k>0 && rs>0) A21.noalias() -= A20 * A10.adjoint();
+      if (rs>0) A21 *= RealScalar(1)/x;
+    }
+    return -1;
+  }
+
+  template<typename MatrixType>
+  static typename MatrixType::Index blocked(MatrixType& m)
+  {
+    typedef typename MatrixType::Index Index;
+    eigen_assert(m.rows()==m.cols());
+    Index size = m.rows();
+    if(size<32)
+      return unblocked(m);
+
+    Index blockSize = size/8;
+    blockSize = (blockSize/16)*16;
+    blockSize = (std::min)((std::max)(blockSize,Index(8)), Index(128));
+
+    for (Index k=0; k<size; k+=blockSize)
+    {
+      // partition the matrix:
+      //       A00 |  -  |  -
+      // lu  = A10 | A11 |  -
+      //       A20 | A21 | A22
+      Index bs = (std::min)(blockSize, size-k);
+      Index rs = size - k - bs;
+      Block<MatrixType,Dynamic,Dynamic> A11(m,k,   k,   bs,bs);
+      Block<MatrixType,Dynamic,Dynamic> A21(m,k+bs,k,   rs,bs);
+      Block<MatrixType,Dynamic,Dynamic> A22(m,k+bs,k+bs,rs,rs);
+
+      Index ret;
+      if((ret=unblocked(A11))>=0) return k+ret;
+      if(rs>0) A11.adjoint().template triangularView<Upper>().template solveInPlace<OnTheRight>(A21);
+      if(rs>0) A22.template selfadjointView<Lower>().rankUpdate(A21,-1); // bottleneck
+    }
+    return -1;
+  }
+
+  template<typename MatrixType, typename VectorType>
+  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
+  {
+    return Eigen::internal::llt_rank_update_lower(mat, vec, sigma);
+  }
+};
+  
+template<typename Scalar> struct llt_inplace<Scalar, Upper>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  template<typename MatrixType>
+  static EIGEN_STRONG_INLINE typename MatrixType::Index unblocked(MatrixType& mat)
+  {
+    Transpose<MatrixType> matt(mat);
+    return llt_inplace<Scalar, Lower>::unblocked(matt);
+  }
+  template<typename MatrixType>
+  static EIGEN_STRONG_INLINE typename MatrixType::Index blocked(MatrixType& mat)
+  {
+    Transpose<MatrixType> matt(mat);
+    return llt_inplace<Scalar, Lower>::blocked(matt);
+  }
+  template<typename MatrixType, typename VectorType>
+  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
+  {
+    Transpose<MatrixType> matt(mat);
+    return llt_inplace<Scalar, Lower>::rankUpdate(matt, vec.conjugate(), sigma);
+  }
+};
+
+template<typename MatrixType> struct LLT_Traits<MatrixType,Lower>
+{
+  typedef const TriangularView<const MatrixType, Lower> MatrixL;
+  typedef const TriangularView<const typename MatrixType::AdjointReturnType, Upper> MatrixU;
+  static inline MatrixL getL(const MatrixType& m) { return m; }
+  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+  static bool inplace_decomposition(MatrixType& m)
+  { return llt_inplace<typename MatrixType::Scalar, Lower>::blocked(m)==-1; }
+};
+
+template<typename MatrixType> struct LLT_Traits<MatrixType,Upper>
+{
+  typedef const TriangularView<const typename MatrixType::AdjointReturnType, Lower> MatrixL;
+  typedef const TriangularView<const MatrixType, Upper> MatrixU;
+  static inline MatrixL getL(const MatrixType& m) { return m.adjoint(); }
+  static inline MatrixU getU(const MatrixType& m) { return m; }
+  static bool inplace_decomposition(MatrixType& m)
+  { return llt_inplace<typename MatrixType::Scalar, Upper>::blocked(m)==-1; }
+};
+
+} // end namespace internal
+
+/** Computes / recomputes the Cholesky decomposition A = LL^* = U^*U of \a matrix
+  *
+  * \returns a reference to *this
+  *
+  * Example: \include TutorialLinAlgComputeTwice.cpp
+  * Output: \verbinclude TutorialLinAlgComputeTwice.out
+  */
+template<typename MatrixType, int _UpLo>
+LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const MatrixType& a)
+{
+  eigen_assert(a.rows()==a.cols());
+  const Index size = a.rows();
+  m_matrix.resize(size, size);
+  m_matrix = a;
+
+  m_isInitialized = true;
+  bool ok = Traits::inplace_decomposition(m_matrix);
+  m_info = ok ? Success : NumericalIssue;
+
+  return *this;
+}
+
+/** Performs a rank one update (or dowdate) of the current decomposition.
+  * If A = LL^* before the rank one update,
+  * then after it we have LL^* = A + sigma * v v^* where \a v must be a vector
+  * of same dimension.
+  */
+template<typename _MatrixType, int _UpLo>
+template<typename VectorType>
+LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType);
+  eigen_assert(v.size()==m_matrix.cols());
+  eigen_assert(m_isInitialized);
+  if(internal::llt_inplace<typename MatrixType::Scalar, UpLo>::rankUpdate(m_matrix,v,sigma)>=0)
+    m_info = NumericalIssue;
+  else
+    m_info = Success;
+
+  return *this;
+}
+    
+namespace internal {
+template<typename _MatrixType, int UpLo, typename Rhs>
+struct solve_retval<LLT<_MatrixType, UpLo>, Rhs>
+  : solve_retval_base<LLT<_MatrixType, UpLo>, Rhs>
+{
+  typedef LLT<_MatrixType,UpLo> LLTType;
+  EIGEN_MAKE_SOLVE_HELPERS(LLTType,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dst = rhs();
+    dec().solveInPlace(dst);
+  }
+};
+}
+
+/** \internal use x = llt_object.solve(x);
+  * 
+  * This is the \em in-place version of solve().
+  *
+  * \param bAndX represents both the right-hand side matrix b and result x.
+  *
+  * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD.
+  *
+  * This version avoids a copy when the right hand side matrix b is not
+  * needed anymore.
+  *
+  * \sa LLT::solve(), MatrixBase::llt()
+  */
+template<typename MatrixType, int _UpLo>
+template<typename Derived>
+void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
+{
+  eigen_assert(m_isInitialized && "LLT is not initialized.");
+  eigen_assert(m_matrix.rows()==bAndX.rows());
+  matrixL().solveInPlace(bAndX);
+  matrixU().solveInPlace(bAndX);
+}
+
+/** \returns the matrix represented by the decomposition,
+ * i.e., it returns the product: L L^*.
+ * This function is provided for debug purpose. */
+template<typename MatrixType, int _UpLo>
+MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const
+{
+  eigen_assert(m_isInitialized && "LLT is not initialized.");
+  return matrixL() * matrixL().adjoint().toDenseMatrix();
+}
+
+/** \cholesky_module
+  * \returns the LLT decomposition of \c *this
+  */
+template<typename Derived>
+inline const LLT<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::llt() const
+{
+  return LLT<PlainObject>(derived());
+}
+
+/** \cholesky_module
+  * \returns the LLT decomposition of \c *this
+  */
+template<typename MatrixType, unsigned int UpLo>
+inline const LLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
+SelfAdjointView<MatrixType, UpLo>::llt() const
+{
+  return LLT<PlainObject,UpLo>(m_matrix);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_LLT_H
diff --git a/include/eigen3/Eigen/src/Cholesky/LLT_MKL.h b/include/eigen3/Eigen/src/Cholesky/LLT_MKL.h
new file mode 100644
index 0000000..64daa44
--- /dev/null
+++ b/include/eigen3/Eigen/src/Cholesky/LLT_MKL.h
@@ -0,0 +1,102 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *     LLt decomposition based on LAPACKE_?potrf function.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_LLT_MKL_H
+#define EIGEN_LLT_MKL_H
+
+#include "Eigen/src/Core/util/MKL_support.h"
+#include <iostream>
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Scalar> struct mkl_llt;
+
+#define EIGEN_MKL_LLT(EIGTYPE, MKLTYPE, MKLPREFIX) \
+template<> struct mkl_llt<EIGTYPE> \
+{ \
+  template<typename MatrixType> \
+  static inline typename MatrixType::Index potrf(MatrixType& m, char uplo) \
+  { \
+    lapack_int matrix_order; \
+    lapack_int size, lda, info, StorageOrder; \
+    EIGTYPE* a; \
+    eigen_assert(m.rows()==m.cols()); \
+    /* Set up parameters for ?potrf */ \
+    size = m.rows(); \
+    StorageOrder = MatrixType::Flags&RowMajorBit?RowMajor:ColMajor; \
+    matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
+    a = &(m.coeffRef(0,0)); \
+    lda = m.outerStride(); \
+\
+    info = LAPACKE_##MKLPREFIX##potrf( matrix_order, uplo, size, (MKLTYPE*)a, lda ); \
+    info = (info==0) ? Success : NumericalIssue; \
+    return info; \
+  } \
+}; \
+template<> struct llt_inplace<EIGTYPE, Lower> \
+{ \
+  template<typename MatrixType> \
+  static typename MatrixType::Index blocked(MatrixType& m) \
+  { \
+    return mkl_llt<EIGTYPE>::potrf(m, 'L'); \
+  } \
+  template<typename MatrixType, typename VectorType> \
+  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
+  { return Eigen::internal::llt_rank_update_lower(mat, vec, sigma); } \
+}; \
+template<> struct llt_inplace<EIGTYPE, Upper> \
+{ \
+  template<typename MatrixType> \
+  static typename MatrixType::Index blocked(MatrixType& m) \
+  { \
+    return mkl_llt<EIGTYPE>::potrf(m, 'U'); \
+  } \
+  template<typename MatrixType, typename VectorType> \
+  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
+  { \
+    Transpose<MatrixType> matt(mat); \
+    return llt_inplace<EIGTYPE, Lower>::rankUpdate(matt, vec.conjugate(), sigma); \
+  } \
+};
+
+EIGEN_MKL_LLT(double, double, d)
+EIGEN_MKL_LLT(float, float, s)
+EIGEN_MKL_LLT(dcomplex, MKL_Complex16, z)
+EIGEN_MKL_LLT(scomplex, MKL_Complex8, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_LLT_MKL_H
diff --git a/include/eigen3/Eigen/src/CholmodSupport/CholmodSupport.h b/include/eigen3/Eigen/src/CholmodSupport/CholmodSupport.h
new file mode 100644
index 0000000..c449960
--- /dev/null
+++ b/include/eigen3/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -0,0 +1,607 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CHOLMODSUPPORT_H
+#define EIGEN_CHOLMODSUPPORT_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Scalar, typename CholmodType>
+void cholmod_configure_matrix(CholmodType& mat)
+{
+  if (internal::is_same<Scalar,float>::value)
+  {
+    mat.xtype = CHOLMOD_REAL;
+    mat.dtype = CHOLMOD_SINGLE;
+  }
+  else if (internal::is_same<Scalar,double>::value)
+  {
+    mat.xtype = CHOLMOD_REAL;
+    mat.dtype = CHOLMOD_DOUBLE;
+  }
+  else if (internal::is_same<Scalar,std::complex<float> >::value)
+  {
+    mat.xtype = CHOLMOD_COMPLEX;
+    mat.dtype = CHOLMOD_SINGLE;
+  }
+  else if (internal::is_same<Scalar,std::complex<double> >::value)
+  {
+    mat.xtype = CHOLMOD_COMPLEX;
+    mat.dtype = CHOLMOD_DOUBLE;
+  }
+  else
+  {
+    eigen_assert(false && "Scalar type not supported by CHOLMOD");
+  }
+}
+
+} // namespace internal
+
+/** Wraps the Eigen sparse matrix \a mat into a Cholmod sparse matrix object.
+  * Note that the data are shared.
+  */
+template<typename _Scalar, int _Options, typename _Index>
+cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_Index>& mat)
+{
+  cholmod_sparse res;
+  res.nzmax   = mat.nonZeros();
+  res.nrow    = mat.rows();;
+  res.ncol    = mat.cols();
+  res.p       = mat.outerIndexPtr();
+  res.i       = mat.innerIndexPtr();
+  res.x       = mat.valuePtr();
+  res.z       = 0;
+  res.sorted  = 1;
+  if(mat.isCompressed())
+  {
+    res.packed  = 1;
+    res.nz = 0;
+  }
+  else
+  {
+    res.packed  = 0;
+    res.nz = mat.innerNonZeroPtr();
+  }
+
+  res.dtype   = 0;
+  res.stype   = -1;
+  
+  if (internal::is_same<_Index,int>::value)
+  {
+    res.itype = CHOLMOD_INT;
+  }
+  else if (internal::is_same<_Index,UF_long>::value)
+  {
+    res.itype = CHOLMOD_LONG;
+  }
+  else
+  {
+    eigen_assert(false && "Index type not supported yet");
+  }
+
+  // setup res.xtype
+  internal::cholmod_configure_matrix<_Scalar>(res);
+  
+  res.stype = 0;
+  
+  return res;
+}
+
+template<typename _Scalar, int _Options, typename _Index>
+const cholmod_sparse viewAsCholmod(const SparseMatrix<_Scalar,_Options,_Index>& mat)
+{
+  cholmod_sparse res = viewAsCholmod(mat.const_cast_derived());
+  return res;
+}
+
+/** Returns a view of the Eigen sparse matrix \a mat as Cholmod sparse matrix.
+  * The data are not copied but shared. */
+template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
+cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
+{
+  cholmod_sparse res = viewAsCholmod(mat.matrix().const_cast_derived());
+  
+  if(UpLo==Upper) res.stype =  1;
+  if(UpLo==Lower) res.stype = -1;
+
+  return res;
+}
+
+/** Returns a view of the Eigen \b dense matrix \a mat as Cholmod dense matrix.
+  * The data are not copied but shared. */
+template<typename Derived>
+cholmod_dense viewAsCholmod(MatrixBase<Derived>& mat)
+{
+  EIGEN_STATIC_ASSERT((internal::traits<Derived>::Flags&RowMajorBit)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  typedef typename Derived::Scalar Scalar;
+
+  cholmod_dense res;
+  res.nrow   = mat.rows();
+  res.ncol   = mat.cols();
+  res.nzmax  = res.nrow * res.ncol;
+  res.d      = Derived::IsVectorAtCompileTime ? mat.derived().size() : mat.derived().outerStride();
+  res.x      = (void*)(mat.derived().data());
+  res.z      = 0;
+
+  internal::cholmod_configure_matrix<Scalar>(res);
+
+  return res;
+}
+
+/** Returns a view of the Cholmod sparse matrix \a cm as an Eigen sparse matrix.
+  * The data are not copied but shared. */
+template<typename Scalar, int Flags, typename Index>
+MappedSparseMatrix<Scalar,Flags,Index> viewAsEigen(cholmod_sparse& cm)
+{
+  return MappedSparseMatrix<Scalar,Flags,Index>
+         (cm.nrow, cm.ncol, static_cast<Index*>(cm.p)[cm.ncol],
+          static_cast<Index*>(cm.p), static_cast<Index*>(cm.i),static_cast<Scalar*>(cm.x) );
+}
+
+enum CholmodMode {
+  CholmodAuto, CholmodSimplicialLLt, CholmodSupernodalLLt, CholmodLDLt
+};
+
+
+/** \ingroup CholmodSupport_Module
+  * \class CholmodBase
+  * \brief The base class for the direct Cholesky factorization of Cholmod
+  * \sa class CholmodSupernodalLLT, class CholmodSimplicialLDLT, class CholmodSimplicialLLT
+  */
+template<typename _MatrixType, int _UpLo, typename Derived>
+class CholmodBase : internal::noncopyable
+{
+  public:
+    typedef _MatrixType MatrixType;
+    enum { UpLo = _UpLo };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef MatrixType CholMatrixType;
+    typedef typename MatrixType::Index Index;
+
+  public:
+
+    CholmodBase()
+      : m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
+    {
+      m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
+      cholmod_start(&m_cholmod);
+    }
+
+    CholmodBase(const MatrixType& matrix)
+      : m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
+    {
+      m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
+      cholmod_start(&m_cholmod);
+      compute(matrix);
+    }
+
+    ~CholmodBase()
+    {
+      if(m_cholmodFactor)
+        cholmod_free_factor(&m_cholmodFactor, &m_cholmod);
+      cholmod_finish(&m_cholmod);
+    }
+    
+    inline Index cols() const { return m_cholmodFactor->n; }
+    inline Index rows() const { return m_cholmodFactor->n; }
+    
+    Derived& derived() { return *static_cast<Derived*>(this); }
+    const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful,
+      *          \c NumericalIssue if the matrix.appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return m_info;
+    }
+
+    /** Computes the sparse Cholesky decomposition of \a matrix */
+    Derived& compute(const MatrixType& matrix)
+    {
+      analyzePattern(matrix);
+      factorize(matrix);
+      return derived();
+    }
+    
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<CholmodBase, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "LLT is not initialized.");
+      eigen_assert(rows()==b.rows()
+                && "CholmodDecomposition::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::solve_retval<CholmodBase, Rhs>(*this, b.derived());
+    }
+    
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::sparse_solve_retval<CholmodBase, Rhs>
+    solve(const SparseMatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "LLT is not initialized.");
+      eigen_assert(rows()==b.rows()
+                && "CholmodDecomposition::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::sparse_solve_retval<CholmodBase, Rhs>(*this, b.derived());
+    }
+    
+    /** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
+      * 
+      * \sa factorize()
+      */
+    void analyzePattern(const MatrixType& matrix)
+    {
+      if(m_cholmodFactor)
+      {
+        cholmod_free_factor(&m_cholmodFactor, &m_cholmod);
+        m_cholmodFactor = 0;
+      }
+      cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
+      m_cholmodFactor = cholmod_analyze(&A, &m_cholmod);
+      
+      this->m_isInitialized = true;
+      this->m_info = Success;
+      m_analysisIsOk = true;
+      m_factorizationIsOk = false;
+    }
+    
+    /** Performs a numeric decomposition of \a matrix
+      *
+      * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.
+      *
+      * \sa analyzePattern()
+      */
+    void factorize(const MatrixType& matrix)
+    {
+      eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
+      cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
+      cholmod_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, &m_cholmod);
+      
+      // If the factorization failed, minor is the column at which it did. On success minor == n.
+      this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
+      m_factorizationIsOk = true;
+    }
+    
+    /** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations.
+     *  See the Cholmod user guide for details. */
+    cholmod_common& cholmod() { return m_cholmod; }
+    
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal */
+    template<typename Rhs,typename Dest>
+    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    {
+      eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
+      const Index size = m_cholmodFactor->n;
+      EIGEN_UNUSED_VARIABLE(size);
+      eigen_assert(size==b.rows());
+
+      // note: cd stands for Cholmod Dense
+      Rhs& b_ref(b.const_cast_derived());
+      cholmod_dense b_cd = viewAsCholmod(b_ref);
+      cholmod_dense* x_cd = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod);
+      if(!x_cd)
+      {
+        this->m_info = NumericalIssue;
+      }
+      // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
+      dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
+      cholmod_free_dense(&x_cd, &m_cholmod);
+    }
+    
+    /** \internal */
+    template<typename RhsScalar, int RhsOptions, typename RhsIndex, typename DestScalar, int DestOptions, typename DestIndex>
+    void _solve(const SparseMatrix<RhsScalar,RhsOptions,RhsIndex> &b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
+    {
+      eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
+      const Index size = m_cholmodFactor->n;
+      EIGEN_UNUSED_VARIABLE(size);
+      eigen_assert(size==b.rows());
+
+      // note: cs stands for Cholmod Sparse
+      cholmod_sparse b_cs = viewAsCholmod(b);
+      cholmod_sparse* x_cs = cholmod_spsolve(CHOLMOD_A, m_cholmodFactor, &b_cs, &m_cholmod);
+      if(!x_cs)
+      {
+        this->m_info = NumericalIssue;
+      }
+      // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
+      dest = viewAsEigen<DestScalar,DestOptions,DestIndex>(*x_cs);
+      cholmod_free_sparse(&x_cs, &m_cholmod);
+    }
+    #endif // EIGEN_PARSED_BY_DOXYGEN
+    
+    
+    /** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization.
+      *
+      * During the numerical factorization, an offset term is added to the diagonal coefficients:\n
+      * \c d_ii = \a offset + \c d_ii
+      *
+      * The default is \a offset=0.
+      *
+      * \returns a reference to \c *this.
+      */
+    Derived& setShift(const RealScalar& offset)
+    {
+      m_shiftOffset[0] = offset;
+      return derived();
+    }
+    
+    template<typename Stream>
+    void dumpMemory(Stream& /*s*/)
+    {}
+    
+  protected:
+    mutable cholmod_common m_cholmod;
+    cholmod_factor* m_cholmodFactor;
+    RealScalar m_shiftOffset[2];
+    mutable ComputationInfo m_info;
+    bool m_isInitialized;
+    int m_factorizationIsOk;
+    int m_analysisIsOk;
+};
+
+/** \ingroup CholmodSupport_Module
+  * \class CholmodSimplicialLLT
+  * \brief A simplicial direct Cholesky (LLT) factorization and solver based on Cholmod
+  *
+  * This class allows to solve for A.X = B sparse linear problems via a simplicial LL^T Cholesky factorization
+  * using the Cholmod library.
+  * This simplicial variant is equivalent to Eigen's built-in SimplicialLLT class. Therefore, it has little practical interest.
+  * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
+  * X and B can be either dense or sparse.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
+  *               or Upper. Default is Lower.
+  *
+  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
+  *
+  * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLLT
+  */
+template<typename _MatrixType, int _UpLo = Lower>
+class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT<_MatrixType, _UpLo> >
+{
+    typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base;
+    using Base::m_cholmod;
+    
+  public:
+    
+    typedef _MatrixType MatrixType;
+    
+    CholmodSimplicialLLT() : Base() { init(); }
+
+    CholmodSimplicialLLT(const MatrixType& matrix) : Base()
+    {
+      init();
+      compute(matrix);
+    }
+
+    ~CholmodSimplicialLLT() {}
+  protected:
+    void init()
+    {
+      m_cholmod.final_asis = 0;
+      m_cholmod.supernodal = CHOLMOD_SIMPLICIAL;
+      m_cholmod.final_ll = 1;
+    }
+};
+
+
+/** \ingroup CholmodSupport_Module
+  * \class CholmodSimplicialLDLT
+  * \brief A simplicial direct Cholesky (LDLT) factorization and solver based on Cholmod
+  *
+  * This class allows to solve for A.X = B sparse linear problems via a simplicial LDL^T Cholesky factorization
+  * using the Cholmod library.
+  * This simplicial variant is equivalent to Eigen's built-in SimplicialLDLT class. Therefore, it has little practical interest.
+  * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
+  * X and B can be either dense or sparse.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
+  *               or Upper. Default is Lower.
+  *
+  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
+  *
+  * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLDLT
+  */
+template<typename _MatrixType, int _UpLo = Lower>
+class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT<_MatrixType, _UpLo> >
+{
+    typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base;
+    using Base::m_cholmod;
+    
+  public:
+    
+    typedef _MatrixType MatrixType;
+    
+    CholmodSimplicialLDLT() : Base() { init(); }
+
+    CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
+    {
+      init();
+      compute(matrix);
+    }
+
+    ~CholmodSimplicialLDLT() {}
+  protected:
+    void init()
+    {
+      m_cholmod.final_asis = 1;
+      m_cholmod.supernodal = CHOLMOD_SIMPLICIAL;
+    }
+};
+
+/** \ingroup CholmodSupport_Module
+  * \class CholmodSupernodalLLT
+  * \brief A supernodal Cholesky (LLT) factorization and solver based on Cholmod
+  *
+  * This class allows to solve for A.X = B sparse linear problems via a supernodal LL^T Cholesky factorization
+  * using the Cholmod library.
+  * This supernodal variant performs best on dense enough problems, e.g., 3D FEM, or very high order 2D FEM.
+  * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
+  * X and B can be either dense or sparse.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
+  *               or Upper. Default is Lower.
+  *
+  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
+  *
+  * \sa \ref TutorialSparseDirectSolvers
+  */
+template<typename _MatrixType, int _UpLo = Lower>
+class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT<_MatrixType, _UpLo> >
+{
+    typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base;
+    using Base::m_cholmod;
+    
+  public:
+    
+    typedef _MatrixType MatrixType;
+    
+    CholmodSupernodalLLT() : Base() { init(); }
+
+    CholmodSupernodalLLT(const MatrixType& matrix) : Base()
+    {
+      init();
+      compute(matrix);
+    }
+
+    ~CholmodSupernodalLLT() {}
+  protected:
+    void init()
+    {
+      m_cholmod.final_asis = 1;
+      m_cholmod.supernodal = CHOLMOD_SUPERNODAL;
+    }
+};
+
+/** \ingroup CholmodSupport_Module
+  * \class CholmodDecomposition
+  * \brief A general Cholesky factorization and solver based on Cholmod
+  *
+  * This class allows to solve for A.X = B sparse linear problems via a LL^T or LDL^T Cholesky factorization
+  * using the Cholmod library. The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
+  * X and B can be either dense or sparse.
+  *
+  * This variant permits to change the underlying Cholesky method at runtime.
+  * On the other hand, it does not provide access to the result of the factorization.
+  * The default is to let Cholmod automatically choose between a simplicial and supernodal factorization.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
+  *               or Upper. Default is Lower.
+  *
+  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
+  *
+  * \sa \ref TutorialSparseDirectSolvers
+  */
+template<typename _MatrixType, int _UpLo = Lower>
+class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecomposition<_MatrixType, _UpLo> >
+{
+    typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base;
+    using Base::m_cholmod;
+    
+  public:
+    
+    typedef _MatrixType MatrixType;
+    
+    CholmodDecomposition() : Base() { init(); }
+
+    CholmodDecomposition(const MatrixType& matrix) : Base()
+    {
+      init();
+      compute(matrix);
+    }
+
+    ~CholmodDecomposition() {}
+    
+    void setMode(CholmodMode mode)
+    {
+      switch(mode)
+      {
+        case CholmodAuto:
+          m_cholmod.final_asis = 1;
+          m_cholmod.supernodal = CHOLMOD_AUTO;
+          break;
+        case CholmodSimplicialLLt:
+          m_cholmod.final_asis = 0;
+          m_cholmod.supernodal = CHOLMOD_SIMPLICIAL;
+          m_cholmod.final_ll = 1;
+          break;
+        case CholmodSupernodalLLt:
+          m_cholmod.final_asis = 1;
+          m_cholmod.supernodal = CHOLMOD_SUPERNODAL;
+          break;
+        case CholmodLDLt:
+          m_cholmod.final_asis = 1;
+          m_cholmod.supernodal = CHOLMOD_SIMPLICIAL;
+          break;
+        default:
+          break;
+      }
+    }
+  protected:
+    void init()
+    {
+      m_cholmod.final_asis = 1;
+      m_cholmod.supernodal = CHOLMOD_AUTO;
+    }
+};
+
+namespace internal {
+  
+template<typename _MatrixType, int _UpLo, typename Derived, typename Rhs>
+struct solve_retval<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
+  : solve_retval_base<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
+{
+  typedef CholmodBase<_MatrixType,_UpLo,Derived> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+template<typename _MatrixType, int _UpLo, typename Derived, typename Rhs>
+struct sparse_solve_retval<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
+  : sparse_solve_retval_base<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
+{
+  typedef CholmodBase<_MatrixType,_UpLo,Derived> Dec;
+  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CHOLMODSUPPORT_H
diff --git a/include/eigen3/Eigen/src/Core/Array.h b/include/eigen3/Eigen/src/Core/Array.h
new file mode 100644
index 0000000..0ab03ef
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Array.h
@@ -0,0 +1,308 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARRAY_H
+#define EIGEN_ARRAY_H
+
+namespace Eigen {
+
+/** \class Array 
+  * \ingroup Core_Module
+  *
+  * \brief General-purpose arrays with easy API for coefficient-wise operations
+  *
+  * The %Array class is very similar to the Matrix class. It provides
+  * general-purpose one- and two-dimensional arrays. The difference between the
+  * %Array and the %Matrix class is primarily in the API: the API for the
+  * %Array class provides easy access to coefficient-wise operations, while the
+  * API for the %Matrix class provides easy access to linear-algebra
+  * operations.
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
+  *
+  * \sa \ref TutorialArrayClass, \ref TopicClassHierarchy
+  */
+namespace internal {
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+  typedef ArrayXpr XprKind;
+  typedef ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > XprBase;
+};
+}
+
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+class Array
+  : public PlainObjectBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+  public:
+
+    typedef PlainObjectBase<Array> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Array)
+
+    enum { Options = _Options };
+    typedef typename Base::PlainObject PlainObject;
+
+  protected:
+    template <typename Derived, typename OtherDerived, bool IsVector>
+    friend struct internal::conservative_resize_like_impl;
+
+    using Base::m_storage;
+
+  public:
+
+    using Base::base;
+    using Base::coeff;
+    using Base::coeffRef;
+
+    /**
+      * The usage of
+      *   using Base::operator=;
+      * fails on MSVC. Since the code below is working with GCC and MSVC, we skipped
+      * the usage of 'using'. This should be done only for operator=.
+      */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Array& operator=(const EigenBase<OtherDerived> &other)
+    {
+      return Base::operator=(other);
+    }
+
+    /** Copies the value of the expression \a other into \c *this with automatic resizing.
+      *
+      * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
+      * it will be initialized.
+      *
+      * Note that copying a row-vector into a vector (and conversely) is allowed.
+      * The resizing, if any, is then done in the appropriate way so that row-vectors
+      * remain row-vectors and vectors remain vectors.
+      */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Array& operator=(const ArrayBase<OtherDerived>& other)
+    {
+      return Base::_set(other);
+    }
+
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    EIGEN_STRONG_INLINE Array& operator=(const Array& other)
+    {
+      return Base::_set(other);
+    }
+
+    /** Default constructor.
+      *
+      * For fixed-size matrices, does nothing.
+      *
+      * For dynamic-size matrices, creates an empty matrix of size 0. Does not allocate any array. Such a matrix
+      * is called a null matrix. This constructor is the unique way to create null matrices: resizing
+      * a matrix to 0 is not supported.
+      *
+      * \sa resize(Index,Index)
+      */
+    EIGEN_STRONG_INLINE Array() : Base()
+    {
+      Base::_check_template_params();
+      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    // FIXME is it still needed ??
+    /** \internal */
+    Array(internal::constructor_without_unaligned_array_assert)
+      : Base(internal::constructor_without_unaligned_array_assert())
+    {
+      Base::_check_template_params();
+      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+#endif
+
+    /** Constructs a vector or row-vector with given dimension. \only_for_vectors
+      *
+      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
+      * it is redundant to pass the dimension here, so it makes more sense to use the default
+      * constructor Matrix() instead.
+      */
+    EIGEN_STRONG_INLINE explicit Array(Index dim)
+      : Base(dim, RowsAtCompileTime == 1 ? 1 : dim, ColsAtCompileTime == 1 ? 1 : dim)
+    {
+      Base::_check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Array)
+      eigen_assert(dim >= 0);
+      eigen_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
+      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename T0, typename T1>
+    EIGEN_STRONG_INLINE Array(const T0& val0, const T1& val1)
+    {
+      Base::_check_template_params();
+      this->template _init2<T0,T1>(val0, val1);
+    }
+    #else
+    /** constructs an uninitialized matrix with \a rows rows and \a cols columns.
+      *
+      * This is useful for dynamic-size matrices. For fixed-size matrices,
+      * it is redundant to pass these parameters, so one should use the default constructor
+      * Matrix() instead. */
+    Array(Index rows, Index cols);
+    /** constructs an initialized 2D vector with given coefficients */
+    Array(const Scalar& val0, const Scalar& val1);
+    #endif
+
+    /** constructs an initialized 3D vector with given coefficients */
+    EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
+    {
+      Base::_check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Array, 3)
+      m_storage.data()[0] = val0;
+      m_storage.data()[1] = val1;
+      m_storage.data()[2] = val2;
+    }
+    /** constructs an initialized 4D vector with given coefficients */
+    EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
+    {
+      Base::_check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Array, 4)
+      m_storage.data()[0] = val0;
+      m_storage.data()[1] = val1;
+      m_storage.data()[2] = val2;
+      m_storage.data()[3] = val3;
+    }
+
+    explicit Array(const Scalar *data);
+
+    /** Constructor copying the value of the expression \a other */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Array(const ArrayBase<OtherDerived>& other)
+             : Base(other.rows() * other.cols(), other.rows(), other.cols())
+    {
+      Base::_check_template_params();
+      Base::_set_noalias(other);
+    }
+    /** Copy constructor */
+    EIGEN_STRONG_INLINE Array(const Array& other)
+            : Base(other.rows() * other.cols(), other.rows(), other.cols())
+    {
+      Base::_check_template_params();
+      Base::_set_noalias(other);
+    }
+    /** Copy constructor with in-place evaluation */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Array(const ReturnByValue<OtherDerived>& other)
+    {
+      Base::_check_template_params();
+      Base::resize(other.rows(), other.cols());
+      other.evalTo(*this);
+    }
+
+    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
+      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
+    {
+      Base::_check_template_params();
+      Base::_resize_to_match(other);
+      *this = other;
+    }
+
+    /** Override MatrixBase::swap() since for dynamic-sized matrices of same type it is enough to swap the
+      * data pointers.
+      */
+    template<typename OtherDerived>
+    void swap(ArrayBase<OtherDerived> const & other)
+    { this->_swap(other.derived()); }
+
+    inline Index innerStride() const { return 1; }
+    inline Index outerStride() const { return this->innerSize(); }
+
+    #ifdef EIGEN_ARRAY_PLUGIN
+    #include EIGEN_ARRAY_PLUGIN
+    #endif
+
+  private:
+
+    template<typename MatrixType, typename OtherDerived, bool SwapPointers>
+    friend struct internal::matrix_swap_impl;
+};
+
+/** \defgroup arraytypedefs Global array typedefs
+  * \ingroup Core_Module
+  *
+  * Eigen defines several typedef shortcuts for most common 1D and 2D array types.
+  *
+  * The general patterns are the following:
+  *
+  * \c ArrayRowsColsType where \c Rows and \c Cols can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size,
+  * and where \c Type can be \c i for integer, \c f for float, \c d for double, \c cf for complex float, \c cd
+  * for complex double.
+  *
+  * For example, \c Array33d is a fixed-size 3x3 array type of doubles, and \c ArrayXXf is a dynamic-size matrix of floats.
+  *
+  * There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is
+  * a fixed-size 1D array of 4 complex floats.
+  *
+  * \sa class Array
+  */
+
+#define EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix)   \
+/** \ingroup arraytypedefs */                                    \
+typedef Array<Type, Size, Size> Array##SizeSuffix##SizeSuffix##TypeSuffix;  \
+/** \ingroup arraytypedefs */                                    \
+typedef Array<Type, Size, 1>    Array##SizeSuffix##TypeSuffix;
+
+#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, Size)         \
+/** \ingroup arraytypedefs */                                    \
+typedef Array<Type, Size, Dynamic> Array##Size##X##TypeSuffix;  \
+/** \ingroup arraytypedefs */                                    \
+typedef Array<Type, Dynamic, Size> Array##X##Size##TypeSuffix;
+
+#define EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \
+EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 2, 2) \
+EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 3, 3) \
+EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 4, 4) \
+EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, Dynamic, X) \
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 2) \
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 3) \
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 4)
+
+EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(int,                  i)
+EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(float,                f)
+EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(double,               d)
+EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex<float>,  cf)
+EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
+
+#undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES
+#undef EIGEN_MAKE_ARRAY_TYPEDEFS
+
+#undef EIGEN_MAKE_ARRAY_TYPEDEFS_LARGE
+
+#define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
+using Eigen::Matrix##SizeSuffix##TypeSuffix; \
+using Eigen::Vector##SizeSuffix##TypeSuffix; \
+using Eigen::RowVector##SizeSuffix##TypeSuffix;
+
+#define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(TypeSuffix) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 2) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 3) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 4) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, X) \
+
+#define EIGEN_USING_ARRAY_TYPEDEFS \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(i) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(f) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(d) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(cf) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(cd)
+
+} // end namespace Eigen
+
+#endif // EIGEN_ARRAY_H
diff --git a/include/eigen3/Eigen/src/Core/ArrayBase.h b/include/eigen3/Eigen/src/Core/ArrayBase.h
new file mode 100644
index 0000000..3885260
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/ArrayBase.h
@@ -0,0 +1,228 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARRAYBASE_H
+#define EIGEN_ARRAYBASE_H
+
+namespace Eigen { 
+
+template<typename ExpressionType> class MatrixWrapper;
+
+/** \class ArrayBase
+  * \ingroup Core_Module
+  *
+  * \brief Base class for all 1D and 2D array, and related expressions
+  *
+  * An array is similar to a dense vector or matrix. While matrices are mathematical
+  * objects with well defined linear algebra operators, an array is just a collection
+  * of scalar values arranged in a one or two dimensionnal fashion. As the main consequence,
+  * all operations applied to an array are performed coefficient wise. Furthermore,
+  * arrays support scalar math functions of the c++ standard library (e.g., std::sin(x)), and convenient
+  * constructors allowing to easily write generic code working for both scalar values
+  * and arrays.
+  *
+  * This class is the base that is inherited by all array expression types.
+  *
+  * \tparam Derived is the derived type, e.g., an array or an expression type.
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAYBASE_PLUGIN.
+  *
+  * \sa class MatrixBase, \ref TopicClassHierarchy
+  */
+template<typename Derived> class ArrayBase
+  : public DenseBase<Derived>
+{
+  public:
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** The base class for a given storage type. */
+    typedef ArrayBase StorageBaseType;
+
+    typedef ArrayBase Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl;
+
+    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
+                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
+
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    typedef DenseBase<Derived> Base;
+    using Base::RowsAtCompileTime;
+    using Base::ColsAtCompileTime;
+    using Base::SizeAtCompileTime;
+    using Base::MaxRowsAtCompileTime;
+    using Base::MaxColsAtCompileTime;
+    using Base::MaxSizeAtCompileTime;
+    using Base::IsVectorAtCompileTime;
+    using Base::Flags;
+    using Base::CoeffReadCost;
+
+    using Base::derived;
+    using Base::const_cast_derived;
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::coeff;
+    using Base::coeffRef;
+    using Base::lazyAssign;
+    using Base::operator=;
+    using Base::operator+=;
+    using Base::operator-=;
+    using Base::operator*=;
+    using Base::operator/=;
+
+    typedef typename Base::CoeffReturnType CoeffReturnType;
+
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal the plain matrix type corresponding to this expression. Note that is not necessarily
+      * exactly the return type of eval(): in the case of plain matrices, the return type of eval() is a const
+      * reference to a matrix, not a matrix! It is however guaranteed that the return type of eval() is either
+      * PlainObject or const PlainObject&.
+      */
+    typedef Array<typename internal::traits<Derived>::Scalar,
+                internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime,
+                internal::traits<Derived>::MaxColsAtCompileTime
+          > PlainObject;
+
+
+    /** \internal Represents a matrix with all coefficients equal to one another*/
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+#define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
+#   include "../plugins/CommonCwiseUnaryOps.h"
+#   include "../plugins/MatrixCwiseUnaryOps.h"
+#   include "../plugins/ArrayCwiseUnaryOps.h"
+#   include "../plugins/CommonCwiseBinaryOps.h"
+#   include "../plugins/MatrixCwiseBinaryOps.h"
+#   include "../plugins/ArrayCwiseBinaryOps.h"
+#   ifdef EIGEN_ARRAYBASE_PLUGIN
+#     include EIGEN_ARRAYBASE_PLUGIN
+#   endif
+#undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+
+    /** Special case of the template operator=, in order to prevent the compiler
+      * from generating a default operator= (issue hit with g++ 4.1)
+      */
+    Derived& operator=(const ArrayBase& other)
+    {
+      return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
+    }
+
+    Derived& operator+=(const Scalar& scalar)
+    { return *this = derived() + scalar; }
+    Derived& operator-=(const Scalar& scalar)
+    { return *this = derived() - scalar; }
+
+    template<typename OtherDerived>
+    Derived& operator+=(const ArrayBase<OtherDerived>& other);
+    template<typename OtherDerived>
+    Derived& operator-=(const ArrayBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    Derived& operator*=(const ArrayBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    Derived& operator/=(const ArrayBase<OtherDerived>& other);
+
+  public:
+    ArrayBase<Derived>& array() { return *this; }
+    const ArrayBase<Derived>& array() const { return *this; }
+
+    /** \returns an \link Eigen::MatrixBase Matrix \endlink expression of this array
+      * \sa MatrixBase::array() */
+    MatrixWrapper<Derived> matrix() { return derived(); }
+    const MatrixWrapper<const Derived> matrix() const { return derived(); }
+
+//     template<typename Dest>
+//     inline void evalTo(Dest& dst) const { dst = matrix(); }
+
+  protected:
+    ArrayBase() : Base() {}
+
+  private:
+    explicit ArrayBase(Index);
+    ArrayBase(Index,Index);
+    template<typename OtherDerived> explicit ArrayBase(const ArrayBase<OtherDerived>&);
+  protected:
+    // mixing arrays and matrices is not legal
+    template<typename OtherDerived> Derived& operator+=(const MatrixBase<OtherDerived>& )
+    {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;}
+    // mixing arrays and matrices is not legal
+    template<typename OtherDerived> Derived& operator-=(const MatrixBase<OtherDerived>& )
+    {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;}
+};
+
+/** replaces \c *this by \c *this - \a other.
+  *
+  * \returns a reference to \c *this
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE Derived &
+ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
+{
+  SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
+  tmp = other.derived();
+  return derived();
+}
+
+/** replaces \c *this by \c *this + \a other.
+  *
+  * \returns a reference to \c *this
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE Derived &
+ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
+{
+  SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
+  tmp = other.derived();
+  return derived();
+}
+
+/** replaces \c *this by \c *this * \a other coefficient wise.
+  *
+  * \returns a reference to \c *this
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE Derived &
+ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
+{
+  SelfCwiseBinaryOp<internal::scalar_product_op<Scalar>, Derived, OtherDerived> tmp(derived());
+  tmp = other.derived();
+  return derived();
+}
+
+/** replaces \c *this by \c *this / \a other coefficient wise.
+  *
+  * \returns a reference to \c *this
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE Derived &
+ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
+{
+  SelfCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, Derived, OtherDerived> tmp(derived());
+  tmp = other.derived();
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_ARRAYBASE_H
diff --git a/include/eigen3/Eigen/src/Core/ArrayWrapper.h b/include/eigen3/Eigen/src/Core/ArrayWrapper.h
new file mode 100644
index 0000000..b4641e2
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/ArrayWrapper.h
@@ -0,0 +1,264 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARRAYWRAPPER_H
+#define EIGEN_ARRAYWRAPPER_H
+
+namespace Eigen { 
+
+/** \class ArrayWrapper
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a mathematical vector or matrix as an array object
+  *
+  * This class is the return type of MatrixBase::array(), and most of the time
+  * this is the only way it is use.
+  *
+  * \sa MatrixBase::array(), class MatrixWrapper
+  */
+
+namespace internal {
+template<typename ExpressionType>
+struct traits<ArrayWrapper<ExpressionType> >
+  : public traits<typename remove_all<typename ExpressionType::Nested>::type >
+{
+  typedef ArrayXpr XprKind;
+  // Let's remove NestByRefBit
+  enum {
+    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
+    Flags = Flags0 & ~NestByRefBit
+  };
+};
+}
+
+template<typename ExpressionType>
+class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
+{
+  public:
+    typedef ArrayBase<ArrayWrapper> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(ArrayWrapper)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ArrayWrapper)
+
+    typedef typename internal::conditional<
+                       internal::is_lvalue<ExpressionType>::value,
+                       Scalar,
+                       const Scalar
+                     >::type ScalarWithConstIfNotLvalue;
+
+    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
+
+    inline ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+
+    inline Index rows() const { return m_expression.rows(); }
+    inline Index cols() const { return m_expression.cols(); }
+    inline Index outerStride() const { return m_expression.outerStride(); }
+    inline Index innerStride() const { return m_expression.innerStride(); }
+
+    inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
+    inline const Scalar* data() const { return m_expression.data(); }
+
+    inline CoeffReturnType coeff(Index rowId, Index colId) const
+    {
+      return m_expression.coeff(rowId, colId);
+    }
+
+    inline Scalar& coeffRef(Index rowId, Index colId)
+    {
+      return m_expression.const_cast_derived().coeffRef(rowId, colId);
+    }
+
+    inline const Scalar& coeffRef(Index rowId, Index colId) const
+    {
+      return m_expression.const_cast_derived().coeffRef(rowId, colId);
+    }
+
+    inline CoeffReturnType coeff(Index index) const
+    {
+      return m_expression.coeff(index);
+    }
+
+    inline Scalar& coeffRef(Index index)
+    {
+      return m_expression.const_cast_derived().coeffRef(index);
+    }
+
+    inline const Scalar& coeffRef(Index index) const
+    {
+      return m_expression.const_cast_derived().coeffRef(index);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index rowId, Index colId) const
+    {
+      return m_expression.template packet<LoadMode>(rowId, colId);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
+    {
+      m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index index) const
+    {
+      return m_expression.template packet<LoadMode>(index);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index index, const PacketScalar& val)
+    {
+      m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
+    }
+
+    template<typename Dest>
+    inline void evalTo(Dest& dst) const { dst = m_expression; }
+
+    const typename internal::remove_all<NestedExpressionType>::type& 
+    nestedExpression() const 
+    {
+      return m_expression;
+    }
+
+    /** Forwards the resizing request to the nested expression
+      * \sa DenseBase::resize(Index)  */
+    void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
+    /** Forwards the resizing request to the nested expression
+      * \sa DenseBase::resize(Index,Index)*/
+    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
+
+  protected:
+    NestedExpressionType m_expression;
+};
+
+/** \class MatrixWrapper
+  * \ingroup Core_Module
+  *
+  * \brief Expression of an array as a mathematical vector or matrix
+  *
+  * This class is the return type of ArrayBase::matrix(), and most of the time
+  * this is the only way it is use.
+  *
+  * \sa MatrixBase::matrix(), class ArrayWrapper
+  */
+
+namespace internal {
+template<typename ExpressionType>
+struct traits<MatrixWrapper<ExpressionType> >
+ : public traits<typename remove_all<typename ExpressionType::Nested>::type >
+{
+  typedef MatrixXpr XprKind;
+  // Let's remove NestByRefBit
+  enum {
+    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
+    Flags = Flags0 & ~NestByRefBit
+  };
+};
+}
+
+template<typename ExpressionType>
+class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
+{
+  public:
+    typedef MatrixBase<MatrixWrapper<ExpressionType> > Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(MatrixWrapper)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(MatrixWrapper)
+
+    typedef typename internal::conditional<
+                       internal::is_lvalue<ExpressionType>::value,
+                       Scalar,
+                       const Scalar
+                     >::type ScalarWithConstIfNotLvalue;
+
+    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
+
+    inline MatrixWrapper(ExpressionType& a_matrix) : m_expression(a_matrix) {}
+
+    inline Index rows() const { return m_expression.rows(); }
+    inline Index cols() const { return m_expression.cols(); }
+    inline Index outerStride() const { return m_expression.outerStride(); }
+    inline Index innerStride() const { return m_expression.innerStride(); }
+
+    inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
+    inline const Scalar* data() const { return m_expression.data(); }
+
+    inline CoeffReturnType coeff(Index rowId, Index colId) const
+    {
+      return m_expression.coeff(rowId, colId);
+    }
+
+    inline Scalar& coeffRef(Index rowId, Index colId)
+    {
+      return m_expression.const_cast_derived().coeffRef(rowId, colId);
+    }
+
+    inline const Scalar& coeffRef(Index rowId, Index colId) const
+    {
+      return m_expression.derived().coeffRef(rowId, colId);
+    }
+
+    inline CoeffReturnType coeff(Index index) const
+    {
+      return m_expression.coeff(index);
+    }
+
+    inline Scalar& coeffRef(Index index)
+    {
+      return m_expression.const_cast_derived().coeffRef(index);
+    }
+
+    inline const Scalar& coeffRef(Index index) const
+    {
+      return m_expression.const_cast_derived().coeffRef(index);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index rowId, Index colId) const
+    {
+      return m_expression.template packet<LoadMode>(rowId, colId);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
+    {
+      m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index index) const
+    {
+      return m_expression.template packet<LoadMode>(index);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index index, const PacketScalar& val)
+    {
+      m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
+    }
+
+    const typename internal::remove_all<NestedExpressionType>::type& 
+    nestedExpression() const 
+    {
+      return m_expression;
+    }
+
+    /** Forwards the resizing request to the nested expression
+      * \sa DenseBase::resize(Index)  */
+    void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
+    /** Forwards the resizing request to the nested expression
+      * \sa DenseBase::resize(Index,Index)*/
+    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
+
+  protected:
+    NestedExpressionType m_expression;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_ARRAYWRAPPER_H
diff --git a/include/eigen3/Eigen/src/Core/Assign.h b/include/eigen3/Eigen/src/Core/Assign.h
new file mode 100644
index 0000000..1dccc2f
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Assign.h
@@ -0,0 +1,583 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Michael Olbrich <michael.olbrich at gmx.net>
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ASSIGN_H
+#define EIGEN_ASSIGN_H
+
+namespace Eigen {
+
+namespace internal {
+
+/***************************************************************************
+* Part 1 : the logic deciding a strategy for traversal and unrolling       *
+***************************************************************************/
+
+template <typename Derived, typename OtherDerived>
+struct assign_traits
+{
+public:
+  enum {
+    DstIsAligned = Derived::Flags & AlignedBit,
+    DstHasDirectAccess = Derived::Flags & DirectAccessBit,
+    SrcIsAligned = OtherDerived::Flags & AlignedBit,
+    JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned
+  };
+
+private:
+  enum {
+    InnerSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::SizeAtCompileTime)
+              : int(Derived::Flags)&RowMajorBit ? int(Derived::ColsAtCompileTime)
+              : int(Derived::RowsAtCompileTime),
+    InnerMaxSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::MaxSizeAtCompileTime)
+              : int(Derived::Flags)&RowMajorBit ? int(Derived::MaxColsAtCompileTime)
+              : int(Derived::MaxRowsAtCompileTime),
+    MaxSizeAtCompileTime = Derived::SizeAtCompileTime,
+    PacketSize = packet_traits<typename Derived::Scalar>::size
+  };
+
+  enum {
+    StorageOrdersAgree = (int(Derived::IsRowMajor) == int(OtherDerived::IsRowMajor)),
+    MightVectorize = StorageOrdersAgree
+                  && (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit),
+    MayInnerVectorize  = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
+                       && int(DstIsAligned) && int(SrcIsAligned),
+    MayLinearize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
+    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
+                       && (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
+      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
+         so it's only good for large enough sizes. */
+    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
+                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
+      /* slice vectorization can be slow, so we only want it if the slices are big, which is
+         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
+         in a fixed-size matrix */
+  };
+
+public:
+  enum {
+    Traversal = int(MayInnerVectorize)  ? int(InnerVectorizedTraversal)
+              : int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
+              : int(MaySliceVectorize)  ? int(SliceVectorizedTraversal)
+              : int(MayLinearize)       ? int(LinearTraversal)
+                                        : int(DefaultTraversal),
+    Vectorized = int(Traversal) == InnerVectorizedTraversal
+              || int(Traversal) == LinearVectorizedTraversal
+              || int(Traversal) == SliceVectorizedTraversal
+  };
+
+private:
+  enum {
+    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
+    MayUnrollCompletely = int(Derived::SizeAtCompileTime) != Dynamic
+                       && int(OtherDerived::CoeffReadCost) != Dynamic
+                       && int(Derived::SizeAtCompileTime) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit),
+    MayUnrollInner      = int(InnerSize) != Dynamic
+                       && int(OtherDerived::CoeffReadCost) != Dynamic
+                       && int(InnerSize) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit)
+  };
+
+public:
+  enum {
+    Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
+                ? (
+                    int(MayUnrollCompletely) ? int(CompleteUnrolling)
+                  : int(MayUnrollInner)      ? int(InnerUnrolling)
+                                             : int(NoUnrolling)
+                  )
+              : int(Traversal) == int(LinearVectorizedTraversal)
+                ? ( bool(MayUnrollCompletely) && bool(DstIsAligned) ? int(CompleteUnrolling) : int(NoUnrolling) )
+              : int(Traversal) == int(LinearTraversal)
+                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) )
+              : int(NoUnrolling)
+  };
+
+#ifdef EIGEN_DEBUG_ASSIGN
+  static void debug()
+  {
+    EIGEN_DEBUG_VAR(DstIsAligned)
+    EIGEN_DEBUG_VAR(SrcIsAligned)
+    EIGEN_DEBUG_VAR(JointAlignment)
+    EIGEN_DEBUG_VAR(InnerSize)
+    EIGEN_DEBUG_VAR(InnerMaxSize)
+    EIGEN_DEBUG_VAR(PacketSize)
+    EIGEN_DEBUG_VAR(StorageOrdersAgree)
+    EIGEN_DEBUG_VAR(MightVectorize)
+    EIGEN_DEBUG_VAR(MayLinearize)
+    EIGEN_DEBUG_VAR(MayInnerVectorize)
+    EIGEN_DEBUG_VAR(MayLinearVectorize)
+    EIGEN_DEBUG_VAR(MaySliceVectorize)
+    EIGEN_DEBUG_VAR(Traversal)
+    EIGEN_DEBUG_VAR(UnrollingLimit)
+    EIGEN_DEBUG_VAR(MayUnrollCompletely)
+    EIGEN_DEBUG_VAR(MayUnrollInner)
+    EIGEN_DEBUG_VAR(Unrolling)
+  }
+#endif
+};
+
+/***************************************************************************
+* Part 2 : meta-unrollers
+***************************************************************************/
+
+/************************
+*** Default traversal ***
+************************/
+
+template<typename Derived1, typename Derived2, int Index, int Stop>
+struct assign_DefaultTraversal_CompleteUnrolling
+{
+  enum {
+    outer = Index / Derived1::InnerSizeAtCompileTime,
+    inner = Index % Derived1::InnerSizeAtCompileTime
+  };
+
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    dst.copyCoeffByOuterInner(outer, inner, src);
+    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Stop>
+struct assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
+};
+
+template<typename Derived1, typename Derived2, int Index, int Stop>
+struct assign_DefaultTraversal_InnerUnrolling
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
+  {
+    dst.copyCoeffByOuterInner(outer, Index, src);
+    assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src, outer);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Stop>
+struct assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
+};
+
+/***********************
+*** Linear traversal ***
+***********************/
+
+template<typename Derived1, typename Derived2, int Index, int Stop>
+struct assign_LinearTraversal_CompleteUnrolling
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    dst.copyCoeff(Index, src);
+    assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Stop>
+struct assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template<typename Derived1, typename Derived2, int Index, int Stop>
+struct assign_innervec_CompleteUnrolling
+{
+  enum {
+    outer = Index / Derived1::InnerSizeAtCompileTime,
+    inner = Index % Derived1::InnerSizeAtCompileTime,
+    JointAlignment = assign_traits<Derived1,Derived2>::JointAlignment
+  };
+
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    dst.template copyPacketByOuterInner<Derived2, Aligned, JointAlignment>(outer, inner, src);
+    assign_innervec_CompleteUnrolling<Derived1, Derived2,
+      Index+packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Stop>
+struct assign_innervec_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
+};
+
+template<typename Derived1, typename Derived2, int Index, int Stop>
+struct assign_innervec_InnerUnrolling
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
+  {
+    dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, Index, src);
+    assign_innervec_InnerUnrolling<Derived1, Derived2,
+      Index+packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src, outer);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Stop>
+struct assign_innervec_InnerUnrolling<Derived1, Derived2, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
+};
+
+/***************************************************************************
+* Part 3 : implementation of all cases
+***************************************************************************/
+
+template<typename Derived1, typename Derived2,
+         int Traversal = assign_traits<Derived1, Derived2>::Traversal,
+         int Unrolling = assign_traits<Derived1, Derived2>::Unrolling,
+         int Version = Specialized>
+struct assign_impl;
+
+/************************
+*** Default traversal ***
+************************/
+
+template<typename Derived1, typename Derived2, int Unrolling, int Version>
+struct assign_impl<Derived1, Derived2, InvalidTraversal, Unrolling, Version>
+{
+  static inline void run(Derived1 &, const Derived2 &) { }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    const Index innerSize = dst.innerSize();
+    const Index outerSize = dst.outerSize();
+    for(Index outer = 0; outer < outerSize; ++outer)
+      for(Index inner = 0; inner < innerSize; ++inner)
+        dst.copyCoeffByOuterInner(outer, inner, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, DefaultTraversal, CompleteUnrolling, Version>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
+      ::run(dst, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, DefaultTraversal, InnerUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    const Index outerSize = dst.outerSize();
+    for(Index outer = 0; outer < outerSize; ++outer)
+      assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
+        ::run(dst, src, outer);
+  }
+};
+
+/***********************
+*** Linear traversal ***
+***********************/
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, LinearTraversal, NoUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    const Index size = dst.size();
+    for(Index i = 0; i < size; ++i)
+      dst.copyCoeff(i, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, LinearTraversal, CompleteUnrolling, Version>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
+      ::run(dst, src);
+  }
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, NoUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    const Index innerSize = dst.innerSize();
+    const Index outerSize = dst.outerSize();
+    const Index packetSize = packet_traits<typename Derived1::Scalar>::size;
+    for(Index outer = 0; outer < outerSize; ++outer)
+      for(Index inner = 0; inner < innerSize; inner+=packetSize)
+        dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, inner, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, CompleteUnrolling, Version>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
+      ::run(dst, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, InnerUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    const Index outerSize = dst.outerSize();
+    for(Index outer = 0; outer < outerSize; ++outer)
+      assign_innervec_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
+        ::run(dst, src, outer);
+  }
+};
+
+/***************************
+*** Linear vectorization ***
+***************************/
+
+template <bool IsAligned = false>
+struct unaligned_assign_impl
+{
+  template <typename Derived, typename OtherDerived>
+  static EIGEN_STRONG_INLINE void run(const Derived&, OtherDerived&, typename Derived::Index, typename Derived::Index) {}
+};
+
+template <>
+struct unaligned_assign_impl<false>
+{
+  // MSVC must not inline this functions. If it does, it fails to optimize the
+  // packet access path.
+#ifdef _MSC_VER
+  template <typename Derived, typename OtherDerived>
+  static EIGEN_DONT_INLINE void run(const Derived& src, OtherDerived& dst, typename Derived::Index start, typename Derived::Index end)
+#else
+  template <typename Derived, typename OtherDerived>
+  static EIGEN_STRONG_INLINE void run(const Derived& src, OtherDerived& dst, typename Derived::Index start, typename Derived::Index end)
+#endif
+  {
+    for (typename Derived::Index index = start; index < end; ++index)
+      dst.copyCoeff(index, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    const Index size = dst.size();
+    typedef packet_traits<typename Derived1::Scalar> PacketTraits;
+    enum {
+      packetSize = PacketTraits::size,
+      dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : int(assign_traits<Derived1,Derived2>::DstIsAligned) ,
+      srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
+    };
+    const Index alignedStart = assign_traits<Derived1,Derived2>::DstIsAligned ? 0
+                             : internal::first_aligned(&dst.coeffRef(0), size);
+    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
+
+    unaligned_assign_impl<assign_traits<Derived1,Derived2>::DstIsAligned!=0>::run(src,dst,0,alignedStart);
+
+    for(Index index = alignedStart; index < alignedEnd; index += packetSize)
+    {
+      dst.template copyPacket<Derived2, dstAlignment, srcAlignment>(index, src);
+    }
+
+    unaligned_assign_impl<>::run(src,dst,alignedEnd,size);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, LinearVectorizedTraversal, CompleteUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    enum { size = Derived1::SizeAtCompileTime,
+           packetSize = packet_traits<typename Derived1::Scalar>::size,
+           alignedSize = (size/packetSize)*packetSize };
+
+    assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, alignedSize>::run(dst, src);
+    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, alignedSize, size>::run(dst, src);
+  }
+};
+
+/**************************
+*** Slice vectorization ***
+***************************/
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, SliceVectorizedTraversal, NoUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    typedef packet_traits<typename Derived1::Scalar> PacketTraits;
+    enum {
+      packetSize = PacketTraits::size,
+      alignable = PacketTraits::AlignedOnScalar,
+      dstAlignment = alignable ? Aligned : int(assign_traits<Derived1,Derived2>::DstIsAligned) ,
+      srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
+    };
+    const Index packetAlignedMask = packetSize - 1;
+    const Index innerSize = dst.innerSize();
+    const Index outerSize = dst.outerSize();
+    const Index alignedStep = alignable ? (packetSize - dst.outerStride() % packetSize) & packetAlignedMask : 0;
+    Index alignedStart = ((!alignable) || assign_traits<Derived1,Derived2>::DstIsAligned) ? 0
+                       : internal::first_aligned(&dst.coeffRef(0,0), innerSize);
+
+    for(Index outer = 0; outer < outerSize; ++outer)
+    {
+      const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
+      // do the non-vectorizable part of the assignment
+      for(Index inner = 0; inner<alignedStart ; ++inner)
+        dst.copyCoeffByOuterInner(outer, inner, src);
+
+      // do the vectorizable part of the assignment
+      for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
+        dst.template copyPacketByOuterInner<Derived2, dstAlignment, Unaligned>(outer, inner, src);
+
+      // do the non-vectorizable part of the assignment
+      for(Index inner = alignedEnd; inner<innerSize ; ++inner)
+        dst.copyCoeffByOuterInner(outer, inner, src);
+
+      alignedStart = std::min<Index>((alignedStart+alignedStep)%packetSize, innerSize);
+    }
+  }
+};
+
+} // end namespace internal
+
+/***************************************************************************
+* Part 4 : implementation of DenseBase methods
+***************************************************************************/
+
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
+  ::lazyAssign(const DenseBase<OtherDerived>& other)
+{
+  enum{
+    SameType = internal::is_same<typename Derived::Scalar,typename OtherDerived::Scalar>::value
+  };
+
+  EIGEN_STATIC_ASSERT_LVALUE(Derived)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived)
+  EIGEN_STATIC_ASSERT(SameType,YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+#ifdef EIGEN_DEBUG_ASSIGN
+  internal::assign_traits<Derived, OtherDerived>::debug();
+#endif
+  eigen_assert(rows() == other.rows() && cols() == other.cols());
+  internal::assign_impl<Derived, OtherDerived, int(SameType) ? int(internal::assign_traits<Derived, OtherDerived>::Traversal)
+                                                       : int(InvalidTraversal)>::run(derived(),other.derived());
+#ifndef EIGEN_NO_DEBUG
+  checkTransposeAliasing(other.derived());
+#endif
+  return derived();
+}
+
+namespace internal {
+
+template<typename Derived, typename OtherDerived,
+         bool EvalBeforeAssigning = (int(internal::traits<OtherDerived>::Flags) & EvalBeforeAssigningBit) != 0,
+         bool NeedToTranspose = ((int(Derived::RowsAtCompileTime) == 1 && int(OtherDerived::ColsAtCompileTime) == 1)
+                              |   // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
+                                  // revert to || as soon as not needed anymore.
+                                  (int(Derived::ColsAtCompileTime) == 1 && int(OtherDerived::RowsAtCompileTime) == 1))
+                              && int(Derived::SizeAtCompileTime) != 1>
+struct assign_selector;
+
+template<typename Derived, typename OtherDerived>
+struct assign_selector<Derived,OtherDerived,false,false> {
+  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
+  template<typename ActualDerived, typename ActualOtherDerived>
+  static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { other.evalTo(dst); return dst; }
+};
+template<typename Derived, typename OtherDerived>
+struct assign_selector<Derived,OtherDerived,true,false> {
+  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.eval()); }
+};
+template<typename Derived, typename OtherDerived>
+struct assign_selector<Derived,OtherDerived,false,true> {
+  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose()); }
+  template<typename ActualDerived, typename ActualOtherDerived>
+  static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { Transpose<ActualDerived> dstTrans(dst); other.evalTo(dstTrans); return dst; }
+};
+template<typename Derived, typename OtherDerived>
+struct assign_selector<Derived,OtherDerived,true,true> {
+  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
+};
+
+} // end namespace internal
+
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
+{
+  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
+}
+
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase& other)
+{
+  return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
+}
+
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& other)
+{
+  return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
+}
+
+template<typename Derived>
+template <typename OtherDerived>
+EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
+{
+  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
+}
+
+template<typename Derived>
+template <typename OtherDerived>
+EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const EigenBase<OtherDerived>& other)
+{
+  return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+{
+  return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_ASSIGN_H
diff --git a/include/eigen3/Eigen/src/Core/Assign_MKL.h b/include/eigen3/Eigen/src/Core/Assign_MKL.h
new file mode 100644
index 0000000..7772951
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Assign_MKL.h
@@ -0,0 +1,224 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *   MKL VML support for coefficient-wise unary Eigen expressions like a=b.sin()
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_ASSIGN_VML_H
+#define EIGEN_ASSIGN_VML_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Op> struct vml_call
+{ enum { IsSupported = 0 }; };
+
+template<typename Dst, typename Src, typename UnaryOp>
+class vml_assign_traits
+{
+  private:
+    enum {
+      DstHasDirectAccess = Dst::Flags & DirectAccessBit,
+      SrcHasDirectAccess = Src::Flags & DirectAccessBit,
+
+      StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
+      InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
+                : int(Dst::Flags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
+                : int(Dst::RowsAtCompileTime),
+      InnerMaxSize  = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
+                    : int(Dst::Flags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
+                    : int(Dst::MaxRowsAtCompileTime),
+      MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
+
+      MightEnableVml =  vml_call<UnaryOp>::IsSupported && StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess
+                     && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1,
+      MightLinearize = MightEnableVml && (int(Dst::Flags) & int(Src::Flags) & LinearAccessBit),
+      VmlSize = MightLinearize ? MaxSizeAtCompileTime : InnerMaxSize,
+      LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD,
+      MayEnableVml = MightEnableVml && LargeEnough,
+      MayLinearize = MayEnableVml && MightLinearize
+    };
+  public:
+    enum {
+      Traversal = MayLinearize ? LinearVectorizedTraversal
+                : MayEnableVml ? InnerVectorizedTraversal
+                : DefaultTraversal
+    };
+};
+
+template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling,
+         int VmlTraversal = vml_assign_traits<Derived1, Derived2, UnaryOp>::Traversal >
+struct vml_assign_impl
+  : assign_impl<Derived1, Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>
+{
+};
+
+template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling>
+struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, InnerVectorizedTraversal>
+{
+  typedef typename Derived1::Scalar Scalar;
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
+  {
+    // in case we want to (or have to) skip VML at runtime we can call:
+    // assign_impl<Derived1,Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>::run(dst,src);
+    const Index innerSize = dst.innerSize();
+    const Index outerSize = dst.outerSize();
+    for(Index outer = 0; outer < outerSize; ++outer) {
+      const Scalar *src_ptr = src.IsRowMajor ?  &(src.nestedExpression().coeffRef(outer,0)) :
+                                                &(src.nestedExpression().coeffRef(0, outer));
+      Scalar *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));
+      vml_call<UnaryOp>::run(src.functor(), innerSize, src_ptr, dst_ptr );
+    }
+  }
+};
+
+template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling>
+struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, LinearVectorizedTraversal>
+{
+  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
+  {
+    // in case we want to (or have to) skip VML at runtime we can call:
+    // assign_impl<Derived1,Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>::run(dst,src);
+    vml_call<UnaryOp>::run(src.functor(), dst.size(), src.nestedExpression().data(), dst.data() );
+  }
+};
+
+// Macroses
+
+#define EIGEN_MKL_VML_SPECIALIZE_ASSIGN(TRAVERSAL,UNROLLING) \
+  template<typename Derived1, typename Derived2, typename UnaryOp> \
+  struct assign_impl<Derived1, Eigen::CwiseUnaryOp<UnaryOp, Derived2>, TRAVERSAL, UNROLLING, Specialized>  {  \
+    static inline void run(Derived1 &dst, const Eigen::CwiseUnaryOp<UnaryOp, Derived2> &src) { \
+      vml_assign_impl<Derived1,Derived2,UnaryOp,TRAVERSAL,UNROLLING>::run(dst, src); \
+    } \
+  };
+
+EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,NoUnrolling)
+EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,CompleteUnrolling)
+EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,InnerUnrolling)
+EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,NoUnrolling)
+EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,CompleteUnrolling)
+EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,NoUnrolling)
+EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,CompleteUnrolling)
+EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,InnerUnrolling)
+EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,CompleteUnrolling)
+EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,NoUnrolling)
+EIGEN_MKL_VML_SPECIALIZE_ASSIGN(SliceVectorizedTraversal,NoUnrolling)
+
+
+#if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1)
+#define  EIGEN_MKL_VML_MODE VML_HA
+#else
+#define  EIGEN_MKL_VML_MODE VML_LA
+#endif
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)     \
+  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
+    enum { IsSupported = 1 };                                                    \
+    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& /*func*/,        \
+                            int size, const EIGENTYPE* src, EIGENTYPE* dst) {    \
+      VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst);                           \
+    }                                                                            \
+  };
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)  \
+  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
+    enum { IsSupported = 1 };                                                    \
+    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& /*func*/,        \
+                            int size, const EIGENTYPE* src, EIGENTYPE* dst) {    \
+      MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE;                                    \
+      VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst, vmlMode);                  \
+    }                                                                            \
+  };
+
+#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)       \
+  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
+    enum { IsSupported = 1 };                                                    \
+    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& func,        \
+                          int size, const EIGENTYPE* src, EIGENTYPE* dst) {      \
+      EIGENTYPE exponent = func.m_exponent;                                      \
+      MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE;                                    \
+      VMLOP(&size, (const VMLTYPE*)src, (const VMLTYPE*)&exponent,               \
+                        (VMLTYPE*)dst, &vmlMode);                                \
+    }                                                                            \
+  };
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP)                   \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vs##VMLOP, float, float)             \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vd##VMLOP, double, double)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP)                \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vc##VMLOP, scomplex, MKL_Complex8)   \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vz##VMLOP, dcomplex, MKL_Complex16)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP)                        \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP)                         \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP)
+
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP)                \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vms##VMLOP, float, float)         \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmd##VMLOP, double, double)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP)             \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmc##VMLOP, scomplex, MKL_Complex8)  \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmz##VMLOP, dcomplex, MKL_Complex16)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(EIGENOP, VMLOP)                     \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP)                      \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP)
+
+
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sin,  Sin)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos,  Cos)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan,  Tan)
+//EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs,  Abs)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp,  Exp)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log,  Ln)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sqrt, Sqrt)
+
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr)
+
+// The vm*powx functions are not avaibale in the windows version of MKL.
+#ifndef _WIN32
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmspowx_, float, float)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdpowx_, double, double)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcpowx_, scomplex, MKL_Complex8)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzpowx_, dcomplex, MKL_Complex16)
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_ASSIGN_VML_H
diff --git a/include/eigen3/Eigen/src/Core/BandMatrix.h b/include/eigen3/Eigen/src/Core/BandMatrix.h
new file mode 100644
index 0000000..ffd7fe8
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/BandMatrix.h
@@ -0,0 +1,334 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BANDMATRIX_H
+#define EIGEN_BANDMATRIX_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Derived>
+class BandMatrixBase : public EigenBase<Derived>
+{
+  public:
+
+    enum {
+      Flags = internal::traits<Derived>::Flags,
+      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
+      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+      Supers = internal::traits<Derived>::Supers,
+      Subs   = internal::traits<Derived>::Subs,
+      Options = internal::traits<Derived>::Options
+    };
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> DenseMatrixType;
+    typedef typename DenseMatrixType::Index Index;
+    typedef typename internal::traits<Derived>::CoefficientsType CoefficientsType;
+    typedef EigenBase<Derived> Base;
+
+  protected:
+    enum {
+      DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic))
+                            ? 1 + Supers + Subs
+                            : Dynamic,
+      SizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime,ColsAtCompileTime)
+    };
+
+  public:
+    
+    using Base::derived;
+    using Base::rows;
+    using Base::cols;
+
+    /** \returns the number of super diagonals */
+    inline Index supers() const { return derived().supers(); }
+
+    /** \returns the number of sub diagonals */
+    inline Index subs() const { return derived().subs(); }
+    
+    /** \returns an expression of the underlying coefficient matrix */
+    inline const CoefficientsType& coeffs() const { return derived().coeffs(); }
+    
+    /** \returns an expression of the underlying coefficient matrix */
+    inline CoefficientsType& coeffs() { return derived().coeffs(); }
+
+    /** \returns a vector expression of the \a i -th column,
+      * only the meaningful part is returned.
+      * \warning the internal storage must be column major. */
+    inline Block<CoefficientsType,Dynamic,1> col(Index i)
+    {
+      EIGEN_STATIC_ASSERT((Options&RowMajor)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+      Index start = 0;
+      Index len = coeffs().rows();
+      if (i<=supers())
+      {
+        start = supers()-i;
+        len = (std::min)(rows(),std::max<Index>(0,coeffs().rows() - (supers()-i)));
+      }
+      else if (i>=rows()-subs())
+        len = std::max<Index>(0,coeffs().rows() - (i + 1 - rows() + subs()));
+      return Block<CoefficientsType,Dynamic,1>(coeffs(), start, i, len, 1);
+    }
+
+    /** \returns a vector expression of the main diagonal */
+    inline Block<CoefficientsType,1,SizeAtCompileTime> diagonal()
+    { return Block<CoefficientsType,1,SizeAtCompileTime>(coeffs(),supers(),0,1,(std::min)(rows(),cols())); }
+
+    /** \returns a vector expression of the main diagonal (const version) */
+    inline const Block<const CoefficientsType,1,SizeAtCompileTime> diagonal() const
+    { return Block<const CoefficientsType,1,SizeAtCompileTime>(coeffs(),supers(),0,1,(std::min)(rows(),cols())); }
+
+    template<int Index> struct DiagonalIntReturnType {
+      enum {
+        ReturnOpposite = (Options&SelfAdjoint) && (((Index)>0 && Supers==0) || ((Index)<0 && Subs==0)),
+        Conjugate = ReturnOpposite && NumTraits<Scalar>::IsComplex,
+        ActualIndex = ReturnOpposite ? -Index : Index,
+        DiagonalSize = (RowsAtCompileTime==Dynamic || ColsAtCompileTime==Dynamic)
+                     ? Dynamic
+                     : (ActualIndex<0
+                     ? EIGEN_SIZE_MIN_PREFER_DYNAMIC(ColsAtCompileTime, RowsAtCompileTime + ActualIndex)
+                     : EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime, ColsAtCompileTime - ActualIndex))
+      };
+      typedef Block<CoefficientsType,1, DiagonalSize> BuildType;
+      typedef typename internal::conditional<Conjugate,
+                 CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>,BuildType >,
+                 BuildType>::type Type;
+    };
+
+    /** \returns a vector expression of the \a N -th sub or super diagonal */
+    template<int N> inline typename DiagonalIntReturnType<N>::Type diagonal()
+    {
+      return typename DiagonalIntReturnType<N>::BuildType(coeffs(), supers()-N, (std::max)(0,N), 1, diagonalLength(N));
+    }
+
+    /** \returns a vector expression of the \a N -th sub or super diagonal */
+    template<int N> inline const typename DiagonalIntReturnType<N>::Type diagonal() const
+    {
+      return typename DiagonalIntReturnType<N>::BuildType(coeffs(), supers()-N, (std::max)(0,N), 1, diagonalLength(N));
+    }
+
+    /** \returns a vector expression of the \a i -th sub or super diagonal */
+    inline Block<CoefficientsType,1,Dynamic> diagonal(Index i)
+    {
+      eigen_assert((i<0 && -i<=subs()) || (i>=0 && i<=supers()));
+      return Block<CoefficientsType,1,Dynamic>(coeffs(), supers()-i, std::max<Index>(0,i), 1, diagonalLength(i));
+    }
+
+    /** \returns a vector expression of the \a i -th sub or super diagonal */
+    inline const Block<const CoefficientsType,1,Dynamic> diagonal(Index i) const
+    {
+      eigen_assert((i<0 && -i<=subs()) || (i>=0 && i<=supers()));
+      return Block<const CoefficientsType,1,Dynamic>(coeffs(), supers()-i, std::max<Index>(0,i), 1, diagonalLength(i));
+    }
+    
+    template<typename Dest> inline void evalTo(Dest& dst) const
+    {
+      dst.resize(rows(),cols());
+      dst.setZero();
+      dst.diagonal() = diagonal();
+      for (Index i=1; i<=supers();++i)
+        dst.diagonal(i) = diagonal(i);
+      for (Index i=1; i<=subs();++i)
+        dst.diagonal(-i) = diagonal(-i);
+    }
+
+    DenseMatrixType toDenseMatrix() const
+    {
+      DenseMatrixType res(rows(),cols());
+      evalTo(res);
+      return res;
+    }
+
+  protected:
+
+    inline Index diagonalLength(Index i) const
+    { return i<0 ? (std::min)(cols(),rows()+i) : (std::min)(rows(),cols()-i); }
+};
+
+/**
+  * \class BandMatrix
+  * \ingroup Core_Module
+  *
+  * \brief Represents a rectangular matrix with a banded storage
+  *
+  * \param _Scalar Numeric type, i.e. float, double, int
+  * \param Rows Number of rows, or \b Dynamic
+  * \param Cols Number of columns, or \b Dynamic
+  * \param Supers Number of super diagonal
+  * \param Subs Number of sub diagonal
+  * \param _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint
+  *                 The former controls \ref TopicStorageOrders "storage order", and defaults to
+  *                 column-major. The latter controls whether the matrix represents a selfadjoint 
+  *                 matrix in which case either Supers of Subs have to be null.
+  *
+  * \sa class TridiagonalMatrix
+  */
+
+template<typename _Scalar, int _Rows, int _Cols, int _Supers, int _Subs, int _Options>
+struct traits<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
+{
+  typedef _Scalar Scalar;
+  typedef Dense StorageKind;
+  typedef DenseIndex Index;
+  enum {
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    RowsAtCompileTime = _Rows,
+    ColsAtCompileTime = _Cols,
+    MaxRowsAtCompileTime = _Rows,
+    MaxColsAtCompileTime = _Cols,
+    Flags = LvalueBit,
+    Supers = _Supers,
+    Subs = _Subs,
+    Options = _Options,
+    DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic
+  };
+  typedef Matrix<Scalar,DataRowsAtCompileTime,ColsAtCompileTime,Options&RowMajor?RowMajor:ColMajor> CoefficientsType;
+};
+
+template<typename _Scalar, int Rows, int Cols, int Supers, int Subs, int Options>
+class BandMatrix : public BandMatrixBase<BandMatrix<_Scalar,Rows,Cols,Supers,Subs,Options> >
+{
+  public:
+
+    typedef typename internal::traits<BandMatrix>::Scalar Scalar;
+    typedef typename internal::traits<BandMatrix>::Index Index;
+    typedef typename internal::traits<BandMatrix>::CoefficientsType CoefficientsType;
+
+    inline BandMatrix(Index rows=Rows, Index cols=Cols, Index supers=Supers, Index subs=Subs)
+      : m_coeffs(1+supers+subs,cols),
+        m_rows(rows), m_supers(supers), m_subs(subs)
+    {
+    }
+
+    /** \returns the number of columns */
+    inline Index rows() const { return m_rows.value(); }
+
+    /** \returns the number of rows */
+    inline Index cols() const { return m_coeffs.cols(); }
+
+    /** \returns the number of super diagonals */
+    inline Index supers() const { return m_supers.value(); }
+
+    /** \returns the number of sub diagonals */
+    inline Index subs() const { return m_subs.value(); }
+
+    inline const CoefficientsType& coeffs() const { return m_coeffs; }
+    inline CoefficientsType& coeffs() { return m_coeffs; }
+
+  protected:
+
+    CoefficientsType m_coeffs;
+    internal::variable_if_dynamic<Index, Rows>   m_rows;
+    internal::variable_if_dynamic<Index, Supers> m_supers;
+    internal::variable_if_dynamic<Index, Subs>   m_subs;
+};
+
+template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
+class BandMatrixWrapper;
+
+template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
+struct traits<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
+{
+  typedef typename _CoefficientsType::Scalar Scalar;
+  typedef typename _CoefficientsType::StorageKind StorageKind;
+  typedef typename _CoefficientsType::Index Index;
+  enum {
+    CoeffReadCost = internal::traits<_CoefficientsType>::CoeffReadCost,
+    RowsAtCompileTime = _Rows,
+    ColsAtCompileTime = _Cols,
+    MaxRowsAtCompileTime = _Rows,
+    MaxColsAtCompileTime = _Cols,
+    Flags = LvalueBit,
+    Supers = _Supers,
+    Subs = _Subs,
+    Options = _Options,
+    DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic
+  };
+  typedef _CoefficientsType CoefficientsType;
+};
+
+template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
+class BandMatrixWrapper : public BandMatrixBase<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
+{
+  public:
+
+    typedef typename internal::traits<BandMatrixWrapper>::Scalar Scalar;
+    typedef typename internal::traits<BandMatrixWrapper>::CoefficientsType CoefficientsType;
+    typedef typename internal::traits<BandMatrixWrapper>::Index Index;
+
+    inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=_Rows, Index cols=_Cols, Index supers=_Supers, Index subs=_Subs)
+      : m_coeffs(coeffs),
+        m_rows(rows), m_supers(supers), m_subs(subs)
+    {
+      EIGEN_UNUSED_VARIABLE(cols);
+      //internal::assert(coeffs.cols()==cols() && (supers()+subs()+1)==coeffs.rows());
+    }
+
+    /** \returns the number of columns */
+    inline Index rows() const { return m_rows.value(); }
+
+    /** \returns the number of rows */
+    inline Index cols() const { return m_coeffs.cols(); }
+
+    /** \returns the number of super diagonals */
+    inline Index supers() const { return m_supers.value(); }
+
+    /** \returns the number of sub diagonals */
+    inline Index subs() const { return m_subs.value(); }
+
+    inline const CoefficientsType& coeffs() const { return m_coeffs; }
+
+  protected:
+
+    const CoefficientsType& m_coeffs;
+    internal::variable_if_dynamic<Index, _Rows>   m_rows;
+    internal::variable_if_dynamic<Index, _Supers> m_supers;
+    internal::variable_if_dynamic<Index, _Subs>   m_subs;
+};
+
+/**
+  * \class TridiagonalMatrix
+  * \ingroup Core_Module
+  *
+  * \brief Represents a tridiagonal matrix with a compact banded storage
+  *
+  * \param _Scalar Numeric type, i.e. float, double, int
+  * \param Size Number of rows and cols, or \b Dynamic
+  * \param _Options Can be 0 or \b SelfAdjoint
+  *
+  * \sa class BandMatrix
+  */
+template<typename Scalar, int Size, int Options>
+class TridiagonalMatrix : public BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor>
+{
+    typedef BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor> Base;
+    typedef typename Base::Index Index;
+  public:
+    TridiagonalMatrix(Index size = Size) : Base(size,size,Options&SelfAdjoint?0:1,1) {}
+
+    inline typename Base::template DiagonalIntReturnType<1>::Type super()
+    { return Base::template diagonal<1>(); }
+    inline const typename Base::template DiagonalIntReturnType<1>::Type super() const
+    { return Base::template diagonal<1>(); }
+    inline typename Base::template DiagonalIntReturnType<-1>::Type sub()
+    { return Base::template diagonal<-1>(); }
+    inline const typename Base::template DiagonalIntReturnType<-1>::Type sub() const
+    { return Base::template diagonal<-1>(); }
+  protected:
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BANDMATRIX_H
diff --git a/include/eigen3/Eigen/src/Core/Block.h b/include/eigen3/Eigen/src/Core/Block.h
new file mode 100644
index 0000000..87bedfa
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Block.h
@@ -0,0 +1,405 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BLOCK_H
+#define EIGEN_BLOCK_H
+
+namespace Eigen { 
+
+/** \class Block
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a fixed-size or dynamic-size block
+  *
+  * \param XprType the type of the expression in which we are taking a block
+  * \param BlockRows the number of rows of the block we are taking at compile time (optional)
+  * \param BlockCols the number of columns of the block we are taking at compile time (optional)
+  *
+  * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
+  * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
+  * most of the time this is the only way it is used.
+  *
+  * However, if you want to directly maniputate block expressions,
+  * for instance if you want to write a function returning such an expression, you
+  * will need to use this class.
+  *
+  * Here is an example illustrating the dynamic case:
+  * \include class_Block.cpp
+  * Output: \verbinclude class_Block.out
+  *
+  * \note Even though this expression has dynamic size, in the case where \a XprType
+  * has fixed size, this expression inherits a fixed maximal size which means that evaluating
+  * it does not cause a dynamic memory allocation.
+  *
+  * Here is an example illustrating the fixed-size case:
+  * \include class_FixedBlock.cpp
+  * Output: \verbinclude class_FixedBlock.out
+  *
+  * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock
+  */
+
+namespace internal {
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprType>
+{
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename traits<XprType>::StorageKind StorageKind;
+  typedef typename traits<XprType>::XprKind XprKind;
+  typedef typename nested<XprType>::type XprTypeNested;
+  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
+  enum{
+    MatrixRows = traits<XprType>::RowsAtCompileTime,
+    MatrixCols = traits<XprType>::ColsAtCompileTime,
+    RowsAtCompileTime = MatrixRows == 0 ? 0 : BlockRows,
+    ColsAtCompileTime = MatrixCols == 0 ? 0 : BlockCols,
+    MaxRowsAtCompileTime = BlockRows==0 ? 0
+                         : RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime)
+                         : int(traits<XprType>::MaxRowsAtCompileTime),
+    MaxColsAtCompileTime = BlockCols==0 ? 0
+                         : ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime)
+                         : int(traits<XprType>::MaxColsAtCompileTime),
+    XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+               : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+               : XprTypeIsRowMajor,
+    HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
+    InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+    InnerStrideAtCompileTime = HasSameStorageOrderAsXprType
+                             ? int(inner_stride_at_compile_time<XprType>::ret)
+                             : int(outer_stride_at_compile_time<XprType>::ret),
+    OuterStrideAtCompileTime = HasSameStorageOrderAsXprType
+                             ? int(outer_stride_at_compile_time<XprType>::ret)
+                             : int(inner_stride_at_compile_time<XprType>::ret),
+    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
+                       && (InnerStrideAtCompileTime == 1)
+                        ? PacketAccessBit : 0,
+    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (traits<XprType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
+    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
+    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
+    Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
+                                        DirectAccessBit |
+                                        MaskPacketAccessBit |
+                                        MaskAlignedBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit
+  };
+};
+
+template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool InnerPanel = false,
+         bool HasDirectAccess = internal::has_direct_access<XprType>::ret> class BlockImpl_dense;
+         
+} // end namespace internal
+
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, typename StorageKind> class BlockImpl;
+
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class Block
+  : public BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, typename internal::traits<XprType>::StorageKind>
+{
+    typedef BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, typename internal::traits<XprType>::StorageKind> Impl;
+  public:
+    //typedef typename Impl::Base Base;
+    typedef Impl Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Block)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
+  
+    /** Column or Row constructor
+      */
+    inline Block(XprType& xpr, Index i) : Impl(xpr,i)
+    {
+      eigen_assert( (i>=0) && (
+          ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i<xpr.rows())
+        ||((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && i<xpr.cols())));
+    }
+
+    /** Fixed-size constructor
+      */
+    inline Block(XprType& xpr, Index a_startRow, Index a_startCol)
+      : Impl(xpr, a_startRow, a_startCol)
+    {
+      EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
+      eigen_assert(a_startRow >= 0 && BlockRows >= 1 && a_startRow + BlockRows <= xpr.rows()
+             && a_startCol >= 0 && BlockCols >= 1 && a_startCol + BlockCols <= xpr.cols());
+    }
+
+    /** Dynamic-size constructor
+      */
+    inline Block(XprType& xpr,
+          Index a_startRow, Index a_startCol,
+          Index blockRows, Index blockCols)
+      : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols)
+    {
+      eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows)
+          && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols));
+      eigen_assert(a_startRow >= 0 && blockRows >= 0 && a_startRow  <= xpr.rows() - blockRows
+          && a_startCol >= 0 && blockCols >= 0 && a_startCol <= xpr.cols() - blockCols);
+    }
+};
+         
+// The generic default implementation for dense block simplu forward to the internal::BlockImpl_dense
+// that must be specialized for direct and non-direct access...
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
+  : public internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel>
+{
+    typedef internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel> Impl;
+    typedef typename XprType::Index Index;
+  public:
+    typedef Impl Base;
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
+    inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
+    inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol) : Impl(xpr, a_startRow, a_startCol) {}
+    inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol, Index blockRows, Index blockCols)
+      : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols) {}
+};
+
+namespace internal {
+
+/** \internal Internal implementation of dense Blocks in the general case. */
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool HasDirectAccess> class BlockImpl_dense
+  : public internal::dense_xpr_base<Block<XprType, BlockRows, BlockCols, InnerPanel> >::type
+{
+    typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+  public:
+
+    typedef typename internal::dense_xpr_base<BlockType>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)
+
+    class InnerIterator;
+
+    /** Column or Row constructor
+      */
+    inline BlockImpl_dense(XprType& xpr, Index i)
+      : m_xpr(xpr),
+        // It is a row if and only if BlockRows==1 and BlockCols==XprType::ColsAtCompileTime,
+        // and it is a column if and only if BlockRows==XprType::RowsAtCompileTime and BlockCols==1,
+        // all other cases are invalid.
+        // The case a 1x1 matrix seems ambiguous, but the result is the same anyway.
+        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0),
+        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0),
+        m_blockRows(BlockRows==1 ? 1 : xpr.rows()),
+        m_blockCols(BlockCols==1 ? 1 : xpr.cols())
+    {}
+
+    /** Fixed-size constructor
+      */
+    inline BlockImpl_dense(XprType& xpr, Index a_startRow, Index a_startCol)
+      : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
+                    m_blockRows(BlockRows), m_blockCols(BlockCols)
+    {}
+
+    /** Dynamic-size constructor
+      */
+    inline BlockImpl_dense(XprType& xpr,
+          Index a_startRow, Index a_startCol,
+          Index blockRows, Index blockCols)
+      : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
+                    m_blockRows(blockRows), m_blockCols(blockCols)
+    {}
+
+    inline Index rows() const { return m_blockRows.value(); }
+    inline Index cols() const { return m_blockCols.value(); }
+
+    inline Scalar& coeffRef(Index rowId, Index colId)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(XprType)
+      return m_xpr.const_cast_derived()
+               .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
+    }
+
+    inline const Scalar& coeffRef(Index rowId, Index colId) const
+    {
+      return m_xpr.derived()
+               .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
+    }
+
+    EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const
+    {
+      return m_xpr.coeff(rowId + m_startRow.value(), colId + m_startCol.value());
+    }
+
+    inline Scalar& coeffRef(Index index)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(XprType)
+      return m_xpr.const_cast_derived()
+             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+    }
+
+    inline const Scalar& coeffRef(Index index) const
+    {
+      return m_xpr.const_cast_derived()
+             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+    }
+
+    inline const CoeffReturnType coeff(Index index) const
+    {
+      return m_xpr
+             .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                    m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+    }
+
+    template<int LoadMode>
+    inline PacketScalar packet(Index rowId, Index colId) const
+    {
+      return m_xpr.template packet<Unaligned>
+              (rowId + m_startRow.value(), colId + m_startCol.value());
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
+    {
+      m_xpr.const_cast_derived().template writePacket<Unaligned>
+              (rowId + m_startRow.value(), colId + m_startCol.value(), val);
+    }
+
+    template<int LoadMode>
+    inline PacketScalar packet(Index index) const
+    {
+      return m_xpr.template packet<Unaligned>
+              (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+               m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index index, const PacketScalar& val)
+    {
+      m_xpr.const_cast_derived().template writePacket<Unaligned>
+         (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+          m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), val);
+    }
+
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    /** \sa MapBase::data() */
+    inline const Scalar* data() const;
+    inline Index innerStride() const;
+    inline Index outerStride() const;
+    #endif
+
+    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const 
+    { 
+      return m_xpr; 
+    }
+      
+    Index startRow() const 
+    { 
+      return m_startRow.value(); 
+    }
+      
+    Index startCol() const 
+    { 
+      return m_startCol.value(); 
+    }
+
+  protected:
+
+    const typename XprType::Nested m_xpr;
+    const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
+    const internal::variable_if_dynamic<Index, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
+    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
+    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_blockCols;
+};
+
+/** \internal Internal implementation of dense Blocks in the direct access case.*/
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
+  : public MapBase<Block<XprType, BlockRows, BlockCols, InnerPanel> >
+{
+    typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+  public:
+
+    typedef MapBase<BlockType> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)
+
+    /** Column or Row constructor
+      */
+    inline BlockImpl_dense(XprType& xpr, Index i)
+      : Base(internal::const_cast_ptr(&xpr.coeffRef(
+              (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0,
+              (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)),
+             BlockRows==1 ? 1 : xpr.rows(),
+             BlockCols==1 ? 1 : xpr.cols()),
+        m_xpr(xpr)
+    {
+      init();
+    }
+
+    /** Fixed-size constructor
+      */
+    inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+      : Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol))), m_xpr(xpr)
+    {
+      init();
+    }
+
+    /** Dynamic-size constructor
+      */
+    inline BlockImpl_dense(XprType& xpr,
+          Index startRow, Index startCol,
+          Index blockRows, Index blockCols)
+      : Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol)), blockRows, blockCols),
+        m_xpr(xpr)
+    {
+      init();
+    }
+
+    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const 
+    { 
+      return m_xpr; 
+    }
+      
+    /** \sa MapBase::innerStride() */
+    inline Index innerStride() const
+    {
+      return internal::traits<BlockType>::HasSameStorageOrderAsXprType
+             ? m_xpr.innerStride()
+             : m_xpr.outerStride();
+    }
+
+    /** \sa MapBase::outerStride() */
+    inline Index outerStride() const
+    {
+      return m_outerStride;
+    }
+
+  #ifndef __SUNPRO_CC
+  // FIXME sunstudio is not friendly with the above friend...
+  // META-FIXME there is no 'friend' keyword around here. Is this obsolete?
+  protected:
+  #endif
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal used by allowAligned() */
+    inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
+      : Base(data, blockRows, blockCols), m_xpr(xpr)
+    {
+      init();
+    }
+    #endif
+
+  protected:
+    void init()
+    {
+      m_outerStride = internal::traits<BlockType>::HasSameStorageOrderAsXprType
+                    ? m_xpr.outerStride()
+                    : m_xpr.innerStride();
+    }
+
+    typename XprType::Nested m_xpr;
+    Index m_outerStride;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BLOCK_H
diff --git a/include/eigen3/Eigen/src/Core/BooleanRedux.h b/include/eigen3/Eigen/src/Core/BooleanRedux.h
new file mode 100644
index 0000000..be9f48a
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/BooleanRedux.h
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ALLANDANY_H
+#define EIGEN_ALLANDANY_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Derived, int UnrollCount>
+struct all_unroller
+{
+  enum {
+    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
+    row = (UnrollCount-1) % Derived::RowsAtCompileTime
+  };
+
+  static inline bool run(const Derived &mat)
+  {
+    return all_unroller<Derived, UnrollCount-1>::run(mat) && mat.coeff(row, col);
+  }
+};
+
+template<typename Derived>
+struct all_unroller<Derived, 0>
+{
+  static inline bool run(const Derived &/*mat*/) { return true; }
+};
+
+template<typename Derived>
+struct all_unroller<Derived, Dynamic>
+{
+  static inline bool run(const Derived &) { return false; }
+};
+
+template<typename Derived, int UnrollCount>
+struct any_unroller
+{
+  enum {
+    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
+    row = (UnrollCount-1) % Derived::RowsAtCompileTime
+  };
+
+  static inline bool run(const Derived &mat)
+  {
+    return any_unroller<Derived, UnrollCount-1>::run(mat) || mat.coeff(row, col);
+  }
+};
+
+template<typename Derived>
+struct any_unroller<Derived, 0>
+{
+  static inline bool run(const Derived & /*mat*/) { return false; }
+};
+
+template<typename Derived>
+struct any_unroller<Derived, Dynamic>
+{
+  static inline bool run(const Derived &) { return false; }
+};
+
+} // end namespace internal
+
+/** \returns true if all coefficients are true
+  *
+  * Example: \include MatrixBase_all.cpp
+  * Output: \verbinclude MatrixBase_all.out
+  *
+  * \sa any(), Cwise::operator<()
+  */
+template<typename Derived>
+inline bool DenseBase<Derived>::all() const
+{
+  enum {
+    unroll = SizeAtCompileTime != Dynamic
+          && CoeffReadCost != Dynamic
+          && NumTraits<Scalar>::AddCost != Dynamic
+          && SizeAtCompileTime * (CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
+  };
+  if(unroll)
+    return internal::all_unroller<Derived, unroll ? int(SizeAtCompileTime) : Dynamic>::run(derived());
+  else
+  {
+    for(Index j = 0; j < cols(); ++j)
+      for(Index i = 0; i < rows(); ++i)
+        if (!coeff(i, j)) return false;
+    return true;
+  }
+}
+
+/** \returns true if at least one coefficient is true
+  *
+  * \sa all()
+  */
+template<typename Derived>
+inline bool DenseBase<Derived>::any() const
+{
+  enum {
+    unroll = SizeAtCompileTime != Dynamic
+          && CoeffReadCost != Dynamic
+          && NumTraits<Scalar>::AddCost != Dynamic
+          && SizeAtCompileTime * (CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
+  };
+  if(unroll)
+    return internal::any_unroller<Derived, unroll ? int(SizeAtCompileTime) : Dynamic>::run(derived());
+  else
+  {
+    for(Index j = 0; j < cols(); ++j)
+      for(Index i = 0; i < rows(); ++i)
+        if (coeff(i, j)) return true;
+    return false;
+  }
+}
+
+/** \returns the number of coefficients which evaluate to true
+  *
+  * \sa all(), any()
+  */
+template<typename Derived>
+inline typename DenseBase<Derived>::Index DenseBase<Derived>::count() const
+{
+  return derived().template cast<bool>().template cast<Index>().sum();
+}
+
+/** \returns true is \c *this contains at least one Not A Number (NaN).
+  *
+  * \sa allFinite()
+  */
+template<typename Derived>
+inline bool DenseBase<Derived>::hasNaN() const
+{
+  return !((derived().array()==derived().array()).all());
+}
+
+/** \returns true if \c *this contains only finite numbers, i.e., no NaN and no +/-INF values.
+  *
+  * \sa hasNaN()
+  */
+template<typename Derived>
+inline bool DenseBase<Derived>::allFinite() const
+{
+  return !((derived()-derived()).hasNaN());
+}
+    
+} // end namespace Eigen
+
+#endif // EIGEN_ALLANDANY_H
diff --git a/include/eigen3/Eigen/src/Core/CommaInitializer.h b/include/eigen3/Eigen/src/Core/CommaInitializer.h
new file mode 100644
index 0000000..a036d8c
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/CommaInitializer.h
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMMAINITIALIZER_H
+#define EIGEN_COMMAINITIALIZER_H
+
+namespace Eigen { 
+
+/** \class CommaInitializer
+  * \ingroup Core_Module
+  *
+  * \brief Helper class used by the comma initializer operator
+  *
+  * This class is internally used to implement the comma initializer feature. It is
+  * the return type of MatrixBase::operator<<, and most of the time this is the only
+  * way it is used.
+  *
+  * \sa \ref MatrixBaseCommaInitRef "MatrixBase::operator<<", CommaInitializer::finished()
+  */
+template<typename XprType>
+struct CommaInitializer
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::Index Index;
+
+  inline CommaInitializer(XprType& xpr, const Scalar& s)
+    : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1)
+  {
+    m_xpr.coeffRef(0,0) = s;
+  }
+
+  template<typename OtherDerived>
+  inline CommaInitializer(XprType& xpr, const DenseBase<OtherDerived>& other)
+    : m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows())
+  {
+    m_xpr.block(0, 0, other.rows(), other.cols()) = other;
+  }
+
+  /* Copy/Move constructor which transfers ownership. This is crucial in 
+   * absence of return value optimization to avoid assertions during destruction. */
+  // FIXME in C++11 mode this could be replaced by a proper RValue constructor
+  inline CommaInitializer(const CommaInitializer& o)
+  : m_xpr(o.m_xpr), m_row(o.m_row), m_col(o.m_col), m_currentBlockRows(o.m_currentBlockRows) {
+    // Mark original object as finished. In absence of R-value references we need to const_cast:
+    const_cast<CommaInitializer&>(o).m_row = m_xpr.rows();
+    const_cast<CommaInitializer&>(o).m_col = m_xpr.cols();
+    const_cast<CommaInitializer&>(o).m_currentBlockRows = 0;
+  }
+
+  /* inserts a scalar value in the target matrix */
+  CommaInitializer& operator,(const Scalar& s)
+  {
+    if (m_col==m_xpr.cols())
+    {
+      m_row+=m_currentBlockRows;
+      m_col = 0;
+      m_currentBlockRows = 1;
+      eigen_assert(m_row<m_xpr.rows()
+        && "Too many rows passed to comma initializer (operator<<)");
+    }
+    eigen_assert(m_col<m_xpr.cols()
+      && "Too many coefficients passed to comma initializer (operator<<)");
+    eigen_assert(m_currentBlockRows==1);
+    m_xpr.coeffRef(m_row, m_col++) = s;
+    return *this;
+  }
+
+  /* inserts a matrix expression in the target matrix */
+  template<typename OtherDerived>
+  CommaInitializer& operator,(const DenseBase<OtherDerived>& other)
+  {
+    if(other.cols()==0 || other.rows()==0)
+      return *this;
+    if (m_col==m_xpr.cols())
+    {
+      m_row+=m_currentBlockRows;
+      m_col = 0;
+      m_currentBlockRows = other.rows();
+      eigen_assert(m_row+m_currentBlockRows<=m_xpr.rows()
+        && "Too many rows passed to comma initializer (operator<<)");
+    }
+    eigen_assert(m_col<m_xpr.cols()
+      && "Too many coefficients passed to comma initializer (operator<<)");
+    eigen_assert(m_currentBlockRows==other.rows());
+    if (OtherDerived::SizeAtCompileTime != Dynamic)
+      m_xpr.template block<OtherDerived::RowsAtCompileTime != Dynamic ? OtherDerived::RowsAtCompileTime : 1,
+                              OtherDerived::ColsAtCompileTime != Dynamic ? OtherDerived::ColsAtCompileTime : 1>
+                    (m_row, m_col) = other;
+    else
+      m_xpr.block(m_row, m_col, other.rows(), other.cols()) = other;
+    m_col += other.cols();
+    return *this;
+  }
+
+  inline ~CommaInitializer()
+  {
+    eigen_assert((m_row+m_currentBlockRows) == m_xpr.rows()
+         && m_col == m_xpr.cols()
+         && "Too few coefficients passed to comma initializer (operator<<)");
+  }
+
+  /** \returns the built matrix once all its coefficients have been set.
+    * Calling finished is 100% optional. Its purpose is to write expressions
+    * like this:
+    * \code
+    * quaternion.fromRotationMatrix((Matrix3f() << axis0, axis1, axis2).finished());
+    * \endcode
+    */
+  inline XprType& finished() { return m_xpr; }
+
+  XprType& m_xpr;   // target expression
+  Index m_row;              // current row id
+  Index m_col;              // current col id
+  Index m_currentBlockRows; // current block height
+};
+
+/** \anchor MatrixBaseCommaInitRef
+  * Convenient operator to set the coefficients of a matrix.
+  *
+  * The coefficients must be provided in a row major order and exactly match
+  * the size of the matrix. Otherwise an assertion is raised.
+  *
+  * Example: \include MatrixBase_set.cpp
+  * Output: \verbinclude MatrixBase_set.out
+  * 
+  * \note According the c++ standard, the argument expressions of this comma initializer are evaluated in arbitrary order.
+  *
+  * \sa CommaInitializer::finished(), class CommaInitializer
+  */
+template<typename Derived>
+inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
+{
+  return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
+}
+
+/** \sa operator<<(const Scalar&) */
+template<typename Derived>
+template<typename OtherDerived>
+inline CommaInitializer<Derived>
+DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
+{
+  return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMMAINITIALIZER_H
diff --git a/include/eigen3/Eigen/src/Core/CoreIterators.h b/include/eigen3/Eigen/src/Core/CoreIterators.h
new file mode 100644
index 0000000..6da4683
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/CoreIterators.h
@@ -0,0 +1,61 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COREITERATORS_H
+#define EIGEN_COREITERATORS_H
+
+namespace Eigen { 
+
+/* This file contains the respective InnerIterator definition of the expressions defined in Eigen/Core
+ */
+
+/** \ingroup SparseCore_Module
+  * \class InnerIterator
+  * \brief An InnerIterator allows to loop over the element of a sparse (or dense) matrix or expression
+  *
+  * todo
+  */
+
+// generic version for dense matrix and expressions
+template<typename Derived> class DenseBase<Derived>::InnerIterator
+{
+  protected:
+    typedef typename Derived::Scalar Scalar;
+    typedef typename Derived::Index Index;
+
+    enum { IsRowMajor = (Derived::Flags&RowMajorBit)==RowMajorBit };
+  public:
+    EIGEN_STRONG_INLINE InnerIterator(const Derived& expr, Index outer)
+      : m_expression(expr), m_inner(0), m_outer(outer), m_end(expr.innerSize())
+    {}
+
+    EIGEN_STRONG_INLINE Scalar value() const
+    {
+      return (IsRowMajor) ? m_expression.coeff(m_outer, m_inner)
+                          : m_expression.coeff(m_inner, m_outer);
+    }
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++() { m_inner++; return *this; }
+
+    EIGEN_STRONG_INLINE Index index() const { return m_inner; }
+    inline Index row() const { return IsRowMajor ? m_outer : index(); }
+    inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
+
+  protected:
+    const Derived& m_expression;
+    Index m_inner;
+    const Index m_outer;
+    const Index m_end;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_COREITERATORS_H
diff --git a/include/eigen3/Eigen/src/Core/CwiseBinaryOp.h b/include/eigen3/Eigen/src/Core/CwiseBinaryOp.h
new file mode 100644
index 0000000..586f77a
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/CwiseBinaryOp.h
@@ -0,0 +1,229 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CWISE_BINARY_OP_H
+#define EIGEN_CWISE_BINARY_OP_H
+
+namespace Eigen {
+
+/** \class CwiseBinaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions
+  *
+  * \param BinaryOp template functor implementing the operator
+  * \param Lhs the type of the left-hand side
+  * \param Rhs the type of the right-hand side
+  *
+  * This class represents an expression  where a coefficient-wise binary operator is applied to two expressions.
+  * It is the return type of binary operators, by which we mean only those binary operators where
+  * both the left-hand side and the right-hand side are Eigen expressions.
+  * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp.
+  *
+  * Most of the time, this is the only way that it is used, so you typically don't have to name
+  * CwiseBinaryOp types explicitly.
+  *
+  * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
+  */
+
+namespace internal {
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+  // we must not inherit from traits<Lhs> since it has
+  // the potential to cause problems with MSVC
+  typedef typename remove_all<Lhs>::type Ancestor;
+  typedef typename traits<Ancestor>::XprKind XprKind;
+  enum {
+    RowsAtCompileTime = traits<Ancestor>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<Ancestor>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<Ancestor>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<Ancestor>::MaxColsAtCompileTime
+  };
+
+  // even though we require Lhs and Rhs to have the same scalar type (see CwiseBinaryOp constructor),
+  // we still want to handle the case when the result type is different.
+  typedef typename result_of<
+                     BinaryOp(
+                       typename Lhs::Scalar,
+                       typename Rhs::Scalar
+                     )
+                   >::type Scalar;
+  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind,
+                                           typename traits<Rhs>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<Lhs>::Index,
+                                         typename traits<Rhs>::Index>::type Index;
+  typedef typename Lhs::Nested LhsNested;
+  typedef typename Rhs::Nested RhsNested;
+  typedef typename remove_reference<LhsNested>::type _LhsNested;
+  typedef typename remove_reference<RhsNested>::type _RhsNested;
+  enum {
+    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
+    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
+    LhsFlags = _LhsNested::Flags,
+    RhsFlags = _RhsNested::Flags,
+    SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
+    StorageOrdersAgree = (int(Lhs::Flags)&RowMajorBit)==(int(Rhs::Flags)&RowMajorBit),
+    Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
+        HereditaryBits
+      | (int(LhsFlags) & int(RhsFlags) &
+           ( AlignedBit
+           | (StorageOrdersAgree ? LinearAccessBit : 0)
+           | (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
+           )
+        )
+     ),
+    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
+    CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + functor_traits<BinaryOp>::Cost
+  };
+};
+} // end namespace internal
+
+// we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor
+// that would take two operands of different types. If there were such an example, then this check should be
+// moved to the BinaryOp functors, on a per-case basis. This would however require a change in the BinaryOp functors, as
+// currently they take only one typename Scalar template parameter.
+// It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
+// So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to
+// add together a float matrix and a double matrix.
+#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
+  EIGEN_STATIC_ASSERT((internal::functor_is_product_like<BINOP>::ret \
+                        ? int(internal::scalar_product_traits<LHS, RHS>::Defined) \
+                        : int(internal::is_same<LHS, RHS>::value)), \
+    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
+class CwiseBinaryOpImpl;
+
+template<typename BinaryOp, typename Lhs, typename Rhs>
+class CwiseBinaryOp : internal::no_assignment_operator,
+  public CwiseBinaryOpImpl<
+          BinaryOp, Lhs, Rhs,
+          typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
+                                           typename internal::traits<Rhs>::StorageKind>::ret>
+{
+  public:
+
+    typedef typename CwiseBinaryOpImpl<
+        BinaryOp, Lhs, Rhs,
+        typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
+                                         typename internal::traits<Rhs>::StorageKind>::ret>::Base Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseBinaryOp)
+
+    typedef typename internal::nested<Lhs>::type LhsNested;
+    typedef typename internal::nested<Rhs>::type RhsNested;
+    typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
+    typedef typename internal::remove_reference<RhsNested>::type _RhsNested;
+
+    EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
+      : m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
+    {
+      EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar);
+      // require the sizes to match
+      EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs, Rhs)
+      eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
+    }
+
+    EIGEN_STRONG_INLINE Index rows() const {
+      // return the fixed size type if available to enable compile time optimizations
+      if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
+        return m_rhs.rows();
+      else
+        return m_lhs.rows();
+    }
+    EIGEN_STRONG_INLINE Index cols() const {
+      // return the fixed size type if available to enable compile time optimizations
+      if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
+        return m_rhs.cols();
+      else
+        return m_lhs.cols();
+    }
+
+    /** \returns the left hand side nested expression */
+    const _LhsNested& lhs() const { return m_lhs; }
+    /** \returns the right hand side nested expression */
+    const _RhsNested& rhs() const { return m_rhs; }
+    /** \returns the functor representing the binary operation */
+    const BinaryOp& functor() const { return m_functor; }
+
+  protected:
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+    const BinaryOp m_functor;
+};
+
+template<typename BinaryOp, typename Lhs, typename Rhs>
+class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Dense>
+  : public internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
+{
+    typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> Derived;
+  public:
+
+    typedef typename internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE( Derived )
+
+    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
+    {
+      return derived().functor()(derived().lhs().coeff(rowId, colId),
+                                 derived().rhs().coeff(rowId, colId));
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
+    {
+      return derived().functor().packetOp(derived().lhs().template packet<LoadMode>(rowId, colId),
+                                          derived().rhs().template packet<LoadMode>(rowId, colId));
+    }
+
+    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+    {
+      return derived().functor()(derived().lhs().coeff(index),
+                                 derived().rhs().coeff(index));
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
+    {
+      return derived().functor().packetOp(derived().lhs().template packet<LoadMode>(index),
+                                          derived().rhs().template packet<LoadMode>(index));
+    }
+};
+
+/** replaces \c *this by \c *this - \a other.
+  *
+  * \returns a reference to \c *this
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE Derived &
+MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
+{
+  SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
+  tmp = other.derived();
+  return derived();
+}
+
+/** replaces \c *this by \c *this + \a other.
+  *
+  * \returns a reference to \c *this
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE Derived &
+MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
+{
+  SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
+  tmp = other.derived();
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CWISE_BINARY_OP_H
diff --git a/include/eigen3/Eigen/src/Core/CwiseNullaryOp.h b/include/eigen3/Eigen/src/Core/CwiseNullaryOp.h
new file mode 100644
index 0000000..a93bab2
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/CwiseNullaryOp.h
@@ -0,0 +1,864 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CWISE_NULLARY_OP_H
+#define EIGEN_CWISE_NULLARY_OP_H
+
+namespace Eigen {
+
+/** \class CwiseNullaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression of a matrix where all coefficients are defined by a functor
+  *
+  * \param NullaryOp template functor implementing the operator
+  * \param PlainObjectType the underlying plain matrix/array type
+  *
+  * This class represents an expression of a generic nullary operator.
+  * It is the return type of the Ones(), Zero(), Constant(), Identity() and Random() methods,
+  * and most of the time this is the only way it is used.
+  *
+  * However, if you want to write a function returning such an expression, you
+  * will need to use this class.
+  *
+  * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr()
+  */
+
+namespace internal {
+template<typename NullaryOp, typename PlainObjectType>
+struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType>
+{
+  enum {
+    Flags = (traits<PlainObjectType>::Flags
+      & (  HereditaryBits
+         | (functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
+         | (functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
+      | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
+    CoeffReadCost = functor_traits<NullaryOp>::Cost
+  };
+};
+}
+
+template<typename NullaryOp, typename PlainObjectType>
+class CwiseNullaryOp : internal::no_assignment_operator,
+  public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<CwiseNullaryOp>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(CwiseNullaryOp)
+
+    CwiseNullaryOp(Index nbRows, Index nbCols, const NullaryOp& func = NullaryOp())
+      : m_rows(nbRows), m_cols(nbCols), m_functor(func)
+    {
+      eigen_assert(nbRows >= 0
+            && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == nbRows)
+            &&  nbCols >= 0
+            && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols));
+    }
+
+    EIGEN_STRONG_INLINE Index rows() const { return m_rows.value(); }
+    EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); }
+
+    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
+    {
+      return m_functor(rowId, colId);
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
+    {
+      return m_functor.packetOp(rowId, colId);
+    }
+
+    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+    {
+      return m_functor(index);
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
+    {
+      return m_functor.packetOp(index);
+    }
+
+    /** \returns the functor representing the nullary operation */
+    const NullaryOp& functor() const { return m_functor; }
+
+  protected:
+    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_rows;
+    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols;
+    const NullaryOp m_functor;
+};
+
+
+/** \returns an expression of a matrix defined by a custom functor \a func
+  *
+  * The parameters \a rows and \a cols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this MatrixBase type.
+  *
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so Zero() should be used
+  * instead.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
+template<typename Derived>
+template<typename CustomNullaryOp>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
+{
+  return CwiseNullaryOp<CustomNullaryOp, Derived>(rows, cols, func);
+}
+
+/** \returns an expression of a matrix defined by a custom functor \a func
+  *
+  * The parameter \a size is the size of the returned vector.
+  * Must be compatible with this MatrixBase type.
+  *
+  * \only_for_vectors
+  *
+  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+  * it is redundant to pass \a size as argument, so Zero() should be used
+  * instead.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
+template<typename Derived>
+template<typename CustomNullaryOp>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  if(RowsAtCompileTime == 1) return CwiseNullaryOp<CustomNullaryOp, Derived>(1, size, func);
+  else return CwiseNullaryOp<CustomNullaryOp, Derived>(size, 1, func);
+}
+
+/** \returns an expression of a matrix defined by a custom functor \a func
+  *
+  * This variant is only for fixed-size DenseBase types. For dynamic-size types, you
+  * need to use the variants taking size arguments.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
+template<typename Derived>
+template<typename CustomNullaryOp>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
+{
+  return CwiseNullaryOp<CustomNullaryOp, Derived>(RowsAtCompileTime, ColsAtCompileTime, func);
+}
+
+/** \returns an expression of a constant matrix of value \a value
+  *
+  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this DenseBase type.
+  *
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a nbRows and \a nbCols as arguments, so Zero() should be used
+  * instead.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Constant(Index nbRows, Index nbCols, const Scalar& value)
+{
+  return DenseBase<Derived>::NullaryExpr(nbRows, nbCols, internal::scalar_constant_op<Scalar>(value));
+}
+
+/** \returns an expression of a constant matrix of value \a value
+  *
+  * The parameter \a size is the size of the returned vector.
+  * Must be compatible with this DenseBase type.
+  *
+  * \only_for_vectors
+  *
+  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+  * it is redundant to pass \a size as argument, so Zero() should be used
+  * instead.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Constant(Index size, const Scalar& value)
+{
+  return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
+}
+
+/** \returns an expression of a constant matrix of value \a value
+  *
+  * This variant is only for fixed-size DenseBase types. For dynamic-size types, you
+  * need to use the variants taking size arguments.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Constant(const Scalar& value)
+{
+  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
+  return DenseBase<Derived>::NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_constant_op<Scalar>(value));
+}
+
+/**
+  * \brief Sets a linearly space vector.
+  *
+  * The function generates 'size' equally spaced values in the closed interval [low,high].
+  * This particular version of LinSpaced() uses sequential access, i.e. vector access is
+  * assumed to be a(0), a(1), ..., a(size). This assumption allows for better vectorization
+  * and yields faster code than the random access version.
+  *
+  * When size is set to 1, a vector of length 1 containing 'high' is returned.
+  *
+  * \only_for_vectors
+  *
+  * Example: \include DenseBase_LinSpaced_seq.cpp
+  * Output: \verbinclude DenseBase_LinSpaced_seq.out
+  *
+  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), LinSpaced(Index,Scalar,Scalar), CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::SequentialLinSpacedReturnType
+DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,false>(low,high,size));
+}
+
+/**
+  * \copydoc DenseBase::LinSpaced(Sequential_t, Index, const Scalar&, const Scalar&)
+  * Special version for fixed size types which does not require the size parameter.
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::SequentialLinSpacedReturnType
+DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,false>(low,high,Derived::SizeAtCompileTime));
+}
+
+/**
+  * \brief Sets a linearly space vector.
+  *
+  * The function generates 'size' equally spaced values in the closed interval [low,high].
+  * When size is set to 1, a vector of length 1 containing 'high' is returned.
+  *
+  * \only_for_vectors
+  *
+  * Example: \include DenseBase_LinSpaced.cpp
+  * Output: \verbinclude DenseBase_LinSpaced.out
+  *
+  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), LinSpaced(Sequential_t,Index,const Scalar&,const Scalar&,Index), CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,true>(low,high,size));
+}
+
+/**
+  * \copydoc DenseBase::LinSpaced(Index, const Scalar&, const Scalar&)
+  * Special version for fixed size types which does not require the size parameter.
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,true>(low,high,Derived::SizeAtCompileTime));
+}
+
+/** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
+template<typename Derived>
+bool DenseBase<Derived>::isApproxToConstant
+(const Scalar& val, const RealScalar& prec) const
+{
+  for(Index j = 0; j < cols(); ++j)
+    for(Index i = 0; i < rows(); ++i)
+      if(!internal::isApprox(this->coeff(i, j), val, prec))
+        return false;
+  return true;
+}
+
+/** This is just an alias for isApproxToConstant().
+  *
+  * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
+template<typename Derived>
+bool DenseBase<Derived>::isConstant
+(const Scalar& val, const RealScalar& prec) const
+{
+  return isApproxToConstant(val, prec);
+}
+
+/** Alias for setConstant(): sets all coefficients in this expression to \a val.
+  *
+  * \sa setConstant(), Constant(), class CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
+{
+  setConstant(val);
+}
+
+/** Sets all coefficients in this expression to \a value.
+  *
+  * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
+{
+  return derived() = Constant(rows(), cols(), val);
+}
+
+/** Resizes to the given \a size, and sets all coefficients in this expression to the given \a value.
+  *
+  * \only_for_vectors
+  *
+  * Example: \include Matrix_setConstant_int.cpp
+  * Output: \verbinclude Matrix_setConstant_int.out
+  *
+  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
+{
+  resize(size);
+  return setConstant(val);
+}
+
+/** Resizes to the given size, and sets all coefficients in this expression to the given \a value.
+  *
+  * \param nbRows the new number of rows
+  * \param nbCols the new number of columns
+  * \param val the value to which all coefficients are set
+  *
+  * Example: \include Matrix_setConstant_int_int.cpp
+  * Output: \verbinclude Matrix_setConstant_int_int.out
+  *
+  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setConstant(Index nbRows, Index nbCols, const Scalar& val)
+{
+  resize(nbRows, nbCols);
+  return setConstant(val);
+}
+
+/**
+  * \brief Sets a linearly space vector.
+  *
+  * The function generates 'size' equally spaced values in the closed interval [low,high].
+  * When size is set to 1, a vector of length 1 containing 'high' is returned.
+  *
+  * \only_for_vectors
+  *
+  * Example: \include DenseBase_setLinSpaced.cpp
+  * Output: \verbinclude DenseBase_setLinSpaced.out
+  *
+  * \sa CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,false>(low,high,newSize));
+}
+
+/**
+  * \brief Sets a linearly space vector.
+  *
+  * The function fill *this with equally spaced values in the closed interval [low,high].
+  * When size is set to 1, a vector of length 1 containing 'high' is returned.
+  *
+  * \only_for_vectors
+  *
+  * \sa setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return setLinSpaced(size(), low, high);
+}
+
+// zero:
+
+/** \returns an expression of a zero matrix.
+  *
+  * The parameters \a rows and \a cols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this MatrixBase type.
+  *
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so Zero() should be used
+  * instead.
+  *
+  * Example: \include MatrixBase_zero_int_int.cpp
+  * Output: \verbinclude MatrixBase_zero_int_int.out
+  *
+  * \sa Zero(), Zero(Index)
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Zero(Index nbRows, Index nbCols)
+{
+  return Constant(nbRows, nbCols, Scalar(0));
+}
+
+/** \returns an expression of a zero vector.
+  *
+  * The parameter \a size is the size of the returned vector.
+  * Must be compatible with this MatrixBase type.
+  *
+  * \only_for_vectors
+  *
+  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+  * it is redundant to pass \a size as argument, so Zero() should be used
+  * instead.
+  *
+  * Example: \include MatrixBase_zero_int.cpp
+  * Output: \verbinclude MatrixBase_zero_int.out
+  *
+  * \sa Zero(), Zero(Index,Index)
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Zero(Index size)
+{
+  return Constant(size, Scalar(0));
+}
+
+/** \returns an expression of a fixed-size zero matrix or vector.
+  *
+  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+  * need to use the variants taking size arguments.
+  *
+  * Example: \include MatrixBase_zero.cpp
+  * Output: \verbinclude MatrixBase_zero.out
+  *
+  * \sa Zero(Index), Zero(Index,Index)
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Zero()
+{
+  return Constant(Scalar(0));
+}
+
+/** \returns true if *this is approximately equal to the zero matrix,
+  *          within the precision given by \a prec.
+  *
+  * Example: \include MatrixBase_isZero.cpp
+  * Output: \verbinclude MatrixBase_isZero.out
+  *
+  * \sa class CwiseNullaryOp, Zero()
+  */
+template<typename Derived>
+bool DenseBase<Derived>::isZero(const RealScalar& prec) const
+{
+  for(Index j = 0; j < cols(); ++j)
+    for(Index i = 0; i < rows(); ++i)
+      if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<Scalar>(1), prec))
+        return false;
+  return true;
+}
+
+/** Sets all coefficients in this expression to zero.
+  *
+  * Example: \include MatrixBase_setZero.cpp
+  * Output: \verbinclude MatrixBase_setZero.out
+  *
+  * \sa class CwiseNullaryOp, Zero()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
+{
+  return setConstant(Scalar(0));
+}
+
+/** Resizes to the given \a size, and sets all coefficients in this expression to zero.
+  *
+  * \only_for_vectors
+  *
+  * Example: \include Matrix_setZero_int.cpp
+  * Output: \verbinclude Matrix_setZero_int.out
+  *
+  * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setZero(Index newSize)
+{
+  resize(newSize);
+  return setConstant(Scalar(0));
+}
+
+/** Resizes to the given size, and sets all coefficients in this expression to zero.
+  *
+  * \param nbRows the new number of rows
+  * \param nbCols the new number of columns
+  *
+  * Example: \include Matrix_setZero_int_int.cpp
+  * Output: \verbinclude Matrix_setZero_int_int.out
+  *
+  * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)
+{
+  resize(nbRows, nbCols);
+  return setConstant(Scalar(0));
+}
+
+// ones:
+
+/** \returns an expression of a matrix where all coefficients equal one.
+  *
+  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this MatrixBase type.
+  *
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so Ones() should be used
+  * instead.
+  *
+  * Example: \include MatrixBase_ones_int_int.cpp
+  * Output: \verbinclude MatrixBase_ones_int_int.out
+  *
+  * \sa Ones(), Ones(Index), isOnes(), class Ones
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Ones(Index nbRows, Index nbCols)
+{
+  return Constant(nbRows, nbCols, Scalar(1));
+}
+
+/** \returns an expression of a vector where all coefficients equal one.
+  *
+  * The parameter \a newSize is the size of the returned vector.
+  * Must be compatible with this MatrixBase type.
+  *
+  * \only_for_vectors
+  *
+  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+  * it is redundant to pass \a size as argument, so Ones() should be used
+  * instead.
+  *
+  * Example: \include MatrixBase_ones_int.cpp
+  * Output: \verbinclude MatrixBase_ones_int.out
+  *
+  * \sa Ones(), Ones(Index,Index), isOnes(), class Ones
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Ones(Index newSize)
+{
+  return Constant(newSize, Scalar(1));
+}
+
+/** \returns an expression of a fixed-size matrix or vector where all coefficients equal one.
+  *
+  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+  * need to use the variants taking size arguments.
+  *
+  * Example: \include MatrixBase_ones.cpp
+  * Output: \verbinclude MatrixBase_ones.out
+  *
+  * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Ones()
+{
+  return Constant(Scalar(1));
+}
+
+/** \returns true if *this is approximately equal to the matrix where all coefficients
+  *          are equal to 1, within the precision given by \a prec.
+  *
+  * Example: \include MatrixBase_isOnes.cpp
+  * Output: \verbinclude MatrixBase_isOnes.out
+  *
+  * \sa class CwiseNullaryOp, Ones()
+  */
+template<typename Derived>
+bool DenseBase<Derived>::isOnes
+(const RealScalar& prec) const
+{
+  return isApproxToConstant(Scalar(1), prec);
+}
+
+/** Sets all coefficients in this expression to one.
+  *
+  * Example: \include MatrixBase_setOnes.cpp
+  * Output: \verbinclude MatrixBase_setOnes.out
+  *
+  * \sa class CwiseNullaryOp, Ones()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
+{
+  return setConstant(Scalar(1));
+}
+
+/** Resizes to the given \a newSize, and sets all coefficients in this expression to one.
+  *
+  * \only_for_vectors
+  *
+  * Example: \include Matrix_setOnes_int.cpp
+  * Output: \verbinclude Matrix_setOnes_int.out
+  *
+  * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setOnes(Index newSize)
+{
+  resize(newSize);
+  return setConstant(Scalar(1));
+}
+
+/** Resizes to the given size, and sets all coefficients in this expression to one.
+  *
+  * \param nbRows the new number of rows
+  * \param nbCols the new number of columns
+  *
+  * Example: \include Matrix_setOnes_int_int.cpp
+  * Output: \verbinclude Matrix_setOnes_int_int.out
+  *
+  * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)
+{
+  resize(nbRows, nbCols);
+  return setConstant(Scalar(1));
+}
+
+// Identity:
+
+/** \returns an expression of the identity matrix (not necessarily square).
+  *
+  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this MatrixBase type.
+  *
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so Identity() should be used
+  * instead.
+  *
+  * Example: \include MatrixBase_identity_int_int.cpp
+  * Output: \verbinclude MatrixBase_identity_int_int.out
+  *
+  * \sa Identity(), setIdentity(), isIdentity()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+MatrixBase<Derived>::Identity(Index nbRows, Index nbCols)
+{
+  return DenseBase<Derived>::NullaryExpr(nbRows, nbCols, internal::scalar_identity_op<Scalar>());
+}
+
+/** \returns an expression of the identity matrix (not necessarily square).
+  *
+  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+  * need to use the variant taking size arguments.
+  *
+  * Example: \include MatrixBase_identity.cpp
+  * Output: \verbinclude MatrixBase_identity.out
+  *
+  * \sa Identity(Index,Index), setIdentity(), isIdentity()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+MatrixBase<Derived>::Identity()
+{
+  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
+  return MatrixBase<Derived>::NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_identity_op<Scalar>());
+}
+
+/** \returns true if *this is approximately equal to the identity matrix
+  *          (not necessarily square),
+  *          within the precision given by \a prec.
+  *
+  * Example: \include MatrixBase_isIdentity.cpp
+  * Output: \verbinclude MatrixBase_isIdentity.out
+  *
+  * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), setIdentity()
+  */
+template<typename Derived>
+bool MatrixBase<Derived>::isIdentity
+(const RealScalar& prec) const
+{
+  for(Index j = 0; j < cols(); ++j)
+  {
+    for(Index i = 0; i < rows(); ++i)
+    {
+      if(i == j)
+      {
+        if(!internal::isApprox(this->coeff(i, j), static_cast<Scalar>(1), prec))
+          return false;
+      }
+      else
+      {
+        if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<RealScalar>(1), prec))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+namespace internal {
+
+template<typename Derived, bool Big = (Derived::SizeAtCompileTime>=16)>
+struct setIdentity_impl
+{
+  static EIGEN_STRONG_INLINE Derived& run(Derived& m)
+  {
+    return m = Derived::Identity(m.rows(), m.cols());
+  }
+};
+
+template<typename Derived>
+struct setIdentity_impl<Derived, true>
+{
+  typedef typename Derived::Index Index;
+  static EIGEN_STRONG_INLINE Derived& run(Derived& m)
+  {
+    m.setZero();
+    const Index size = (std::min)(m.rows(), m.cols());
+    for(Index i = 0; i < size; ++i) m.coeffRef(i,i) = typename Derived::Scalar(1);
+    return m;
+  }
+};
+
+} // end namespace internal
+
+/** Writes the identity expression (not necessarily square) into *this.
+  *
+  * Example: \include MatrixBase_setIdentity.cpp
+  * Output: \verbinclude MatrixBase_setIdentity.out
+  *
+  * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
+{
+  return internal::setIdentity_impl<Derived>::run(derived());
+}
+
+/** \brief Resizes to the given size, and writes the identity expression (not necessarily square) into *this.
+  *
+  * \param nbRows the new number of rows
+  * \param nbCols the new number of columns
+  *
+  * Example: \include Matrix_setIdentity_int_int.cpp
+  * Output: \verbinclude Matrix_setIdentity_int_int.out
+  *
+  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index nbRows, Index nbCols)
+{
+  derived().resize(nbRows, nbCols);
+  return setIdentity();
+}
+
+/** \returns an expression of the i-th unit (basis) vector.
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
+}
+
+/** \returns an expression of the i-th unit (basis) vector.
+  *
+  * \only_for_vectors
+  *
+  * This variant is for fixed-size vector only.
+  *
+  * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return BasisReturnType(SquareMatrixType::Identity(),i);
+}
+
+/** \returns an expression of the X axis unit vector (1{,0}^*)
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
+{ return Derived::Unit(0); }
+
+/** \returns an expression of the Y axis unit vector (0,1{,0}^*)
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
+{ return Derived::Unit(1); }
+
+/** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
+{ return Derived::Unit(2); }
+
+/** \returns an expression of the W axis unit vector (0,0,0,1)
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
+{ return Derived::Unit(3); }
+
+} // end namespace Eigen
+
+#endif // EIGEN_CWISE_NULLARY_OP_H
diff --git a/include/eigen3/Eigen/src/Core/CwiseUnaryOp.h b/include/eigen3/Eigen/src/Core/CwiseUnaryOp.h
new file mode 100644
index 0000000..f2de749
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/CwiseUnaryOp.h
@@ -0,0 +1,126 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CWISE_UNARY_OP_H
+#define EIGEN_CWISE_UNARY_OP_H
+
+namespace Eigen { 
+
+/** \class CwiseUnaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression where a coefficient-wise unary operator is applied to an expression
+  *
+  * \param UnaryOp template functor implementing the operator
+  * \param XprType the type of the expression to which we are applying the unary operator
+  *
+  * This class represents an expression where a unary operator is applied to an expression.
+  * It is the return type of all operations taking exactly 1 input expression, regardless of the
+  * presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix
+  * is considered unary, because only the right-hand side is an expression, and its
+  * return type is a specialization of CwiseUnaryOp.
+  *
+  * Most of the time, this is the only way that it is used, so you typically don't have to name
+  * CwiseUnaryOp types explicitly.
+  *
+  * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp
+  */
+
+namespace internal {
+template<typename UnaryOp, typename XprType>
+struct traits<CwiseUnaryOp<UnaryOp, XprType> >
+ : traits<XprType>
+{
+  typedef typename result_of<
+                     UnaryOp(typename XprType::Scalar)
+                   >::type Scalar;
+  typedef typename XprType::Nested XprTypeNested;
+  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
+  enum {
+    Flags = _XprTypeNested::Flags & (
+      HereditaryBits | LinearAccessBit | AlignedBit
+      | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
+    CoeffReadCost = _XprTypeNested::CoeffReadCost + functor_traits<UnaryOp>::Cost
+  };
+};
+}
+
+template<typename UnaryOp, typename XprType, typename StorageKind>
+class CwiseUnaryOpImpl;
+
+template<typename UnaryOp, typename XprType>
+class CwiseUnaryOp : internal::no_assignment_operator,
+  public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>
+{
+  public:
+
+    typedef typename CwiseUnaryOpImpl<UnaryOp, XprType,typename internal::traits<XprType>::StorageKind>::Base Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp)
+
+    inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+      : m_xpr(xpr), m_functor(func) {}
+
+    EIGEN_STRONG_INLINE Index rows() const { return m_xpr.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return m_xpr.cols(); }
+
+    /** \returns the functor representing the unary operation */
+    const UnaryOp& functor() const { return m_functor; }
+
+    /** \returns the nested expression */
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    nestedExpression() const { return m_xpr; }
+
+    /** \returns the nested expression */
+    typename internal::remove_all<typename XprType::Nested>::type&
+    nestedExpression() { return m_xpr.const_cast_derived(); }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const UnaryOp m_functor;
+};
+
+// This is the generic implementation for dense storage.
+// It can be used for any expression types implementing the dense concept.
+template<typename UnaryOp, typename XprType>
+class CwiseUnaryOpImpl<UnaryOp,XprType,Dense>
+  : public internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type
+{
+  public:
+
+    typedef CwiseUnaryOp<UnaryOp, XprType> Derived;
+    typedef typename internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+
+    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
+    {
+      return derived().functor()(derived().nestedExpression().coeff(rowId, colId));
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
+    {
+      return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(rowId, colId));
+    }
+
+    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+    {
+      return derived().functor()(derived().nestedExpression().coeff(index));
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
+    {
+      return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(index));
+    }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CWISE_UNARY_OP_H
diff --git a/include/eigen3/Eigen/src/Core/CwiseUnaryView.h b/include/eigen3/Eigen/src/Core/CwiseUnaryView.h
new file mode 100644
index 0000000..b2638d3
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/CwiseUnaryView.h
@@ -0,0 +1,139 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CWISE_UNARY_VIEW_H
+#define EIGEN_CWISE_UNARY_VIEW_H
+
+namespace Eigen {
+
+/** \class CwiseUnaryView
+  * \ingroup Core_Module
+  *
+  * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector
+  *
+  * \param ViewOp template functor implementing the view
+  * \param MatrixType the type of the matrix we are applying the unary operator
+  *
+  * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector.
+  * It is the return type of real() and imag(), and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp
+  */
+
+namespace internal {
+template<typename ViewOp, typename MatrixType>
+struct traits<CwiseUnaryView<ViewOp, MatrixType> >
+ : traits<MatrixType>
+{
+  typedef typename result_of<
+                     ViewOp(typename traits<MatrixType>::Scalar)
+                   >::type Scalar;
+  typedef typename MatrixType::Nested MatrixTypeNested;
+  typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+  enum {
+    Flags = (traits<_MatrixTypeNested>::Flags & (HereditaryBits | LvalueBit | LinearAccessBit | DirectAccessBit)),
+    CoeffReadCost = traits<_MatrixTypeNested>::CoeffReadCost + functor_traits<ViewOp>::Cost,
+    MatrixTypeInnerStride =  inner_stride_at_compile_time<MatrixType>::ret,
+    // need to cast the sizeof's from size_t to int explicitly, otherwise:
+    // "error: no integral type can represent all of the enumerator values
+    InnerStrideAtCompileTime = MatrixTypeInnerStride == Dynamic
+                             ? int(Dynamic)
+                             : int(MatrixTypeInnerStride) * int(sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar)),
+    OuterStrideAtCompileTime = outer_stride_at_compile_time<MatrixType>::ret == Dynamic
+                             ? int(Dynamic)
+                             : outer_stride_at_compile_time<MatrixType>::ret * int(sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar))
+  };
+};
+}
+
+template<typename ViewOp, typename MatrixType, typename StorageKind>
+class CwiseUnaryViewImpl;
+
+template<typename ViewOp, typename MatrixType>
+class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename internal::traits<MatrixType>::StorageKind>
+{
+  public:
+
+    typedef typename CwiseUnaryViewImpl<ViewOp, MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView)
+
+    inline CwiseUnaryView(const MatrixType& mat, const ViewOp& func = ViewOp())
+      : m_matrix(mat), m_functor(func) {}
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
+
+    EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
+
+    /** \returns the functor representing unary operation */
+    const ViewOp& functor() const { return m_functor; }
+
+    /** \returns the nested expression */
+    const typename internal::remove_all<typename MatrixType::Nested>::type&
+    nestedExpression() const { return m_matrix; }
+
+    /** \returns the nested expression */
+    typename internal::remove_all<typename MatrixType::Nested>::type&
+    nestedExpression() { return m_matrix.const_cast_derived(); }
+
+  protected:
+    // FIXME changed from MatrixType::Nested because of a weird compilation error with sun CC
+    typename internal::nested<MatrixType>::type m_matrix;
+    ViewOp m_functor;
+};
+
+template<typename ViewOp, typename MatrixType>
+class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
+  : public internal::dense_xpr_base< CwiseUnaryView<ViewOp, MatrixType> >::type
+{
+  public:
+
+    typedef CwiseUnaryView<ViewOp, MatrixType> Derived;
+    typedef typename internal::dense_xpr_base< CwiseUnaryView<ViewOp, MatrixType> >::type Base;
+
+    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl)
+    
+    inline Scalar* data() { return &coeffRef(0); }
+    inline const Scalar* data() const { return &coeff(0); }
+
+    inline Index innerStride() const
+    {
+      return derived().nestedExpression().innerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
+    }
+
+    inline Index outerStride() const
+    {
+      return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
+    }
+
+    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
+    {
+      return derived().functor()(derived().nestedExpression().coeff(row, col));
+    }
+
+    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+    {
+      return derived().functor()(derived().nestedExpression().coeff(index));
+    }
+
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
+    {
+      return derived().functor()(const_cast_derived().nestedExpression().coeffRef(row, col));
+    }
+
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+    {
+      return derived().functor()(const_cast_derived().nestedExpression().coeffRef(index));
+    }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CWISE_UNARY_VIEW_H
diff --git a/include/eigen3/Eigen/src/Core/DenseBase.h b/include/eigen3/Eigen/src/Core/DenseBase.h
new file mode 100644
index 0000000..04862f3
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/DenseBase.h
@@ -0,0 +1,523 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007-2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DENSEBASE_H
+#define EIGEN_DENSEBASE_H
+
+namespace Eigen {
+
+namespace internal {
+  
+// The index type defined by EIGEN_DEFAULT_DENSE_INDEX_TYPE must be a signed type.
+// This dummy function simply aims at checking that at compile time.
+static inline void check_DenseIndex_is_signed() {
+  EIGEN_STATIC_ASSERT(NumTraits<DenseIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE); 
+}
+
+} // end namespace internal
+  
+/** \class DenseBase
+  * \ingroup Core_Module
+  *
+  * \brief Base class for all dense matrices, vectors, and arrays
+  *
+  * This class is the base that is inherited by all dense objects (matrix, vector, arrays,
+  * and related expression types). The common Eigen API for dense objects is contained in this class.
+  *
+  * \tparam Derived is the derived type, e.g., a matrix type or an expression.
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN.
+  *
+  * \sa \ref TopicClassHierarchy
+  */
+template<typename Derived> class DenseBase
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  : public internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
+                                     typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>
+#else
+  : public DenseCoeffsBase<Derived>
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+{
+  public:
+    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
+                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
+
+    class InnerIterator;
+
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+
+    /** \brief The type of indices 
+      * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
+      * \sa \ref TopicPreprocessorDirectives.
+      */
+    typedef typename internal::traits<Derived>::Index Index; 
+
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    typedef DenseCoeffsBase<Derived> Base;
+    using Base::derived;
+    using Base::const_cast_derived;
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::rowIndexByOuterInner;
+    using Base::colIndexByOuterInner;
+    using Base::coeff;
+    using Base::coeffByOuterInner;
+    using Base::packet;
+    using Base::packetByOuterInner;
+    using Base::writePacket;
+    using Base::writePacketByOuterInner;
+    using Base::coeffRef;
+    using Base::coeffRefByOuterInner;
+    using Base::copyCoeff;
+    using Base::copyCoeffByOuterInner;
+    using Base::copyPacket;
+    using Base::copyPacketByOuterInner;
+    using Base::operator();
+    using Base::operator[];
+    using Base::x;
+    using Base::y;
+    using Base::z;
+    using Base::w;
+    using Base::stride;
+    using Base::innerStride;
+    using Base::outerStride;
+    using Base::rowStride;
+    using Base::colStride;
+    typedef typename Base::CoeffReturnType CoeffReturnType;
+
+    enum {
+
+      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+        /**< The number of rows at compile-time. This is just a copy of the value provided
+          * by the \a Derived type. If a value is not known at compile-time,
+          * it is set to the \a Dynamic constant.
+          * \sa MatrixBase::rows(), MatrixBase::cols(), ColsAtCompileTime, SizeAtCompileTime */
+
+      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+        /**< The number of columns at compile-time. This is just a copy of the value provided
+          * by the \a Derived type. If a value is not known at compile-time,
+          * it is set to the \a Dynamic constant.
+          * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */
+
+
+      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+                                                   internal::traits<Derived>::ColsAtCompileTime>::ret),
+        /**< This is equal to the number of coefficients, i.e. the number of
+          * rows times the number of columns, or to \a Dynamic if this is not
+          * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
+
+      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+        /**< This value is equal to the maximum possible number of rows that this expression
+          * might have. If this expression might have an arbitrarily high number of rows,
+          * this value is set to \a Dynamic.
+          *
+          * This value is useful to know when evaluating an expression, in order to determine
+          * whether it is possible to avoid doing a dynamic memory allocation.
+          *
+          * \sa RowsAtCompileTime, MaxColsAtCompileTime, MaxSizeAtCompileTime
+          */
+
+      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+        /**< This value is equal to the maximum possible number of columns that this expression
+          * might have. If this expression might have an arbitrarily high number of columns,
+          * this value is set to \a Dynamic.
+          *
+          * This value is useful to know when evaluating an expression, in order to determine
+          * whether it is possible to avoid doing a dynamic memory allocation.
+          *
+          * \sa ColsAtCompileTime, MaxRowsAtCompileTime, MaxSizeAtCompileTime
+          */
+
+      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                      internal::traits<Derived>::MaxColsAtCompileTime>::ret),
+        /**< This value is equal to the maximum possible number of coefficients that this expression
+          * might have. If this expression might have an arbitrarily high number of coefficients,
+          * this value is set to \a Dynamic.
+          *
+          * This value is useful to know when evaluating an expression, in order to determine
+          * whether it is possible to avoid doing a dynamic memory allocation.
+          *
+          * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime
+          */
+
+      IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
+                           || internal::traits<Derived>::MaxColsAtCompileTime == 1,
+        /**< This is set to true if either the number of rows or the number of
+          * columns is known at compile-time to be equal to 1. Indeed, in that case,
+          * we are dealing with a column-vector (if there is only one column) or with
+          * a row-vector (if there is only one row). */
+
+      Flags = internal::traits<Derived>::Flags,
+        /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
+          * constructed from this one. See the \ref flags "list of flags".
+          */
+
+      IsRowMajor = int(Flags) & RowMajorBit, /**< True if this expression has row-major storage order. */
+
+      InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
+                             : int(IsRowMajor) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+
+      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
+        /**< This is a rough measure of how expensive it is to read one coefficient from
+          * this expression.
+          */
+
+      InnerStrideAtCompileTime = internal::inner_stride_at_compile_time<Derived>::ret,
+      OuterStrideAtCompileTime = internal::outer_stride_at_compile_time<Derived>::ret
+    };
+
+    enum { ThisConstantIsPrivateInPlainObjectBase };
+
+    /** \returns the number of nonzero coefficients which is in practice the number
+      * of stored coefficients. */
+    inline Index nonZeros() const { return size(); }
+    /** \returns true if either the number of rows or the number of columns is equal to 1.
+      * In other words, this function returns
+      * \code rows()==1 || cols()==1 \endcode
+      * \sa rows(), cols(), IsVectorAtCompileTime. */
+
+    /** \returns the outer size.
+      *
+      * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
+      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
+      * column-major matrix, and the number of rows for a row-major matrix. */
+    Index outerSize() const
+    {
+      return IsVectorAtCompileTime ? 1
+           : int(IsRowMajor) ? this->rows() : this->cols();
+    }
+
+    /** \returns the inner size.
+      *
+      * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
+      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a 
+      * column-major matrix, and the number of columns for a row-major matrix. */
+    Index innerSize() const
+    {
+      return IsVectorAtCompileTime ? this->size()
+           : int(IsRowMajor) ? this->cols() : this->rows();
+    }
+
+    /** Only plain matrices/arrays, not expressions, may be resized; therefore the only useful resize methods are
+      * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
+      * nothing else.
+      */
+    void resize(Index newSize)
+    {
+      EIGEN_ONLY_USED_FOR_DEBUG(newSize);
+      eigen_assert(newSize == this->size()
+                && "DenseBase::resize() does not actually allow to resize.");
+    }
+    /** Only plain matrices/arrays, not expressions, may be resized; therefore the only useful resize methods are
+      * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
+      * nothing else.
+      */
+    void resize(Index nbRows, Index nbCols)
+    {
+      EIGEN_ONLY_USED_FOR_DEBUG(nbRows);
+      EIGEN_ONLY_USED_FOR_DEBUG(nbCols);
+      eigen_assert(nbRows == this->rows() && nbCols == this->cols()
+                && "DenseBase::resize() does not actually allow to resize.");
+    }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+    /** \internal Represents a matrix with all coefficients equal to one another*/
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    /** \internal Represents a vector with linearly spaced coefficients that allows sequential access only. */
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,false>,Derived> SequentialLinSpacedReturnType;
+    /** \internal Represents a vector with linearly spaced coefficients that allows random access. */
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,true>,Derived> RandomAccessLinSpacedReturnType;
+    /** \internal the return type of MatrixBase::eigenvalues() */
+    typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;
+
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+    /** Copies \a other into *this. \returns a reference to *this. */
+    template<typename OtherDerived>
+    Derived& operator=(const DenseBase<OtherDerived>& other);
+
+    /** Special case of the template operator=, in order to prevent the compiler
+      * from generating a default operator= (issue hit with g++ 4.1)
+      */
+    Derived& operator=(const DenseBase& other);
+
+    template<typename OtherDerived>
+    Derived& operator=(const EigenBase<OtherDerived> &other);
+
+    template<typename OtherDerived>
+    Derived& operator+=(const EigenBase<OtherDerived> &other);
+
+    template<typename OtherDerived>
+    Derived& operator-=(const EigenBase<OtherDerived> &other);
+
+    template<typename OtherDerived>
+    Derived& operator=(const ReturnByValue<OtherDerived>& func);
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** Copies \a other into *this without evaluating other. \returns a reference to *this. */
+    template<typename OtherDerived>
+    Derived& lazyAssign(const DenseBase<OtherDerived>& other);
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+    CommaInitializer<Derived> operator<< (const Scalar& s);
+
+    template<unsigned int Added,unsigned int Removed>
+    const Flagged<Derived, Added, Removed> flagged() const;
+
+    template<typename OtherDerived>
+    CommaInitializer<Derived> operator<< (const DenseBase<OtherDerived>& other);
+
+    Eigen::Transpose<Derived> transpose();
+	typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    ConstTransposeReturnType transpose() const;
+    void transposeInPlace();
+#ifndef EIGEN_NO_DEBUG
+  protected:
+    template<typename OtherDerived>
+    void checkTransposeAliasing(const OtherDerived& other) const;
+  public:
+#endif
+
+
+    static const ConstantReturnType
+    Constant(Index rows, Index cols, const Scalar& value);
+    static const ConstantReturnType
+    Constant(Index size, const Scalar& value);
+    static const ConstantReturnType
+    Constant(const Scalar& value);
+
+    static const SequentialLinSpacedReturnType
+    LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high);
+    static const RandomAccessLinSpacedReturnType
+    LinSpaced(Index size, const Scalar& low, const Scalar& high);
+    static const SequentialLinSpacedReturnType
+    LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
+    static const RandomAccessLinSpacedReturnType
+    LinSpaced(const Scalar& low, const Scalar& high);
+
+    template<typename CustomNullaryOp>
+    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func);
+    template<typename CustomNullaryOp>
+    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    NullaryExpr(Index size, const CustomNullaryOp& func);
+    template<typename CustomNullaryOp>
+    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    NullaryExpr(const CustomNullaryOp& func);
+
+    static const ConstantReturnType Zero(Index rows, Index cols);
+    static const ConstantReturnType Zero(Index size);
+    static const ConstantReturnType Zero();
+    static const ConstantReturnType Ones(Index rows, Index cols);
+    static const ConstantReturnType Ones(Index size);
+    static const ConstantReturnType Ones();
+
+    void fill(const Scalar& value);
+    Derived& setConstant(const Scalar& value);
+    Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
+    Derived& setLinSpaced(const Scalar& low, const Scalar& high);
+    Derived& setZero();
+    Derived& setOnes();
+    Derived& setRandom();
+
+    template<typename OtherDerived>
+    bool isApprox(const DenseBase<OtherDerived>& other,
+                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    bool isMuchSmallerThan(const RealScalar& other,
+                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    template<typename OtherDerived>
+    bool isMuchSmallerThan(const DenseBase<OtherDerived>& other,
+                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+    bool isApproxToConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    
+    inline bool hasNaN() const;
+    inline bool allFinite() const;
+
+    inline Derived& operator*=(const Scalar& other);
+    inline Derived& operator/=(const Scalar& other);
+
+    typedef typename internal::add_const_on_value_type<typename internal::eval<Derived>::type>::type EvalReturnType;
+    /** \returns the matrix or vector obtained by evaluating this expression.
+      *
+      * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
+      * a const reference, in order to avoid a useless copy.
+      */
+    EIGEN_STRONG_INLINE EvalReturnType eval() const
+    {
+      // Even though MSVC does not honor strong inlining when the return type
+      // is a dynamic matrix, we desperately need strong inlining for fixed
+      // size types on MSVC.
+      return typename internal::eval<Derived>::type(derived());
+    }
+
+    /** swaps *this with the expression \a other.
+      *
+      */
+    template<typename OtherDerived>
+    void swap(const DenseBase<OtherDerived>& other,
+              int = OtherDerived::ThisConstantIsPrivateInPlainObjectBase)
+    {
+      SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
+    }
+
+    /** swaps *this with the matrix or array \a other.
+      *
+      */
+    template<typename OtherDerived>
+    void swap(PlainObjectBase<OtherDerived>& other)
+    {
+      SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
+    }
+
+
+    inline const NestByValue<Derived> nestByValue() const;
+    inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
+    inline ForceAlignedAccess<Derived> forceAlignedAccess();
+    template<bool Enable> inline const typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf() const;
+    template<bool Enable> inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
+
+    Scalar sum() const;
+    Scalar mean() const;
+    Scalar trace() const;
+
+    Scalar prod() const;
+
+    typename internal::traits<Derived>::Scalar minCoeff() const;
+    typename internal::traits<Derived>::Scalar maxCoeff() const;
+
+    template<typename IndexType>
+    typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const;
+    template<typename IndexType>
+    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const;
+    template<typename IndexType>
+    typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const;
+    template<typename IndexType>
+    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const;
+
+    template<typename BinaryOp>
+    typename internal::result_of<BinaryOp(typename internal::traits<Derived>::Scalar)>::type
+    redux(const BinaryOp& func) const;
+
+    template<typename Visitor>
+    void visit(Visitor& func) const;
+
+    inline const WithFormat<Derived> format(const IOFormat& fmt) const;
+
+    /** \returns the unique coefficient of a 1x1 expression */
+    CoeffReturnType value() const
+    {
+      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
+      eigen_assert(this->rows() == 1 && this->cols() == 1);
+      return derived().coeff(0,0);
+    }
+
+    bool all(void) const;
+    bool any(void) const;
+    Index count() const;
+
+    typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
+    typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;
+    typedef VectorwiseOp<Derived, Vertical> ColwiseReturnType;
+    typedef const VectorwiseOp<const Derived, Vertical> ConstColwiseReturnType;
+
+    ConstRowwiseReturnType rowwise() const;
+    RowwiseReturnType rowwise();
+    ConstColwiseReturnType colwise() const;
+    ColwiseReturnType colwise();
+
+    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random(Index rows, Index cols);
+    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random(Index size);
+    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random();
+
+    template<typename ThenDerived,typename ElseDerived>
+    const Select<Derived,ThenDerived,ElseDerived>
+    select(const DenseBase<ThenDerived>& thenMatrix,
+           const DenseBase<ElseDerived>& elseMatrix) const;
+
+    template<typename ThenDerived>
+    inline const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
+    select(const DenseBase<ThenDerived>& thenMatrix, const typename ThenDerived::Scalar& elseScalar) const;
+
+    template<typename ElseDerived>
+    inline const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
+    select(const typename ElseDerived::Scalar& thenScalar, const DenseBase<ElseDerived>& elseMatrix) const;
+
+    template<int p> RealScalar lpNorm() const;
+
+    template<int RowFactor, int ColFactor>
+    inline const Replicate<Derived,RowFactor,ColFactor> replicate() const;
+    
+    typedef Replicate<Derived,Dynamic,Dynamic> ReplicateReturnType;
+    inline const ReplicateReturnType replicate(Index rowFacor,Index colFactor) const;
+
+    typedef Reverse<Derived, BothDirections> ReverseReturnType;
+    typedef const Reverse<const Derived, BothDirections> ConstReverseReturnType;
+    ReverseReturnType reverse();
+    ConstReverseReturnType reverse() const;
+    void reverseInPlace();
+
+#define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
+#   include "../plugins/BlockMethods.h"
+#   ifdef EIGEN_DENSEBASE_PLUGIN
+#     include EIGEN_DENSEBASE_PLUGIN
+#   endif
+#undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+
+#ifdef EIGEN2_SUPPORT
+
+    Block<Derived> corner(CornerType type, Index cRows, Index cCols);
+    const Block<Derived> corner(CornerType type, Index cRows, Index cCols) const;
+    template<int CRows, int CCols>
+    Block<Derived, CRows, CCols> corner(CornerType type);
+    template<int CRows, int CCols>
+    const Block<Derived, CRows, CCols> corner(CornerType type) const;
+
+#endif // EIGEN2_SUPPORT
+
+
+    // disable the use of evalTo for dense objects with a nice compilation error
+    template<typename Dest> inline void evalTo(Dest& ) const
+    {
+      EIGEN_STATIC_ASSERT((internal::is_same<Dest,void>::value),THE_EVAL_EVALTO_FUNCTION_SHOULD_NEVER_BE_CALLED_FOR_DENSE_OBJECTS);
+    }
+
+  protected:
+    /** Default constructor. Do nothing. */
+    DenseBase()
+    {
+      /* Just checks for self-consistency of the flags.
+       * Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down
+       */
+#ifdef EIGEN_INTERNAL_DEBUGGING
+      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor))
+                        && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, int(!IsRowMajor))),
+                          INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION)
+#endif
+    }
+
+  private:
+    explicit DenseBase(int);
+    DenseBase(int,int);
+    template<typename OtherDerived> explicit DenseBase(const DenseBase<OtherDerived>&);
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_DENSEBASE_H
diff --git a/include/eigen3/Eigen/src/Core/DenseCoeffsBase.h b/include/eigen3/Eigen/src/Core/DenseCoeffsBase.h
new file mode 100644
index 0000000..3c890f2
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/DenseCoeffsBase.h
@@ -0,0 +1,754 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DENSECOEFFSBASE_H
+#define EIGEN_DENSECOEFFSBASE_H
+
+namespace Eigen {
+
+namespace internal {
+template<typename T> struct add_const_on_value_type_if_arithmetic
+{
+  typedef typename conditional<is_arithmetic<T>::value, T, typename add_const_on_value_type<T>::type>::type type;
+};
+}
+
+/** \brief Base class providing read-only coefficient access to matrices and arrays.
+  * \ingroup Core_Module
+  * \tparam Derived Type of the derived class
+  * \tparam #ReadOnlyAccessors Constant indicating read-only access
+  *
+  * This class defines the \c operator() \c const function and friends, which can be used to read specific
+  * entries of a matrix or array.
+  * 
+  * \sa DenseCoeffsBase<Derived, WriteAccessors>, DenseCoeffsBase<Derived, DirectAccessors>,
+  *     \ref TopicClassHierarchy
+  */
+template<typename Derived>
+class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
+{
+  public:
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+
+    // Explanation for this CoeffReturnType typedef.
+    // - This is the return type of the coeff() method.
+    // - The LvalueBit means exactly that we can offer a coeffRef() method, which means exactly that we can get references
+    // to coeffs, which means exactly that we can have coeff() return a const reference (as opposed to returning a value).
+    // - The is_artihmetic check is required since "const int", "const double", etc. will cause warnings on some systems
+    // while the declaration of "const T", where T is a non arithmetic type does not. Always returning "const Scalar&" is
+    // not possible, since the underlying expressions might not offer a valid address the reference could be referring to.
+    typedef typename internal::conditional<bool(internal::traits<Derived>::Flags&LvalueBit),
+                         const Scalar&,
+                         typename internal::conditional<internal::is_arithmetic<Scalar>::value, Scalar, const Scalar>::type
+                     >::type CoeffReturnType;
+
+    typedef typename internal::add_const_on_value_type_if_arithmetic<
+                         typename internal::packet_traits<Scalar>::type
+                     >::type PacketReturnType;
+
+    typedef EigenBase<Derived> Base;
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::derived;
+
+    EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) const
+    {
+      return int(Derived::RowsAtCompileTime) == 1 ? 0
+          : int(Derived::ColsAtCompileTime) == 1 ? inner
+          : int(Derived::Flags)&RowMajorBit ? outer
+          : inner;
+    }
+
+    EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) const
+    {
+      return int(Derived::ColsAtCompileTime) == 1 ? 0
+          : int(Derived::RowsAtCompileTime) == 1 ? inner
+          : int(Derived::Flags)&RowMajorBit ? inner
+          : outer;
+    }
+
+    /** Short version: don't use this function, use
+      * \link operator()(Index,Index) const \endlink instead.
+      *
+      * Long version: this function is similar to
+      * \link operator()(Index,Index) const \endlink, but without the assertion.
+      * Use this for limiting the performance cost of debugging code when doing
+      * repeated coefficient access. Only use this when it is guaranteed that the
+      * parameters \a row and \a col are in range.
+      *
+      * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
+      * function equivalent to \link operator()(Index,Index) const \endlink.
+      *
+      * \sa operator()(Index,Index) const, coeffRef(Index,Index), coeff(Index) const
+      */
+    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
+    {
+      eigen_internal_assert(row >= 0 && row < rows()
+                        && col >= 0 && col < cols());
+      return derived().coeff(row, col);
+    }
+
+    EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
+    {
+      return coeff(rowIndexByOuterInner(outer, inner),
+                   colIndexByOuterInner(outer, inner));
+    }
+
+    /** \returns the coefficient at given the given row and column.
+      *
+      * \sa operator()(Index,Index), operator[](Index)
+      */
+    EIGEN_STRONG_INLINE CoeffReturnType operator()(Index row, Index col) const
+    {
+      eigen_assert(row >= 0 && row < rows()
+          && col >= 0 && col < cols());
+      return derived().coeff(row, col);
+    }
+
+    /** Short version: don't use this function, use
+      * \link operator[](Index) const \endlink instead.
+      *
+      * Long version: this function is similar to
+      * \link operator[](Index) const \endlink, but without the assertion.
+      * Use this for limiting the performance cost of debugging code when doing
+      * repeated coefficient access. Only use this when it is guaranteed that the
+      * parameter \a index is in range.
+      *
+      * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
+      * function equivalent to \link operator[](Index) const \endlink.
+      *
+      * \sa operator[](Index) const, coeffRef(Index), coeff(Index,Index) const
+      */
+
+    EIGEN_STRONG_INLINE CoeffReturnType
+    coeff(Index index) const
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      return derived().coeff(index);
+    }
+
+
+    /** \returns the coefficient at given index.
+      *
+      * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
+      *
+      * \sa operator[](Index), operator()(Index,Index) const, x() const, y() const,
+      * z() const, w() const
+      */
+
+    EIGEN_STRONG_INLINE CoeffReturnType
+    operator[](Index index) const
+    {
+      #ifndef EIGEN2_SUPPORT
+      EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
+                          THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
+      #endif
+      eigen_assert(index >= 0 && index < size());
+      return derived().coeff(index);
+    }
+
+    /** \returns the coefficient at given index.
+      *
+      * This is synonymous to operator[](Index) const.
+      *
+      * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
+      *
+      * \sa operator[](Index), operator()(Index,Index) const, x() const, y() const,
+      * z() const, w() const
+      */
+
+    EIGEN_STRONG_INLINE CoeffReturnType
+    operator()(Index index) const
+    {
+      eigen_assert(index >= 0 && index < size());
+      return derived().coeff(index);
+    }
+
+    /** equivalent to operator[](0).  */
+
+    EIGEN_STRONG_INLINE CoeffReturnType
+    x() const { return (*this)[0]; }
+
+    /** equivalent to operator[](1).  */
+
+    EIGEN_STRONG_INLINE CoeffReturnType
+    y() const { return (*this)[1]; }
+
+    /** equivalent to operator[](2).  */
+
+    EIGEN_STRONG_INLINE CoeffReturnType
+    z() const { return (*this)[2]; }
+
+    /** equivalent to operator[](3).  */
+
+    EIGEN_STRONG_INLINE CoeffReturnType
+    w() const { return (*this)[3]; }
+
+    /** \internal
+      * \returns the packet of coefficients starting at the given row and column. It is your responsibility
+      * to ensure that a packet really starts there. This method is only available on expressions having the
+      * PacketAccessBit.
+      *
+      * The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
+      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+      * starting at an address which is a multiple of the packet size.
+      */
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketReturnType packet(Index row, Index col) const
+    {
+      eigen_internal_assert(row >= 0 && row < rows()
+                      && col >= 0 && col < cols());
+      return derived().template packet<LoadMode>(row,col);
+    }
+
+
+    /** \internal */
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketReturnType packetByOuterInner(Index outer, Index inner) const
+    {
+      return packet<LoadMode>(rowIndexByOuterInner(outer, inner),
+                              colIndexByOuterInner(outer, inner));
+    }
+
+    /** \internal
+      * \returns the packet of coefficients starting at the given index. It is your responsibility
+      * to ensure that a packet really starts there. This method is only available on expressions having the
+      * PacketAccessBit and the LinearAccessBit.
+      *
+      * The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
+      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+      * starting at an address which is a multiple of the packet size.
+      */
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      return derived().template packet<LoadMode>(index);
+    }
+
+  protected:
+    // explanation: DenseBase is doing "using ..." on the methods from DenseCoeffsBase.
+    // But some methods are only available in the DirectAccess case.
+    // So we add dummy methods here with these names, so that "using... " doesn't fail.
+    // It's not private so that the child class DenseBase can access them, and it's not public
+    // either since it's an implementation detail, so has to be protected.
+    void coeffRef();
+    void coeffRefByOuterInner();
+    void writePacket();
+    void writePacketByOuterInner();
+    void copyCoeff();
+    void copyCoeffByOuterInner();
+    void copyPacket();
+    void copyPacketByOuterInner();
+    void stride();
+    void innerStride();
+    void outerStride();
+    void rowStride();
+    void colStride();
+};
+
+/** \brief Base class providing read/write coefficient access to matrices and arrays.
+  * \ingroup Core_Module
+  * \tparam Derived Type of the derived class
+  * \tparam #WriteAccessors Constant indicating read/write access
+  *
+  * This class defines the non-const \c operator() function and friends, which can be used to write specific
+  * entries of a matrix or array. This class inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which
+  * defines the const variant for reading specific entries.
+  * 
+  * \sa DenseCoeffsBase<Derived, DirectAccessors>, \ref TopicClassHierarchy
+  */
+template<typename Derived>
+class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived, ReadOnlyAccessors>
+{
+  public:
+
+    typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
+
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    using Base::coeff;
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::derived;
+    using Base::rowIndexByOuterInner;
+    using Base::colIndexByOuterInner;
+    using Base::operator[];
+    using Base::operator();
+    using Base::x;
+    using Base::y;
+    using Base::z;
+    using Base::w;
+
+    /** Short version: don't use this function, use
+      * \link operator()(Index,Index) \endlink instead.
+      *
+      * Long version: this function is similar to
+      * \link operator()(Index,Index) \endlink, but without the assertion.
+      * Use this for limiting the performance cost of debugging code when doing
+      * repeated coefficient access. Only use this when it is guaranteed that the
+      * parameters \a row and \a col are in range.
+      *
+      * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
+      * function equivalent to \link operator()(Index,Index) \endlink.
+      *
+      * \sa operator()(Index,Index), coeff(Index, Index) const, coeffRef(Index)
+      */
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
+    {
+      eigen_internal_assert(row >= 0 && row < rows()
+                        && col >= 0 && col < cols());
+      return derived().coeffRef(row, col);
+    }
+
+    EIGEN_STRONG_INLINE Scalar&
+    coeffRefByOuterInner(Index outer, Index inner)
+    {
+      return coeffRef(rowIndexByOuterInner(outer, inner),
+                      colIndexByOuterInner(outer, inner));
+    }
+
+    /** \returns a reference to the coefficient at given the given row and column.
+      *
+      * \sa operator[](Index)
+      */
+
+    EIGEN_STRONG_INLINE Scalar&
+    operator()(Index row, Index col)
+    {
+      eigen_assert(row >= 0 && row < rows()
+          && col >= 0 && col < cols());
+      return derived().coeffRef(row, col);
+    }
+
+
+    /** Short version: don't use this function, use
+      * \link operator[](Index) \endlink instead.
+      *
+      * Long version: this function is similar to
+      * \link operator[](Index) \endlink, but without the assertion.
+      * Use this for limiting the performance cost of debugging code when doing
+      * repeated coefficient access. Only use this when it is guaranteed that the
+      * parameters \a row and \a col are in range.
+      *
+      * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
+      * function equivalent to \link operator[](Index) \endlink.
+      *
+      * \sa operator[](Index), coeff(Index) const, coeffRef(Index,Index)
+      */
+
+    EIGEN_STRONG_INLINE Scalar&
+    coeffRef(Index index)
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      return derived().coeffRef(index);
+    }
+
+    /** \returns a reference to the coefficient at given index.
+      *
+      * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
+      *
+      * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
+      */
+
+    EIGEN_STRONG_INLINE Scalar&
+    operator[](Index index)
+    {
+      #ifndef EIGEN2_SUPPORT
+      EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
+                          THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
+      #endif
+      eigen_assert(index >= 0 && index < size());
+      return derived().coeffRef(index);
+    }
+
+    /** \returns a reference to the coefficient at given index.
+      *
+      * This is synonymous to operator[](Index).
+      *
+      * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
+      *
+      * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
+      */
+
+    EIGEN_STRONG_INLINE Scalar&
+    operator()(Index index)
+    {
+      eigen_assert(index >= 0 && index < size());
+      return derived().coeffRef(index);
+    }
+
+    /** equivalent to operator[](0).  */
+
+    EIGEN_STRONG_INLINE Scalar&
+    x() { return (*this)[0]; }
+
+    /** equivalent to operator[](1).  */
+
+    EIGEN_STRONG_INLINE Scalar&
+    y() { return (*this)[1]; }
+
+    /** equivalent to operator[](2).  */
+
+    EIGEN_STRONG_INLINE Scalar&
+    z() { return (*this)[2]; }
+
+    /** equivalent to operator[](3).  */
+
+    EIGEN_STRONG_INLINE Scalar&
+    w() { return (*this)[3]; }
+
+    /** \internal
+      * Stores the given packet of coefficients, at the given row and column of this expression. It is your responsibility
+      * to ensure that a packet really starts there. This method is only available on expressions having the
+      * PacketAccessBit.
+      *
+      * The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
+      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+      * starting at an address which is a multiple of the packet size.
+      */
+
+    template<int StoreMode>
+    EIGEN_STRONG_INLINE void writePacket
+    (Index row, Index col, const typename internal::packet_traits<Scalar>::type& val)
+    {
+      eigen_internal_assert(row >= 0 && row < rows()
+                        && col >= 0 && col < cols());
+      derived().template writePacket<StoreMode>(row,col,val);
+    }
+
+
+    /** \internal */
+    template<int StoreMode>
+    EIGEN_STRONG_INLINE void writePacketByOuterInner
+    (Index outer, Index inner, const typename internal::packet_traits<Scalar>::type& val)
+    {
+      writePacket<StoreMode>(rowIndexByOuterInner(outer, inner),
+                            colIndexByOuterInner(outer, inner),
+                            val);
+    }
+
+    /** \internal
+      * Stores the given packet of coefficients, at the given index in this expression. It is your responsibility
+      * to ensure that a packet really starts there. This method is only available on expressions having the
+      * PacketAccessBit and the LinearAccessBit.
+      *
+      * The \a LoadMode parameter may have the value \a Aligned or \a Unaligned. Its effect is to select
+      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+      * starting at an address which is a multiple of the packet size.
+      */
+    template<int StoreMode>
+    EIGEN_STRONG_INLINE void writePacket
+    (Index index, const typename internal::packet_traits<Scalar>::type& val)
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      derived().template writePacket<StoreMode>(index,val);
+    }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+    /** \internal Copies the coefficient at position (row,col) of other into *this.
+      *
+      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
+      * with usual assignments.
+      *
+      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
+      */
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
+    {
+      eigen_internal_assert(row >= 0 && row < rows()
+                        && col >= 0 && col < cols());
+      derived().coeffRef(row, col) = other.derived().coeff(row, col);
+    }
+
+    /** \internal Copies the coefficient at the given index of other into *this.
+      *
+      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
+      * with usual assignments.
+      *
+      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
+      */
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      derived().coeffRef(index) = other.derived().coeff(index);
+    }
+
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE void copyCoeffByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
+    {
+      const Index row = rowIndexByOuterInner(outer,inner);
+      const Index col = colIndexByOuterInner(outer,inner);
+      // derived() is important here: copyCoeff() may be reimplemented in Derived!
+      derived().copyCoeff(row, col, other);
+    }
+
+    /** \internal Copies the packet at position (row,col) of other into *this.
+      *
+      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
+      * with usual assignments.
+      *
+      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
+      */
+
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    EIGEN_STRONG_INLINE void copyPacket(Index row, Index col, const DenseBase<OtherDerived>& other)
+    {
+      eigen_internal_assert(row >= 0 && row < rows()
+                        && col >= 0 && col < cols());
+      derived().template writePacket<StoreMode>(row, col,
+        other.derived().template packet<LoadMode>(row, col));
+    }
+
+    /** \internal Copies the packet at the given index of other into *this.
+      *
+      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
+      * with usual assignments.
+      *
+      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
+      */
+
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    EIGEN_STRONG_INLINE void copyPacket(Index index, const DenseBase<OtherDerived>& other)
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      derived().template writePacket<StoreMode>(index,
+        other.derived().template packet<LoadMode>(index));
+    }
+
+    /** \internal */
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    EIGEN_STRONG_INLINE void copyPacketByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
+    {
+      const Index row = rowIndexByOuterInner(outer,inner);
+      const Index col = colIndexByOuterInner(outer,inner);
+      // derived() is important here: copyCoeff() may be reimplemented in Derived!
+      derived().template copyPacket< OtherDerived, StoreMode, LoadMode>(row, col, other);
+    }
+#endif
+
+};
+
+/** \brief Base class providing direct read-only coefficient access to matrices and arrays.
+  * \ingroup Core_Module
+  * \tparam Derived Type of the derived class
+  * \tparam #DirectAccessors Constant indicating direct access
+  *
+  * This class defines functions to work with strides which can be used to access entries directly. This class
+  * inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which defines functions to access entries read-only using
+  * \c operator() .
+  *
+  * \sa \ref TopicClassHierarchy
+  */
+template<typename Derived>
+class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived, ReadOnlyAccessors>
+{
+  public:
+
+    typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
+    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::derived;
+
+    /** \returns the pointer increment between two consecutive elements within a slice in the inner direction.
+      *
+      * \sa outerStride(), rowStride(), colStride()
+      */
+    inline Index innerStride() const
+    {
+      return derived().innerStride();
+    }
+
+    /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
+      *          in a column-major matrix).
+      *
+      * \sa innerStride(), rowStride(), colStride()
+      */
+    inline Index outerStride() const
+    {
+      return derived().outerStride();
+    }
+
+    // FIXME shall we remove it ?
+    inline Index stride() const
+    {
+      return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
+    }
+
+    /** \returns the pointer increment between two consecutive rows.
+      *
+      * \sa innerStride(), outerStride(), colStride()
+      */
+    inline Index rowStride() const
+    {
+      return Derived::IsRowMajor ? outerStride() : innerStride();
+    }
+
+    /** \returns the pointer increment between two consecutive columns.
+      *
+      * \sa innerStride(), outerStride(), rowStride()
+      */
+    inline Index colStride() const
+    {
+      return Derived::IsRowMajor ? innerStride() : outerStride();
+    }
+};
+
+/** \brief Base class providing direct read/write coefficient access to matrices and arrays.
+  * \ingroup Core_Module
+  * \tparam Derived Type of the derived class
+  * \tparam #DirectWriteAccessors Constant indicating direct access
+  *
+  * This class defines functions to work with strides which can be used to access entries directly. This class
+  * inherits DenseCoeffsBase<Derived, WriteAccessors> which defines functions to access entries read/write using
+  * \c operator().
+  *
+  * \sa \ref TopicClassHierarchy
+  */
+template<typename Derived>
+class DenseCoeffsBase<Derived, DirectWriteAccessors>
+  : public DenseCoeffsBase<Derived, WriteAccessors>
+{
+  public:
+
+    typedef DenseCoeffsBase<Derived, WriteAccessors> Base;
+    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::derived;
+
+    /** \returns the pointer increment between two consecutive elements within a slice in the inner direction.
+      *
+      * \sa outerStride(), rowStride(), colStride()
+      */
+    inline Index innerStride() const
+    {
+      return derived().innerStride();
+    }
+
+    /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
+      *          in a column-major matrix).
+      *
+      * \sa innerStride(), rowStride(), colStride()
+      */
+    inline Index outerStride() const
+    {
+      return derived().outerStride();
+    }
+
+    // FIXME shall we remove it ?
+    inline Index stride() const
+    {
+      return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
+    }
+
+    /** \returns the pointer increment between two consecutive rows.
+      *
+      * \sa innerStride(), outerStride(), colStride()
+      */
+    inline Index rowStride() const
+    {
+      return Derived::IsRowMajor ? outerStride() : innerStride();
+    }
+
+    /** \returns the pointer increment between two consecutive columns.
+      *
+      * \sa innerStride(), outerStride(), rowStride()
+      */
+    inline Index colStride() const
+    {
+      return Derived::IsRowMajor ? innerStride() : outerStride();
+    }
+};
+
+namespace internal {
+
+template<typename Derived, bool JustReturnZero>
+struct first_aligned_impl
+{
+  static inline typename Derived::Index run(const Derived&)
+  { return 0; }
+};
+
+template<typename Derived>
+struct first_aligned_impl<Derived, false>
+{
+  static inline typename Derived::Index run(const Derived& m)
+  {
+    return internal::first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size());
+  }
+};
+
+/** \internal \returns the index of the first element of the array that is well aligned for vectorization.
+  *
+  * There is also the variant first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more
+  * documentation.
+  */
+template<typename Derived>
+static inline typename Derived::Index first_aligned(const Derived& m)
+{
+  return first_aligned_impl
+          <Derived, (Derived::Flags & AlignedBit) || !(Derived::Flags & DirectAccessBit)>
+          ::run(m);
+}
+
+template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
+struct inner_stride_at_compile_time
+{
+  enum { ret = traits<Derived>::InnerStrideAtCompileTime };
+};
+
+template<typename Derived>
+struct inner_stride_at_compile_time<Derived, false>
+{
+  enum { ret = 0 };
+};
+
+template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
+struct outer_stride_at_compile_time
+{
+  enum { ret = traits<Derived>::OuterStrideAtCompileTime };
+};
+
+template<typename Derived>
+struct outer_stride_at_compile_time<Derived, false>
+{
+  enum { ret = 0 };
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_DENSECOEFFSBASE_H
diff --git a/include/eigen3/Eigen/src/Core/DenseStorage.h b/include/eigen3/Eigen/src/Core/DenseStorage.h
new file mode 100644
index 0000000..a72738e
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/DenseStorage.h
@@ -0,0 +1,339 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIXSTORAGE_H
+#define EIGEN_MATRIXSTORAGE_H
+
+#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
+#else
+  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+struct constructor_without_unaligned_array_assert {};
+
+template<typename T, int Size> void check_static_allocation_size()
+{
+  // if EIGEN_STACK_ALLOCATION_LIMIT is defined to 0, then no limit
+  #if EIGEN_STACK_ALLOCATION_LIMIT
+  EIGEN_STATIC_ASSERT(Size * sizeof(T) <= EIGEN_STACK_ALLOCATION_LIMIT, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
+  #endif
+}
+
+/** \internal
+  * Static array. If the MatrixOrArrayOptions require auto-alignment, the array will be automatically aligned:
+  * to 16 bytes boundary if the total size is a multiple of 16 bytes.
+  */
+template <typename T, int Size, int MatrixOrArrayOptions,
+          int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
+                        : (((Size*sizeof(T))%16)==0) ? 16
+                        : 0 >
+struct plain_array
+{
+  T array[Size];
+
+  plain_array() 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+
+  plain_array(constructor_without_unaligned_array_assert) 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+#if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
+  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
+#elif EIGEN_GNUC_AT_LEAST(4,7) 
+  // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned.
+  // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
+  // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
+  template<typename PtrType>
+  EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
+  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
+    eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & sizemask) == 0 \
+              && "this assertion is explained here: " \
+              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
+              " **** READ THIS WEB PAGE !!! ****");
+#else
+  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
+    eigen_assert((reinterpret_cast<size_t>(array) & sizemask) == 0 \
+              && "this assertion is explained here: " \
+              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
+              " **** READ THIS WEB PAGE !!! ****");
+#endif
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 16>
+{
+  EIGEN_USER_ALIGN16 T array[Size];
+
+  plain_array() 
+  { 
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf);
+    check_static_allocation_size<T,Size>();
+  }
+
+  plain_array(constructor_without_unaligned_array_assert) 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int MatrixOrArrayOptions, int Alignment>
+struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
+{
+  EIGEN_USER_ALIGN16 T array[1];
+  plain_array() {}
+  plain_array(constructor_without_unaligned_array_assert) {}
+};
+
+} // end namespace internal
+
+/** \internal
+  *
+  * \class DenseStorage
+  * \ingroup Core_Module
+  *
+  * \brief Stores the data of a matrix
+  *
+  * This class stores the data of fixed-size, dynamic-size or mixed matrices
+  * in a way as compact as possible.
+  *
+  * \sa Matrix
+  */
+template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseStorage;
+
+// purely fixed-size matrix
+template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseStorage
+{
+    internal::plain_array<T,Size,_Options> m_data;
+  public:
+    inline DenseStorage() {}
+    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+      : m_data(internal::constructor_without_unaligned_array_assert()) {}
+    inline DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
+    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+    static inline DenseIndex rows(void) {return _Rows;}
+    static inline DenseIndex cols(void) {return _Cols;}
+    inline void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
+    inline void resize(DenseIndex,DenseIndex,DenseIndex) {}
+    inline const T *data() const { return m_data.array; }
+    inline T *data() { return m_data.array; }
+};
+
+// null matrix
+template<typename T, int _Rows, int _Cols, int _Options> class DenseStorage<T, 0, _Rows, _Cols, _Options>
+{
+  public:
+    inline DenseStorage() {}
+    inline DenseStorage(internal::constructor_without_unaligned_array_assert) {}
+    inline DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
+    inline void swap(DenseStorage& ) {}
+    static inline DenseIndex rows(void) {return _Rows;}
+    static inline DenseIndex cols(void) {return _Cols;}
+    inline void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
+    inline void resize(DenseIndex,DenseIndex,DenseIndex) {}
+    inline const T *data() const { return 0; }
+    inline T *data() { return 0; }
+};
+
+// more specializations for null matrices; these are necessary to resolve ambiguities
+template<typename T, int _Options> class DenseStorage<T, 0, Dynamic, Dynamic, _Options>
+: public DenseStorage<T, 0, 0, 0, _Options> { };
+
+template<typename T, int _Rows, int _Options> class DenseStorage<T, 0, _Rows, Dynamic, _Options>
+: public DenseStorage<T, 0, 0, 0, _Options> { };
+
+template<typename T, int _Cols, int _Options> class DenseStorage<T, 0, Dynamic, _Cols, _Options>
+: public DenseStorage<T, 0, 0, 0, _Options> { };
+
+// dynamic-size matrix with fixed-size storage
+template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic, Dynamic, _Options>
+{
+    internal::plain_array<T,Size,_Options> m_data;
+    DenseIndex m_rows;
+    DenseIndex m_cols;
+  public:
+    inline DenseStorage() : m_rows(0), m_cols(0) {}
+    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
+    inline DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) : m_rows(nbRows), m_cols(nbCols) {}
+    inline void swap(DenseStorage& other)
+    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+    inline DenseIndex rows() const {return m_rows;}
+    inline DenseIndex cols() const {return m_cols;}
+    inline void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
+    inline void resize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
+    inline const T *data() const { return m_data.array; }
+    inline T *data() { return m_data.array; }
+};
+
+// dynamic-size matrix with fixed-size storage and fixed width
+template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Size, Dynamic, _Cols, _Options>
+{
+    internal::plain_array<T,Size,_Options> m_data;
+    DenseIndex m_rows;
+  public:
+    inline DenseStorage() : m_rows(0) {}
+    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
+    inline DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex) : m_rows(nbRows) {}
+    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    inline DenseIndex rows(void) const {return m_rows;}
+    inline DenseIndex cols(void) const {return _Cols;}
+    inline void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
+    inline void resize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
+    inline const T *data() const { return m_data.array; }
+    inline T *data() { return m_data.array; }
+};
+
+// dynamic-size matrix with fixed-size storage and fixed height
+template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Size, _Rows, Dynamic, _Options>
+{
+    internal::plain_array<T,Size,_Options> m_data;
+    DenseIndex m_cols;
+  public:
+    inline DenseStorage() : m_cols(0) {}
+    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
+    inline DenseStorage(DenseIndex, DenseIndex, DenseIndex nbCols) : m_cols(nbCols) {}
+    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    inline DenseIndex rows(void) const {return _Rows;}
+    inline DenseIndex cols(void) const {return m_cols;}
+    inline void conservativeResize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
+    inline void resize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
+    inline const T *data() const { return m_data.array; }
+    inline T *data() { return m_data.array; }
+};
+
+// purely dynamic matrix.
+template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynamic, _Options>
+{
+    T *m_data;
+    DenseIndex m_rows;
+    DenseIndex m_cols;
+  public:
+    inline DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
+    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+       : m_data(0), m_rows(0), m_cols(0) {}
+    inline DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows), m_cols(nbCols)
+    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
+    inline ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
+    inline void swap(DenseStorage& other)
+    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+    inline DenseIndex rows(void) const {return m_rows;}
+    inline DenseIndex cols(void) const {return m_cols;}
+    inline void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
+    {
+      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
+      m_rows = nbRows;
+      m_cols = nbCols;
+    }
+    void resize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
+    {
+      if(size != m_rows*m_cols)
+      {
+        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols);
+        if (size)
+          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
+        else
+          m_data = 0;
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      }
+      m_rows = nbRows;
+      m_cols = nbCols;
+    }
+    inline const T *data() const { return m_data; }
+    inline T *data() { return m_data; }
+};
+
+// matrix with dynamic width and fixed height (so that matrix has dynamic size).
+template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Rows, Dynamic, _Options>
+{
+    T *m_data;
+    DenseIndex m_cols;
+  public:
+    inline DenseStorage() : m_data(0), m_cols(0) {}
+    inline DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
+    inline DenseStorage(DenseIndex size, DenseIndex, DenseIndex nbCols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(nbCols)
+    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
+    inline ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
+    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    static inline DenseIndex rows(void) {return _Rows;}
+    inline DenseIndex cols(void) const {return m_cols;}
+    inline void conservativeResize(DenseIndex size, DenseIndex, DenseIndex nbCols)
+    {
+      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
+      m_cols = nbCols;
+    }
+    EIGEN_STRONG_INLINE void resize(DenseIndex size, DenseIndex, DenseIndex nbCols)
+    {
+      if(size != _Rows*m_cols)
+      {
+        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols);
+        if (size)
+          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
+        else
+          m_data = 0;
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      }
+      m_cols = nbCols;
+    }
+    inline const T *data() const { return m_data; }
+    inline T *data() { return m_data; }
+};
+
+// matrix with dynamic height and fixed width (so that matrix has dynamic size).
+template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dynamic, _Cols, _Options>
+{
+    T *m_data;
+    DenseIndex m_rows;
+  public:
+    inline DenseStorage() : m_data(0), m_rows(0) {}
+    inline DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
+    inline DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows)
+    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
+    inline ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
+    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    inline DenseIndex rows(void) const {return m_rows;}
+    static inline DenseIndex cols(void) {return _Cols;}
+    inline void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex)
+    {
+      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
+      m_rows = nbRows;
+    }
+    EIGEN_STRONG_INLINE void resize(DenseIndex size, DenseIndex nbRows, DenseIndex)
+    {
+      if(size != m_rows*_Cols)
+      {
+        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows);
+        if (size)
+          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
+        else
+          m_data = 0;
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      }
+      m_rows = nbRows;
+    }
+    inline const T *data() const { return m_data; }
+    inline T *data() { return m_data; }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_H
diff --git a/include/eigen3/Eigen/src/Core/Diagonal.h b/include/eigen3/Eigen/src/Core/Diagonal.h
new file mode 100644
index 0000000..68cf6d4
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Diagonal.h
@@ -0,0 +1,237 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DIAGONAL_H
+#define EIGEN_DIAGONAL_H
+
+namespace Eigen { 
+
+/** \class Diagonal
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a diagonal/subdiagonal/superdiagonal in a matrix
+  *
+  * \param MatrixType the type of the object in which we are taking a sub/main/super diagonal
+  * \param DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal.
+  *              A positive value means a superdiagonal, a negative value means a subdiagonal.
+  *              You can also use Dynamic so the index can be set at runtime.
+  *
+  * The matrix is not required to be square.
+  *
+  * This class represents an expression of the main diagonal, or any sub/super diagonal
+  * of a square matrix. It is the return type of MatrixBase::diagonal() and MatrixBase::diagonal(Index) and most of the
+  * time this is the only way it is used.
+  *
+  * \sa MatrixBase::diagonal(), MatrixBase::diagonal(Index)
+  */
+
+namespace internal {
+template<typename MatrixType, int DiagIndex>
+struct traits<Diagonal<MatrixType,DiagIndex> >
+ : traits<MatrixType>
+{
+  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
+  typedef typename MatrixType::StorageKind StorageKind;
+  enum {
+    RowsAtCompileTime = (int(DiagIndex) == DynamicIndex || int(MatrixType::SizeAtCompileTime) == Dynamic) ? Dynamic
+                      : (EIGEN_PLAIN_ENUM_MIN(MatrixType::RowsAtCompileTime - EIGEN_PLAIN_ENUM_MAX(-DiagIndex, 0),
+                                              MatrixType::ColsAtCompileTime - EIGEN_PLAIN_ENUM_MAX( DiagIndex, 0))),
+    ColsAtCompileTime = 1,
+    MaxRowsAtCompileTime = int(MatrixType::MaxSizeAtCompileTime) == Dynamic ? Dynamic
+                         : DiagIndex == DynamicIndex ? EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::MaxRowsAtCompileTime,
+                                                                              MatrixType::MaxColsAtCompileTime)
+                         : (EIGEN_PLAIN_ENUM_MIN(MatrixType::MaxRowsAtCompileTime - EIGEN_PLAIN_ENUM_MAX(-DiagIndex, 0),
+                                                 MatrixType::MaxColsAtCompileTime - EIGEN_PLAIN_ENUM_MAX( DiagIndex, 0))),
+    MaxColsAtCompileTime = 1,
+    MaskLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags = (unsigned int)_MatrixTypeNested::Flags & (HereditaryBits | LinearAccessBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit,
+    CoeffReadCost = _MatrixTypeNested::CoeffReadCost,
+    MatrixTypeOuterStride = outer_stride_at_compile_time<MatrixType>::ret,
+    InnerStrideAtCompileTime = MatrixTypeOuterStride == Dynamic ? Dynamic : MatrixTypeOuterStride+1,
+    OuterStrideAtCompileTime = 0
+  };
+};
+}
+
+template<typename MatrixType, int _DiagIndex> class Diagonal
+   : public internal::dense_xpr_base< Diagonal<MatrixType,_DiagIndex> >::type
+{
+  public:
+
+    enum { DiagIndex = _DiagIndex };
+    typedef typename internal::dense_xpr_base<Diagonal>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
+
+    inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
+
+    inline Index rows() const
+    { return m_index.value()<0 ? (std::min<Index>)(m_matrix.cols(),m_matrix.rows()+m_index.value()) : (std::min<Index>)(m_matrix.rows(),m_matrix.cols()-m_index.value()); }
+
+    inline Index cols() const { return 1; }
+
+    inline Index innerStride() const
+    {
+      return m_matrix.outerStride() + 1;
+    }
+
+    inline Index outerStride() const
+    {
+      return 0;
+    }
+
+    typedef typename internal::conditional<
+                       internal::is_lvalue<MatrixType>::value,
+                       Scalar,
+                       const Scalar
+                     >::type ScalarWithConstIfNotLvalue;
+
+    inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
+    inline const Scalar* data() const { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
+
+    inline Scalar& coeffRef(Index row, Index)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
+    }
+
+    inline const Scalar& coeffRef(Index row, Index) const
+    {
+      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
+    }
+
+    inline CoeffReturnType coeff(Index row, Index) const
+    {
+      return m_matrix.coeff(row+rowOffset(), row+colOffset());
+    }
+
+    inline Scalar& coeffRef(Index idx)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
+    }
+
+    inline const Scalar& coeffRef(Index idx) const
+    {
+      return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
+    }
+
+    inline CoeffReturnType coeff(Index idx) const
+    {
+      return m_matrix.coeff(idx+rowOffset(), idx+colOffset());
+    }
+
+    const typename internal::remove_all<typename MatrixType::Nested>::type& 
+    nestedExpression() const 
+    {
+      return m_matrix;
+    }
+
+    int index() const
+    {
+      return m_index.value();
+    }
+
+  protected:
+    typename MatrixType::Nested m_matrix;
+    const internal::variable_if_dynamicindex<Index, DiagIndex> m_index;
+
+  private:
+    // some compilers may fail to optimize std::max etc in case of compile-time constants...
+    EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
+    EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
+    EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
+    // triger a compile time error is someone try to call packet
+    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
+    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index,Index) const;
+};
+
+/** \returns an expression of the main diagonal of the matrix \c *this
+  *
+  * \c *this is not required to be square.
+  *
+  * Example: \include MatrixBase_diagonal.cpp
+  * Output: \verbinclude MatrixBase_diagonal.out
+  *
+  * \sa class Diagonal */
+template<typename Derived>
+inline typename MatrixBase<Derived>::DiagonalReturnType
+MatrixBase<Derived>::diagonal()
+{
+  return derived();
+}
+
+/** This is the const version of diagonal(). */
+template<typename Derived>
+inline typename MatrixBase<Derived>::ConstDiagonalReturnType
+MatrixBase<Derived>::diagonal() const
+{
+  return ConstDiagonalReturnType(derived());
+}
+
+/** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this
+  *
+  * \c *this is not required to be square.
+  *
+  * The template parameter \a DiagIndex represent a super diagonal if \a DiagIndex > 0
+  * and a sub diagonal otherwise. \a DiagIndex == 0 is equivalent to the main diagonal.
+  *
+  * Example: \include MatrixBase_diagonal_int.cpp
+  * Output: \verbinclude MatrixBase_diagonal_int.out
+  *
+  * \sa MatrixBase::diagonal(), class Diagonal */
+template<typename Derived>
+inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
+MatrixBase<Derived>::diagonal(Index index)
+{
+  return DiagonalDynamicIndexReturnType(derived(), index);
+}
+
+/** This is the const version of diagonal(Index). */
+template<typename Derived>
+inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
+MatrixBase<Derived>::diagonal(Index index) const
+{
+  return ConstDiagonalDynamicIndexReturnType(derived(), index);
+}
+
+/** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this
+  *
+  * \c *this is not required to be square.
+  *
+  * The template parameter \a DiagIndex represent a super diagonal if \a DiagIndex > 0
+  * and a sub diagonal otherwise. \a DiagIndex == 0 is equivalent to the main diagonal.
+  *
+  * Example: \include MatrixBase_diagonal_template_int.cpp
+  * Output: \verbinclude MatrixBase_diagonal_template_int.out
+  *
+  * \sa MatrixBase::diagonal(), class Diagonal */
+template<typename Derived>
+template<int Index>
+inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index>::Type
+MatrixBase<Derived>::diagonal()
+{
+  return derived();
+}
+
+/** This is the const version of diagonal<int>(). */
+template<typename Derived>
+template<int Index>
+inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index>::Type
+MatrixBase<Derived>::diagonal() const
+{
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_DIAGONAL_H
diff --git a/include/eigen3/Eigen/src/Core/DiagonalMatrix.h b/include/eigen3/Eigen/src/Core/DiagonalMatrix.h
new file mode 100644
index 0000000..e6c220f
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/DiagonalMatrix.h
@@ -0,0 +1,313 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DIAGONALMATRIX_H
+#define EIGEN_DIAGONALMATRIX_H
+
+namespace Eigen { 
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename Derived>
+class DiagonalBase : public EigenBase<Derived>
+{
+  public:
+    typedef typename internal::traits<Derived>::DiagonalVectorType DiagonalVectorType;
+    typedef typename DiagonalVectorType::Scalar Scalar;
+    typedef typename DiagonalVectorType::RealScalar RealScalar;
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Index Index;
+
+    enum {
+      RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
+      ColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
+      MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+      MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+      IsVectorAtCompileTime = 0,
+      Flags = 0
+    };
+
+    typedef Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime, 0, MaxRowsAtCompileTime, MaxColsAtCompileTime> DenseMatrixType;
+    typedef DenseMatrixType DenseType;
+    typedef DiagonalMatrix<Scalar,DiagonalVectorType::SizeAtCompileTime,DiagonalVectorType::MaxSizeAtCompileTime> PlainObject;
+
+    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    inline Derived& derived() { return *static_cast<Derived*>(this); }
+
+    DenseMatrixType toDenseMatrix() const { return derived(); }
+    template<typename DenseDerived>
+    void evalTo(MatrixBase<DenseDerived> &other) const;
+    template<typename DenseDerived>
+    void addTo(MatrixBase<DenseDerived> &other) const
+    { other.diagonal() += diagonal(); }
+    template<typename DenseDerived>
+    void subTo(MatrixBase<DenseDerived> &other) const
+    { other.diagonal() -= diagonal(); }
+
+    inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
+    inline DiagonalVectorType& diagonal() { return derived().diagonal(); }
+
+    inline Index rows() const { return diagonal().size(); }
+    inline Index cols() const { return diagonal().size(); }
+
+    /** \returns the diagonal matrix product of \c *this by the matrix \a matrix.
+      */
+    template<typename MatrixDerived>
+    const DiagonalProduct<MatrixDerived, Derived, OnTheLeft>
+    operator*(const MatrixBase<MatrixDerived> &matrix) const
+    {
+      return DiagonalProduct<MatrixDerived, Derived, OnTheLeft>(matrix.derived(), derived());
+    }
+
+    inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> >
+    inverse() const
+    {
+      return diagonal().cwiseInverse();
+    }
+    
+    inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
+    operator*(const Scalar& scalar) const
+    {
+      return diagonal() * scalar;
+    }
+    friend inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
+    operator*(const Scalar& scalar, const DiagonalBase& other)
+    {
+      return other.diagonal() * scalar;
+    }
+    
+    #ifdef EIGEN2_SUPPORT
+    template<typename OtherDerived>
+    bool isApprox(const DiagonalBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
+    {
+      return diagonal().isApprox(other.diagonal(), precision);
+    }
+    template<typename OtherDerived>
+    bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
+    {
+      return toDenseMatrix().isApprox(other, precision);
+    }
+    #endif
+};
+
+template<typename Derived>
+template<typename DenseDerived>
+void DiagonalBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
+{
+  other.setZero();
+  other.diagonal() = diagonal();
+}
+#endif
+
+/** \class DiagonalMatrix
+  * \ingroup Core_Module
+  *
+  * \brief Represents a diagonal matrix with its storage
+  *
+  * \param _Scalar the type of coefficients
+  * \param SizeAtCompileTime the dimension of the matrix, or Dynamic
+  * \param MaxSizeAtCompileTime the dimension of the matrix, or Dynamic. This parameter is optional and defaults
+  *        to SizeAtCompileTime. Most of the time, you do not need to specify it.
+  *
+  * \sa class DiagonalWrapper
+  */
+
+namespace internal {
+template<typename _Scalar, int SizeAtCompileTime, int MaxSizeAtCompileTime>
+struct traits<DiagonalMatrix<_Scalar,SizeAtCompileTime,MaxSizeAtCompileTime> >
+ : traits<Matrix<_Scalar,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+{
+  typedef Matrix<_Scalar,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1> DiagonalVectorType;
+  typedef Dense StorageKind;
+  typedef DenseIndex Index;
+  enum {
+    Flags = LvalueBit
+  };
+};
+}
+template<typename _Scalar, int SizeAtCompileTime, int MaxSizeAtCompileTime>
+class DiagonalMatrix
+  : public DiagonalBase<DiagonalMatrix<_Scalar,SizeAtCompileTime,MaxSizeAtCompileTime> >
+{
+  public:
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef typename internal::traits<DiagonalMatrix>::DiagonalVectorType DiagonalVectorType;
+    typedef const DiagonalMatrix& Nested;
+    typedef _Scalar Scalar;
+    typedef typename internal::traits<DiagonalMatrix>::StorageKind StorageKind;
+    typedef typename internal::traits<DiagonalMatrix>::Index Index;
+    #endif
+
+  protected:
+
+    DiagonalVectorType m_diagonal;
+
+  public:
+
+    /** const version of diagonal(). */
+    inline const DiagonalVectorType& diagonal() const { return m_diagonal; }
+    /** \returns a reference to the stored vector of diagonal coefficients. */
+    inline DiagonalVectorType& diagonal() { return m_diagonal; }
+
+    /** Default constructor without initialization */
+    inline DiagonalMatrix() {}
+
+    /** Constructs a diagonal matrix with given dimension  */
+    inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
+
+    /** 2D constructor. */
+    inline DiagonalMatrix(const Scalar& x, const Scalar& y) : m_diagonal(x,y) {}
+
+    /** 3D constructor. */
+    inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {}
+
+    /** Copy constructor. */
+    template<typename OtherDerived>
+    inline DiagonalMatrix(const DiagonalBase<OtherDerived>& other) : m_diagonal(other.diagonal()) {}
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** copy constructor. prevent a default copy constructor from hiding the other templated constructor */
+    inline DiagonalMatrix(const DiagonalMatrix& other) : m_diagonal(other.diagonal()) {}
+    #endif
+
+    /** generic constructor from expression of the diagonal coefficients */
+    template<typename OtherDerived>
+    explicit inline DiagonalMatrix(const MatrixBase<OtherDerived>& other) : m_diagonal(other)
+    {}
+
+    /** Copy operator. */
+    template<typename OtherDerived>
+    DiagonalMatrix& operator=(const DiagonalBase<OtherDerived>& other)
+    {
+      m_diagonal = other.diagonal();
+      return *this;
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    DiagonalMatrix& operator=(const DiagonalMatrix& other)
+    {
+      m_diagonal = other.diagonal();
+      return *this;
+    }
+    #endif
+
+    /** Resizes to given size. */
+    inline void resize(Index size) { m_diagonal.resize(size); }
+    /** Sets all coefficients to zero. */
+    inline void setZero() { m_diagonal.setZero(); }
+    /** Resizes and sets all coefficients to zero. */
+    inline void setZero(Index size) { m_diagonal.setZero(size); }
+    /** Sets this matrix to be the identity matrix of the current size. */
+    inline void setIdentity() { m_diagonal.setOnes(); }
+    /** Sets this matrix to be the identity matrix of the given size. */
+    inline void setIdentity(Index size) { m_diagonal.setOnes(size); }
+};
+
+/** \class DiagonalWrapper
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a diagonal matrix
+  *
+  * \param _DiagonalVectorType the type of the vector of diagonal coefficients
+  *
+  * This class is an expression of a diagonal matrix, but not storing its own vector of diagonal coefficients,
+  * instead wrapping an existing vector expression. It is the return type of MatrixBase::asDiagonal()
+  * and most of the time this is the only way that it is used.
+  *
+  * \sa class DiagonalMatrix, class DiagonalBase, MatrixBase::asDiagonal()
+  */
+
+namespace internal {
+template<typename _DiagonalVectorType>
+struct traits<DiagonalWrapper<_DiagonalVectorType> >
+{
+  typedef _DiagonalVectorType DiagonalVectorType;
+  typedef typename DiagonalVectorType::Scalar Scalar;
+  typedef typename DiagonalVectorType::Index Index;
+  typedef typename DiagonalVectorType::StorageKind StorageKind;
+  enum {
+    RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
+    ColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
+    MaxRowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
+    MaxColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
+    Flags =  traits<DiagonalVectorType>::Flags & LvalueBit
+  };
+};
+}
+
+template<typename _DiagonalVectorType>
+class DiagonalWrapper
+  : public DiagonalBase<DiagonalWrapper<_DiagonalVectorType> >, internal::no_assignment_operator
+{
+  public:
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef _DiagonalVectorType DiagonalVectorType;
+    typedef DiagonalWrapper Nested;
+    #endif
+
+    /** Constructor from expression of diagonal coefficients to wrap. */
+    inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
+
+    /** \returns a const reference to the wrapped expression of diagonal coefficients. */
+    const DiagonalVectorType& diagonal() const { return m_diagonal; }
+
+  protected:
+    typename DiagonalVectorType::Nested m_diagonal;
+};
+
+/** \returns a pseudo-expression of a diagonal matrix with *this as vector of diagonal coefficients
+  *
+  * \only_for_vectors
+  *
+  * Example: \include MatrixBase_asDiagonal.cpp
+  * Output: \verbinclude MatrixBase_asDiagonal.out
+  *
+  * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
+  **/
+template<typename Derived>
+inline const DiagonalWrapper<const Derived>
+MatrixBase<Derived>::asDiagonal() const
+{
+  return derived();
+}
+
+/** \returns true if *this is approximately equal to a diagonal matrix,
+  *          within the precision given by \a prec.
+  *
+  * Example: \include MatrixBase_isDiagonal.cpp
+  * Output: \verbinclude MatrixBase_isDiagonal.out
+  *
+  * \sa asDiagonal()
+  */
+template<typename Derived>
+bool MatrixBase<Derived>::isDiagonal(const RealScalar& prec) const
+{
+  using std::abs;
+  if(cols() != rows()) return false;
+  RealScalar maxAbsOnDiagonal = static_cast<RealScalar>(-1);
+  for(Index j = 0; j < cols(); ++j)
+  {
+    RealScalar absOnDiagonal = abs(coeff(j,j));
+    if(absOnDiagonal > maxAbsOnDiagonal) maxAbsOnDiagonal = absOnDiagonal;
+  }
+  for(Index j = 0; j < cols(); ++j)
+    for(Index i = 0; i < j; ++i)
+    {
+      if(!internal::isMuchSmallerThan(coeff(i, j), maxAbsOnDiagonal, prec)) return false;
+      if(!internal::isMuchSmallerThan(coeff(j, i), maxAbsOnDiagonal, prec)) return false;
+    }
+  return true;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_DIAGONALMATRIX_H
diff --git a/include/eigen3/Eigen/src/Core/DiagonalProduct.h b/include/eigen3/Eigen/src/Core/DiagonalProduct.h
new file mode 100644
index 0000000..c03a0c2
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/DiagonalProduct.h
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DIAGONALPRODUCT_H
+#define EIGEN_DIAGONALPRODUCT_H
+
+namespace Eigen { 
+
+namespace internal {
+template<typename MatrixType, typename DiagonalType, int ProductOrder>
+struct traits<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
+ : traits<MatrixType>
+{
+  typedef typename scalar_product_traits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+
+    _StorageOrder = MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor,
+    _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
+                          ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
+    _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
+    // FIXME currently we need same types, but in the future the next rule should be the one
+    //_Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
+    _Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
+    _LinearAccessMask = (RowsAtCompileTime==1 || ColsAtCompileTime==1) ? LinearAccessBit : 0,
+
+    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixType::Flags)) | (_Vectorizable ? PacketAccessBit : 0) | AlignedBit,//(int(MatrixType::Flags)&int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit),
+    CoeffReadCost = NumTraits<Scalar>::MulCost + MatrixType::CoeffReadCost + DiagonalType::DiagonalVectorType::CoeffReadCost
+  };
+};
+}
+
+template<typename MatrixType, typename DiagonalType, int ProductOrder>
+class DiagonalProduct : internal::no_assignment_operator,
+                        public MatrixBase<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
+{
+  public:
+
+    typedef MatrixBase<DiagonalProduct> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(DiagonalProduct)
+
+    inline DiagonalProduct(const MatrixType& matrix, const DiagonalType& diagonal)
+      : m_matrix(matrix), m_diagonal(diagonal)
+    {
+      eigen_assert(diagonal.diagonal().size() == (ProductOrder == OnTheLeft ? matrix.rows() : matrix.cols()));
+    }
+
+    EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
+
+    EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+    {
+      return m_diagonal.diagonal().coeff(ProductOrder == OnTheLeft ? row : col) * m_matrix.coeff(row, col);
+    }
+    
+    EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
+    {
+      enum {
+        StorageOrder = int(MatrixType::Flags) & RowMajorBit ? RowMajor : ColMajor
+      };
+      return coeff(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
+    {
+      enum {
+        StorageOrder = Flags & RowMajorBit ? RowMajor : ColMajor
+      };
+      const Index indexInDiagonalVector = ProductOrder == OnTheLeft ? row : col;
+      return packet_impl<LoadMode>(row,col,indexInDiagonalVector,typename internal::conditional<
+        ((int(StorageOrder) == RowMajor && int(ProductOrder) == OnTheLeft)
+       ||(int(StorageOrder) == ColMajor && int(ProductOrder) == OnTheRight)), internal::true_type, internal::false_type>::type());
+    }
+    
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index idx) const
+    {
+      enum {
+        StorageOrder = int(MatrixType::Flags) & RowMajorBit ? RowMajor : ColMajor
+      };
+      return packet<LoadMode>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+    }
+
+  protected:
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::true_type) const
+    {
+      return internal::pmul(m_matrix.template packet<LoadMode>(row, col),
+                     internal::pset1<PacketScalar>(m_diagonal.diagonal().coeff(id)));
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::false_type) const
+    {
+      enum {
+        InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
+        DiagonalVectorPacketLoadMode = (LoadMode == Aligned && (((InnerSize%16) == 0) || (int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit)==AlignedBit) ? Aligned : Unaligned)
+      };
+      return internal::pmul(m_matrix.template packet<LoadMode>(row, col),
+                     m_diagonal.diagonal().template packet<DiagonalVectorPacketLoadMode>(id));
+    }
+
+    typename MatrixType::Nested m_matrix;
+    typename DiagonalType::Nested m_diagonal;
+};
+
+/** \returns the diagonal matrix product of \c *this by the diagonal matrix \a diagonal.
+  */
+template<typename Derived>
+template<typename DiagonalDerived>
+inline const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
+MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
+{
+  return DiagonalProduct<Derived, DiagonalDerived, OnTheRight>(derived(), a_diagonal.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_DIAGONALPRODUCT_H
diff --git a/include/eigen3/Eigen/src/Core/Dot.h b/include/eigen3/Eigen/src/Core/Dot.h
new file mode 100644
index 0000000..9d7651f
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Dot.h
@@ -0,0 +1,263 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008, 2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DOT_H
+#define EIGEN_DOT_H
+
+namespace Eigen { 
+
+namespace internal {
+
+// helper function for dot(). The problem is that if we put that in the body of dot(), then upon calling dot
+// with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE
+// looking at the static assertions. Thus this is a trick to get better compile errors.
+template<typename T, typename U,
+// the NeedToTranspose condition here is taken straight from Assign.h
+         bool NeedToTranspose = T::IsVectorAtCompileTime
+                && U::IsVectorAtCompileTime
+                && ((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1)
+                      |  // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
+                         // revert to || as soon as not needed anymore.
+                    (int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1))
+>
+struct dot_nocheck
+{
+  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+  {
+    return a.template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
+  }
+};
+
+template<typename T, typename U>
+struct dot_nocheck<T, U, true>
+{
+  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+  {
+    return a.transpose().template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
+  }
+};
+
+} // end namespace internal
+
+/** \returns the dot product of *this with other.
+  *
+  * \only_for_vectors
+  *
+  * \note If the scalar type is complex numbers, then this function returns the hermitian
+  * (sesquilinear) dot product, conjugate-linear in the first variable and linear in the
+  * second variable.
+  *
+  * \sa squaredNorm(), norm()
+  */
+template<typename Derived>
+template<typename OtherDerived>
+typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
+  typedef internal::scalar_conj_product_op<Scalar,typename OtherDerived::Scalar> func;
+  EIGEN_CHECK_BINARY_COMPATIBILIY(func,Scalar,typename OtherDerived::Scalar);
+
+  eigen_assert(size() == other.size());
+
+  return internal::dot_nocheck<Derived,OtherDerived>::run(*this, other);
+}
+
+#ifdef EIGEN2_SUPPORT
+/** \returns the dot product of *this with other, with the Eigen2 convention that the dot product is linear in the first variable
+  * (conjugating the second variable). Of course this only makes a difference in the complex case.
+  *
+  * This method is only available in EIGEN2_SUPPORT mode.
+  *
+  * \only_for_vectors
+  *
+  * \sa dot()
+  */
+template<typename Derived>
+template<typename OtherDerived>
+typename internal::traits<Derived>::Scalar
+MatrixBase<Derived>::eigen2_dot(const MatrixBase<OtherDerived>& other) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+  eigen_assert(size() == other.size());
+
+  return internal::dot_nocheck<OtherDerived,Derived>::run(other,*this);
+}
+#endif
+
+
+//---------- implementation of L2 norm and related functions ----------
+
+/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm.
+  * In both cases, it consists in the sum of the square of all the matrix entries.
+  * For vectors, this is also equals to the dot product of \c *this with itself.
+  *
+  * \sa dot(), norm()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
+{
+  return numext::real((*this).cwiseAbs2().sum());
+}
+
+/** \returns, for vectors, the \em l2 norm of \c *this, and for matrices the Frobenius norm.
+  * In both cases, it consists in the square root of the sum of the square of all the matrix entries.
+  * For vectors, this is also equals to the square root of the dot product of \c *this with itself.
+  *
+  * \sa dot(), squaredNorm()
+  */
+template<typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
+{
+  using std::sqrt;
+  return sqrt(squaredNorm());
+}
+
+/** \returns an expression of the quotient of *this by its own norm.
+  *
+  * \only_for_vectors
+  *
+  * \sa norm(), normalize()
+  */
+template<typename Derived>
+inline const typename MatrixBase<Derived>::PlainObject
+MatrixBase<Derived>::normalized() const
+{
+  typedef typename internal::nested<Derived>::type Nested;
+  typedef typename internal::remove_reference<Nested>::type _Nested;
+  _Nested n(derived());
+  return n / n.norm();
+}
+
+/** Normalizes the vector, i.e. divides it by its own norm.
+  *
+  * \only_for_vectors
+  *
+  * \sa norm(), normalized()
+  */
+template<typename Derived>
+inline void MatrixBase<Derived>::normalize()
+{
+  *this /= norm();
+}
+
+//---------- implementation of other norms ----------
+
+namespace internal {
+
+template<typename Derived, int p>
+struct lpNorm_selector
+{
+  typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
+  static inline RealScalar run(const MatrixBase<Derived>& m)
+  {
+    using std::pow;
+    return pow(m.cwiseAbs().array().pow(p).sum(), RealScalar(1)/p);
+  }
+};
+
+template<typename Derived>
+struct lpNorm_selector<Derived, 1>
+{
+  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
+  {
+    return m.cwiseAbs().sum();
+  }
+};
+
+template<typename Derived>
+struct lpNorm_selector<Derived, 2>
+{
+  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
+  {
+    return m.norm();
+  }
+};
+
+template<typename Derived>
+struct lpNorm_selector<Derived, Infinity>
+{
+  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
+  {
+    return m.cwiseAbs().maxCoeff();
+  }
+};
+
+} // end namespace internal
+
+/** \returns the \f$ \ell^p \f$ norm of *this, that is, returns the p-th root of the sum of the p-th powers of the absolute values
+  *          of the coefficients of *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
+  *          norm, that is the maximum of the absolute values of the coefficients of *this.
+  *
+  * \sa norm()
+  */
+template<typename Derived>
+template<int p>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+MatrixBase<Derived>::lpNorm() const
+{
+  return internal::lpNorm_selector<Derived, p>::run(*this);
+}
+
+//---------- implementation of isOrthogonal / isUnitary ----------
+
+/** \returns true if *this is approximately orthogonal to \a other,
+  *          within the precision given by \a prec.
+  *
+  * Example: \include MatrixBase_isOrthogonal.cpp
+  * Output: \verbinclude MatrixBase_isOrthogonal.out
+  */
+template<typename Derived>
+template<typename OtherDerived>
+bool MatrixBase<Derived>::isOrthogonal
+(const MatrixBase<OtherDerived>& other, const RealScalar& prec) const
+{
+  typename internal::nested<Derived,2>::type nested(derived());
+  typename internal::nested<OtherDerived,2>::type otherNested(other.derived());
+  return numext::abs2(nested.dot(otherNested)) <= prec * prec * nested.squaredNorm() * otherNested.squaredNorm();
+}
+
+/** \returns true if *this is approximately an unitary matrix,
+  *          within the precision given by \a prec. In the case where the \a Scalar
+  *          type is real numbers, a unitary matrix is an orthogonal matrix, whence the name.
+  *
+  * \note This can be used to check whether a family of vectors forms an orthonormal basis.
+  *       Indeed, \c m.isUnitary() returns true if and only if the columns (equivalently, the rows) of m form an
+  *       orthonormal basis.
+  *
+  * Example: \include MatrixBase_isUnitary.cpp
+  * Output: \verbinclude MatrixBase_isUnitary.out
+  */
+template<typename Derived>
+bool MatrixBase<Derived>::isUnitary(const RealScalar& prec) const
+{
+  typename Derived::Nested nested(derived());
+  for(Index i = 0; i < cols(); ++i)
+  {
+    if(!internal::isApprox(nested.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
+      return false;
+    for(Index j = 0; j < i; ++j)
+      if(!internal::isMuchSmallerThan(nested.col(i).dot(nested.col(j)), static_cast<Scalar>(1), prec))
+        return false;
+  }
+  return true;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_DOT_H
diff --git a/include/eigen3/Eigen/src/Core/EigenBase.h b/include/eigen3/Eigen/src/Core/EigenBase.h
new file mode 100644
index 0000000..fadb458
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/EigenBase.h
@@ -0,0 +1,131 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EIGENBASE_H
+#define EIGEN_EIGENBASE_H
+
+namespace Eigen {
+
+/** Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
+  *
+  * In other words, an EigenBase object is an object that can be copied into a MatrixBase.
+  *
+  * Besides MatrixBase-derived classes, this also includes special matrix classes such as diagonal matrices, etc.
+  *
+  * Notice that this class is trivial, it is only used to disambiguate overloaded functions.
+  *
+  * \sa \ref TopicClassHierarchy
+  */
+template<typename Derived> struct EigenBase
+{
+//   typedef typename internal::plain_matrix_type<Derived>::type PlainObject;
+
+  typedef typename internal::traits<Derived>::StorageKind StorageKind;
+  typedef typename internal::traits<Derived>::Index Index;
+
+  /** \returns a reference to the derived object */
+  Derived& derived() { return *static_cast<Derived*>(this); }
+  /** \returns a const reference to the derived object */
+  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+  inline Derived& const_cast_derived() const
+  { return *static_cast<Derived*>(const_cast<EigenBase*>(this)); }
+  inline const Derived& const_derived() const
+  { return *static_cast<const Derived*>(this); }
+
+  /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
+  inline Index rows() const { return derived().rows(); }
+  /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
+  inline Index cols() const { return derived().cols(); }
+  /** \returns the number of coefficients, which is rows()*cols().
+    * \sa rows(), cols(), SizeAtCompileTime. */
+  inline Index size() const { return rows() * cols(); }
+
+  /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
+  template<typename Dest> inline void evalTo(Dest& dst) const
+  { derived().evalTo(dst); }
+
+  /** \internal Don't use it, but do the equivalent: \code dst += *this; \endcode */
+  template<typename Dest> inline void addTo(Dest& dst) const
+  {
+    // This is the default implementation,
+    // derived class can reimplement it in a more optimized way.
+    typename Dest::PlainObject res(rows(),cols());
+    evalTo(res);
+    dst += res;
+  }
+
+  /** \internal Don't use it, but do the equivalent: \code dst -= *this; \endcode */
+  template<typename Dest> inline void subTo(Dest& dst) const
+  {
+    // This is the default implementation,
+    // derived class can reimplement it in a more optimized way.
+    typename Dest::PlainObject res(rows(),cols());
+    evalTo(res);
+    dst -= res;
+  }
+
+  /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheRight(*this); \endcode */
+  template<typename Dest> inline void applyThisOnTheRight(Dest& dst) const
+  {
+    // This is the default implementation,
+    // derived class can reimplement it in a more optimized way.
+    dst = dst * this->derived();
+  }
+
+  /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheLeft(*this); \endcode */
+  template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
+  {
+    // This is the default implementation,
+    // derived class can reimplement it in a more optimized way.
+    dst = this->derived() * dst;
+  }
+
+};
+
+/***************************************************************************
+* Implementation of matrix base methods
+***************************************************************************/
+
+/** \brief Copies the generic expression \a other into *this.
+  *
+  * \details The expression must provide a (templated) evalTo(Derived& dst) const
+  * function which does the actual job. In practice, this allows any user to write
+  * its own special matrix without having to modify MatrixBase
+  *
+  * \returns a reference to *this.
+  */
+template<typename Derived>
+template<typename OtherDerived>
+Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
+{
+  other.derived().evalTo(derived());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
+{
+  other.derived().addTo(derived());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
+{
+  other.derived().subTo(derived());
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_EIGENBASE_H
diff --git a/include/eigen3/Eigen/src/Core/Flagged.h b/include/eigen3/Eigen/src/Core/Flagged.h
new file mode 100644
index 0000000..1f2955f
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Flagged.h
@@ -0,0 +1,140 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FLAGGED_H
+#define EIGEN_FLAGGED_H
+
+namespace Eigen { 
+
+/** \class Flagged
+  * \ingroup Core_Module
+  *
+  * \brief Expression with modified flags
+  *
+  * \param ExpressionType the type of the object of which we are modifying the flags
+  * \param Added the flags added to the expression
+  * \param Removed the flags removed from the expression (has priority over Added).
+  *
+  * This class represents an expression whose flags have been modified.
+  * It is the return type of MatrixBase::flagged()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::flagged()
+  */
+
+namespace internal {
+template<typename ExpressionType, unsigned int Added, unsigned int Removed>
+struct traits<Flagged<ExpressionType, Added, Removed> > : traits<ExpressionType>
+{
+  enum { Flags = (ExpressionType::Flags | Added) & ~Removed };
+};
+}
+
+template<typename ExpressionType, unsigned int Added, unsigned int Removed> class Flagged
+  : public MatrixBase<Flagged<ExpressionType, Added, Removed> >
+{
+  public:
+
+    typedef MatrixBase<Flagged> Base;
+    
+    EIGEN_DENSE_PUBLIC_INTERFACE(Flagged)
+    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
+        ExpressionType, const ExpressionType&>::type ExpressionTypeNested;
+    typedef typename ExpressionType::InnerIterator InnerIterator;
+
+    inline Flagged(const ExpressionType& matrix) : m_matrix(matrix) {}
+
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }
+    inline Index outerStride() const { return m_matrix.outerStride(); }
+    inline Index innerStride() const { return m_matrix.innerStride(); }
+
+    inline CoeffReturnType coeff(Index row, Index col) const
+    {
+      return m_matrix.coeff(row, col);
+    }
+
+    inline CoeffReturnType coeff(Index index) const
+    {
+      return m_matrix.coeff(index);
+    }
+    
+    inline const Scalar& coeffRef(Index row, Index col) const
+    {
+      return m_matrix.const_cast_derived().coeffRef(row, col);
+    }
+
+    inline const Scalar& coeffRef(Index index) const
+    {
+      return m_matrix.const_cast_derived().coeffRef(index);
+    }
+
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      return m_matrix.const_cast_derived().coeffRef(row, col);
+    }
+
+    inline Scalar& coeffRef(Index index)
+    {
+      return m_matrix.const_cast_derived().coeffRef(index);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index row, Index col) const
+    {
+      return m_matrix.template packet<LoadMode>(row, col);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index row, Index col, const PacketScalar& x)
+    {
+      m_matrix.const_cast_derived().template writePacket<LoadMode>(row, col, x);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index index) const
+    {
+      return m_matrix.template packet<LoadMode>(index);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index index, const PacketScalar& x)
+    {
+      m_matrix.const_cast_derived().template writePacket<LoadMode>(index, x);
+    }
+
+    const ExpressionType& _expression() const { return m_matrix; }
+
+    template<typename OtherDerived>
+    typename ExpressionType::PlainObject solveTriangular(const MatrixBase<OtherDerived>& other) const;
+
+    template<typename OtherDerived>
+    void solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const;
+
+  protected:
+    ExpressionTypeNested m_matrix;
+};
+
+/** \returns an expression of *this with added and removed flags
+  *
+  * This is mostly for internal use.
+  *
+  * \sa class Flagged
+  */
+template<typename Derived>
+template<unsigned int Added,unsigned int Removed>
+inline const Flagged<Derived, Added, Removed>
+DenseBase<Derived>::flagged() const
+{
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_FLAGGED_H
diff --git a/include/eigen3/Eigen/src/Core/ForceAlignedAccess.h b/include/eigen3/Eigen/src/Core/ForceAlignedAccess.h
new file mode 100644
index 0000000..807c7a2
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/ForceAlignedAccess.h
@@ -0,0 +1,146 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FORCEALIGNEDACCESS_H
+#define EIGEN_FORCEALIGNEDACCESS_H
+
+namespace Eigen {
+
+/** \class ForceAlignedAccess
+  * \ingroup Core_Module
+  *
+  * \brief Enforce aligned packet loads and stores regardless of what is requested
+  *
+  * \param ExpressionType the type of the object of which we are forcing aligned packet access
+  *
+  * This class is the return type of MatrixBase::forceAlignedAccess()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::forceAlignedAccess()
+  */
+
+namespace internal {
+template<typename ExpressionType>
+struct traits<ForceAlignedAccess<ExpressionType> > : public traits<ExpressionType>
+{};
+}
+
+template<typename ExpressionType> class ForceAlignedAccess
+  : public internal::dense_xpr_base< ForceAlignedAccess<ExpressionType> >::type
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<ForceAlignedAccess>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(ForceAlignedAccess)
+
+    inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
+
+    inline Index rows() const { return m_expression.rows(); }
+    inline Index cols() const { return m_expression.cols(); }
+    inline Index outerStride() const { return m_expression.outerStride(); }
+    inline Index innerStride() const { return m_expression.innerStride(); }
+
+    inline const CoeffReturnType coeff(Index row, Index col) const
+    {
+      return m_expression.coeff(row, col);
+    }
+
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      return m_expression.const_cast_derived().coeffRef(row, col);
+    }
+
+    inline const CoeffReturnType coeff(Index index) const
+    {
+      return m_expression.coeff(index);
+    }
+
+    inline Scalar& coeffRef(Index index)
+    {
+      return m_expression.const_cast_derived().coeffRef(index);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index row, Index col) const
+    {
+      return m_expression.template packet<Aligned>(row, col);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index row, Index col, const PacketScalar& x)
+    {
+      m_expression.const_cast_derived().template writePacket<Aligned>(row, col, x);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index index) const
+    {
+      return m_expression.template packet<Aligned>(index);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index index, const PacketScalar& x)
+    {
+      m_expression.const_cast_derived().template writePacket<Aligned>(index, x);
+    }
+
+    operator const ExpressionType&() const { return m_expression; }
+
+  protected:
+    const ExpressionType& m_expression;
+
+  private:
+    ForceAlignedAccess& operator=(const ForceAlignedAccess&);
+};
+
+/** \returns an expression of *this with forced aligned access
+  * \sa forceAlignedAccessIf(),class ForceAlignedAccess
+  */
+template<typename Derived>
+inline const ForceAlignedAccess<Derived>
+MatrixBase<Derived>::forceAlignedAccess() const
+{
+  return ForceAlignedAccess<Derived>(derived());
+}
+
+/** \returns an expression of *this with forced aligned access
+  * \sa forceAlignedAccessIf(), class ForceAlignedAccess
+  */
+template<typename Derived>
+inline ForceAlignedAccess<Derived>
+MatrixBase<Derived>::forceAlignedAccess()
+{
+  return ForceAlignedAccess<Derived>(derived());
+}
+
+/** \returns an expression of *this with forced aligned access if \a Enable is true.
+  * \sa forceAlignedAccess(), class ForceAlignedAccess
+  */
+template<typename Derived>
+template<bool Enable>
+inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type
+MatrixBase<Derived>::forceAlignedAccessIf() const
+{
+  return derived();
+}
+
+/** \returns an expression of *this with forced aligned access if \a Enable is true.
+  * \sa forceAlignedAccess(), class ForceAlignedAccess
+  */
+template<typename Derived>
+template<bool Enable>
+inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type
+MatrixBase<Derived>::forceAlignedAccessIf()
+{
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_FORCEALIGNEDACCESS_H
diff --git a/include/eigen3/Eigen/src/Core/Functors.h b/include/eigen3/Eigen/src/Core/Functors.h
new file mode 100644
index 0000000..b08b967
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Functors.h
@@ -0,0 +1,985 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FUNCTORS_H
+#define EIGEN_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// associative functors:
+
+/** \internal
+  * \brief Template functor to compute the sum of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, MatrixBase::sum()
+  */
+template<typename Scalar> struct scalar_sum_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::padd(a,b); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  { return internal::predux(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sum_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasAdd
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the product of two scalars
+  *
+  * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
+  */
+template<typename LhsScalar,typename RhsScalar> struct scalar_product_op {
+  enum {
+    // TODO vectorize mixed product
+    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
+  };
+  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pmul(a,b); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
+  { return internal::predux_mul(a); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
+    PacketAccess = scalar_product_op<LhsScalar,RhsScalar>::Vectorizable
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the conjugate product of two scalars
+  *
+  * This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y)
+  */
+template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op {
+
+  enum {
+    Conj = NumTraits<LhsScalar>::IsComplex
+  };
+  
+  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
+  
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
+  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
+  { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
+  
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = NumTraits<LhsScalar>::MulCost,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the min of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
+  */
+template<typename Scalar> struct scalar_min_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
+  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { using std::min; return (min)(a, b); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pmin(a,b); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  { return internal::predux_min(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_min_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasMin
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the max of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
+  */
+template<typename Scalar> struct scalar_max_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
+  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { using std::max; return (max)(a, b); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pmax(a,b); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  { return internal::predux_max(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_max_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasMax
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the hypot of two scalars
+  *
+  * \sa MatrixBase::stableNorm(), class Redux
+  */
+template<typename Scalar> struct scalar_hypot_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
+//   typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
+  {
+    using std::max;
+    using std::min;
+    using std::sqrt;
+    Scalar p = (max)(_x, _y);
+    Scalar q = (min)(_x, _y);
+    Scalar qp = q/p;
+    return p * sqrt(Scalar(1) + qp*qp);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_hypot_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 };
+};
+
+/** \internal
+  * \brief Template functor to compute the pow of two scalars
+  */
+template<typename Scalar, typename OtherScalar> struct scalar_binary_pow_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_binary_pow_op)
+  inline Scalar operator() (const Scalar& a, const OtherScalar& b) const { return numext::pow(a, b); }
+};
+template<typename Scalar, typename OtherScalar>
+struct functor_traits<scalar_binary_pow_op<Scalar,OtherScalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+
+// other binary functors:
+
+/** \internal
+  * \brief Template functor to compute the difference of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::operator-
+  */
+template<typename Scalar> struct scalar_difference_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
+  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::psub(a,b); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_difference_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasSub
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the quotient of two scalars
+  *
+  * \sa class CwiseBinaryOp, Cwise::operator/()
+  */
+template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
+  enum {
+    // TODO vectorize mixed product
+    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv
+  };
+  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pdiv(a,b); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost), // rough estimate!
+    PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable
+  };
+};
+
+
+
+/** \internal
+  * \brief Template functor to compute the and of two booleans
+  *
+  * \sa class CwiseBinaryOp, ArrayBase::operator&&
+  */
+struct scalar_boolean_and_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
+  EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
+};
+template<> struct functor_traits<scalar_boolean_and_op> {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the or of two booleans
+  *
+  * \sa class CwiseBinaryOp, ArrayBase::operator||
+  */
+struct scalar_boolean_or_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
+  EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
+};
+template<> struct functor_traits<scalar_boolean_or_op> {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = false
+  };
+};
+
+// unary functors:
+
+/** \internal
+  * \brief Template functor to compute the opposite of a scalar
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::operator-
+  */
+template<typename Scalar> struct scalar_opposite_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_opposite_op)
+  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pnegate(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_opposite_op<Scalar> >
+{ enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasNegate };
+};
+
+/** \internal
+  * \brief Template functor to compute the absolute value of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::abs
+  */
+template<typename Scalar> struct scalar_abs_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using std::abs; return abs(a); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pabs(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_abs_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasAbs
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the squared absolute value of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::abs2
+  */
+template<typename Scalar> struct scalar_abs2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs2_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs2(a); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a,a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_abs2_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasAbs2 }; };
+
+/** \internal
+  * \brief Template functor to compute the conjugate of a complex value
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::conjugate()
+  */
+template<typename Scalar> struct scalar_conjugate_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op)
+  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_conjugate_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
+    PacketAccess = packet_traits<Scalar>::HasConj
+  };
+};
+
+/** \internal
+  * \brief Template functor to cast a scalar to another type
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::cast()
+  */
+template<typename Scalar, typename NewType>
+struct scalar_cast_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef NewType result_type;
+  EIGEN_STRONG_INLINE const NewType operator() (const Scalar& a) const { return cast<Scalar, NewType>(a); }
+};
+template<typename Scalar, typename NewType>
+struct functor_traits<scalar_cast_op<Scalar,NewType> >
+{ enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the real part of a complex
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::real()
+  */
+template<typename Scalar>
+struct scalar_real_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::real(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_real_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the imaginary part of a complex
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::imag()
+  */
+template<typename Scalar>
+struct scalar_imag_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::imag(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_imag_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the real part of a complex as a reference
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::real()
+  */
+template<typename Scalar>
+struct scalar_real_ref_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_ref_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::real_ref(*const_cast<Scalar*>(&a)); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_real_ref_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the imaginary part of a complex as a reference
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::imag()
+  */
+template<typename Scalar>
+struct scalar_imag_ref_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_ref_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::imag_ref(*const_cast<Scalar*>(&a)); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_imag_ref_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  *
+  * \brief Template functor to compute the exponential of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::exp()
+  */
+template<typename Scalar> struct scalar_exp_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op)
+  inline const Scalar operator() (const Scalar& a) const { using std::exp; return exp(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_exp_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasExp }; };
+
+/** \internal
+  *
+  * \brief Template functor to compute the logarithm of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::log()
+  */
+template<typename Scalar> struct scalar_log_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
+  inline const Scalar operator() (const Scalar& a) const { using std::log; return log(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_log_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };
+
+/** \internal
+  * \brief Template functor to multiply a scalar by a fixed other one
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
+  */
+/* NOTE why doing the pset1() in packetOp *is* an optimization ?
+ * indeed it seems better to declare m_other as a Packet and do the pset1() once
+ * in the constructor. However, in practice:
+ *  - GCC does not like m_other as a Packet and generate a load every time it needs it
+ *  - on the other hand GCC is able to moves the pset1() outside the loop :)
+ *  - simpler code ;)
+ * (ICC and gcc 4.4 seems to perform well in both cases, the issue is visible with y = a*x + b*y)
+ */
+template<typename Scalar>
+struct scalar_multiple_op {
+  typedef typename packet_traits<Scalar>::type Packet;
+  // FIXME default copy constructors seems bugged with std::complex<>
+  EIGEN_STRONG_INLINE scalar_multiple_op(const scalar_multiple_op& other) : m_other(other.m_other) { }
+  EIGEN_STRONG_INLINE scalar_multiple_op(const Scalar& other) : m_other(other) { }
+  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; }
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a, pset1<Packet>(m_other)); }
+  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
+};
+template<typename Scalar>
+struct functor_traits<scalar_multiple_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+
+template<typename Scalar1, typename Scalar2>
+struct scalar_multiple2_op {
+  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
+  EIGEN_STRONG_INLINE scalar_multiple2_op(const scalar_multiple2_op& other) : m_other(other.m_other) { }
+  EIGEN_STRONG_INLINE scalar_multiple2_op(const Scalar2& other) : m_other(other) { }
+  EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a * m_other; }
+  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;
+};
+template<typename Scalar1,typename Scalar2>
+struct functor_traits<scalar_multiple2_op<Scalar1,Scalar2> >
+{ enum { Cost = NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to divide a scalar by a fixed other one
+  *
+  * This functor is used to implement the quotient of a matrix by
+  * a scalar where the scalar type is not necessarily a floating point type.
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::operator/
+  */
+template<typename Scalar>
+struct scalar_quotient1_op {
+  typedef typename packet_traits<Scalar>::type Packet;
+  // FIXME default copy constructors seems bugged with std::complex<>
+  EIGEN_STRONG_INLINE scalar_quotient1_op(const scalar_quotient1_op& other) : m_other(other.m_other) { }
+  EIGEN_STRONG_INLINE scalar_quotient1_op(const Scalar& other) : m_other(other) {}
+  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a / m_other; }
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pdiv(a, pset1<Packet>(m_other)); }
+  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
+};
+template<typename Scalar>
+struct functor_traits<scalar_quotient1_op<Scalar> >
+{ enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
+
+// nullary functors
+
+template<typename Scalar>
+struct scalar_constant_op {
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { }
+  EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { }
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; }
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const { return internal::pset1<Packet>(m_other); }
+  const Scalar m_other;
+};
+template<typename Scalar>
+struct functor_traits<scalar_constant_op<Scalar> >
+// FIXME replace this packet test by a safe one
+{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
+
+template<typename Scalar> struct scalar_identity_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op)
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const { return row==col ? Scalar(1) : Scalar(0); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_identity_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
+
+template <typename Scalar, bool RandomAccess> struct linspaced_op_impl;
+
+// linear access for packet ops:
+// 1) initialization
+//   base = [low, ..., low] + ([step, ..., step] * [-size, ..., 0])
+// 2) each step (where size is 1 for coeff access or PacketSize for packet access)
+//   base += [size*step, ..., size*step]
+//
+// TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp)
+//       in order to avoid the padd() in operator() ?
+template <typename Scalar>
+struct linspaced_op_impl<Scalar,false>
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+
+  linspaced_op_impl(const Scalar& low, const Scalar& step) :
+  m_low(low), m_step(step),
+  m_packetStep(pset1<Packet>(packet_traits<Scalar>::size*step)),
+  m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(step),plset<Scalar>(-packet_traits<Scalar>::size)))) {}
+
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const 
+  { 
+    m_base = padd(m_base, pset1<Packet>(m_step));
+    return m_low+Scalar(i)*m_step; 
+  }
+
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = padd(m_base,m_packetStep); }
+
+  const Scalar m_low;
+  const Scalar m_step;
+  const Packet m_packetStep;
+  mutable Packet m_base;
+};
+
+// random access for packet ops:
+// 1) each step
+//   [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
+template <typename Scalar>
+struct linspaced_op_impl<Scalar,true>
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+
+  linspaced_op_impl(const Scalar& low, const Scalar& step) :
+  m_low(low), m_step(step),
+  m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Scalar>(0)) {}
+
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
+
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const
+  { return internal::padd(m_lowPacket, pmul(m_stepPacket, padd(pset1<Packet>(Scalar(i)),m_interPacket))); }
+
+  const Scalar m_low;
+  const Scalar m_step;
+  const Packet m_lowPacket;
+  const Packet m_stepPacket;
+  const Packet m_interPacket;
+};
+
+// ----- Linspace functor ----------------------------------------------------------------
+
+// Forward declaration (we default to random access which does not really give
+// us a speed gain when using packet access but it allows to use the functor in
+// nested expressions).
+template <typename Scalar, bool RandomAccess = true> struct linspaced_op;
+template <typename Scalar, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,RandomAccess> >
+{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; };
+template <typename Scalar, bool RandomAccess> struct linspaced_op
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+  linspaced_op(const Scalar& low, const Scalar& high, DenseIndex num_steps) : impl((num_steps==1 ? high : low), (num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1))) {}
+
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); }
+
+  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
+  // there row==0 and col is used for the actual iteration.
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const 
+  {
+    eigen_assert(col==0 || row==0);
+    return impl(col + row);
+  }
+
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return impl.packetOp(i); }
+
+  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
+  // there row==0 and col is used for the actual iteration.
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Packet packetOp(Index row, Index col) const
+  {
+    eigen_assert(col==0 || row==0);
+    return impl.packetOp(col + row);
+  }
+
+  // This proxy object handles the actual required temporaries, the different
+  // implementations (random vs. sequential access) as well as the
+  // correct piping to size 2/4 packet operations.
+  const linspaced_op_impl<Scalar,RandomAccess> impl;
+};
+
+// all functors allow linear access, except scalar_identity_op. So we fix here a quick meta
+// to indicate whether a functor allows linear access, just always answering 'yes' except for
+// scalar_identity_op.
+// FIXME move this to functor_traits adding a functor_default
+template<typename Functor> struct functor_has_linear_access { enum { ret = 1 }; };
+template<typename Scalar> struct functor_has_linear_access<scalar_identity_op<Scalar> > { enum { ret = 0 }; };
+
+// In Eigen, any binary op (Product, CwiseBinaryOp) require the Lhs and Rhs to have the same scalar type, except for multiplication
+// where the mixing of different types is handled by scalar_product_traits
+// In particular, real * complex<real> is allowed.
+// FIXME move this to functor_traits adding a functor_default
+template<typename Functor> struct functor_is_product_like { enum { ret = 0 }; };
+template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
+template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_conj_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
+template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_quotient_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
+
+
+/** \internal
+  * \brief Template functor to add a scalar to a fixed other one
+  * \sa class CwiseUnaryOp, Array::operator+
+  */
+/* If you wonder why doing the pset1() in packetOp() is an optimization check scalar_multiple_op */
+template<typename Scalar>
+struct scalar_add_op {
+  typedef typename packet_traits<Scalar>::type Packet;
+  // FIXME default copy constructors seems bugged with std::complex<>
+  inline scalar_add_op(const scalar_add_op& other) : m_other(other.m_other) { }
+  inline scalar_add_op(const Scalar& other) : m_other(other) { }
+  inline Scalar operator() (const Scalar& a) const { return a + m_other; }
+  inline const Packet packetOp(const Packet& a) const
+  { return internal::padd(a, pset1<Packet>(m_other)); }
+  const Scalar m_other;
+};
+template<typename Scalar>
+struct functor_traits<scalar_add_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
+
+/** \internal
+  * \brief Template functor to compute the square root of a scalar
+  * \sa class CwiseUnaryOp, Cwise::sqrt()
+  */
+template<typename Scalar> struct scalar_sqrt_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
+  inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return sqrt(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sqrt_op<Scalar> >
+{ enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasSqrt
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the cosine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::cos()
+  */
+template<typename Scalar> struct scalar_cos_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
+  inline Scalar operator() (const Scalar& a) const { using std::cos; return cos(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::pcos(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_cos_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasCos
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the sine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::sin()
+  */
+template<typename Scalar> struct scalar_sin_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
+  inline const Scalar operator() (const Scalar& a) const { using std::sin; return sin(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::psin(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sin_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasSin
+  };
+};
+
+
+/** \internal
+  * \brief Template functor to compute the tan of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::tan()
+  */
+template<typename Scalar> struct scalar_tan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
+  inline const Scalar operator() (const Scalar& a) const { using std::tan; return tan(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::ptan(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_tan_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasTan
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the arc cosine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::acos()
+  */
+template<typename Scalar> struct scalar_acos_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
+  inline const Scalar operator() (const Scalar& a) const { using std::acos; return acos(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::pacos(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_acos_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasACos
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the arc sine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::asin()
+  */
+template<typename Scalar> struct scalar_asin_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
+  inline const Scalar operator() (const Scalar& a) const { using std::asin; return asin(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::pasin(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_asin_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasASin
+  };
+};
+
+/** \internal
+  * \brief Template functor to raise a scalar to a power
+  * \sa class CwiseUnaryOp, Cwise::pow
+  */
+template<typename Scalar>
+struct scalar_pow_op {
+  // FIXME default copy constructors seems bugged with std::complex<>
+  inline scalar_pow_op(const scalar_pow_op& other) : m_exponent(other.m_exponent) { }
+  inline scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
+  inline Scalar operator() (const Scalar& a) const { return numext::pow(a, m_exponent); }
+  const Scalar m_exponent;
+};
+template<typename Scalar>
+struct functor_traits<scalar_pow_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to compute the quotient between a scalar and array entries.
+  * \sa class CwiseUnaryOp, Cwise::inverse()
+  */
+template<typename Scalar>
+struct scalar_inverse_mult_op {
+  scalar_inverse_mult_op(const Scalar& other) : m_other(other) {}
+  inline Scalar operator() (const Scalar& a) const { return m_other / a; }
+  template<typename Packet>
+  inline const Packet packetOp(const Packet& a) const
+  { return internal::pdiv(pset1<Packet>(m_other),a); }
+  Scalar m_other;
+};
+
+/** \internal
+  * \brief Template functor to compute the inverse of a scalar
+  * \sa class CwiseUnaryOp, Cwise::inverse()
+  */
+template<typename Scalar>
+struct scalar_inverse_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_op)
+  inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; }
+  template<typename Packet>
+  inline const Packet packetOp(const Packet& a) const
+  { return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_inverse_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
+
+/** \internal
+  * \brief Template functor to compute the square of a scalar
+  * \sa class CwiseUnaryOp, Cwise::square()
+  */
+template<typename Scalar>
+struct scalar_square_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
+  inline Scalar operator() (const Scalar& a) const { return a*a; }
+  template<typename Packet>
+  inline const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a,a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_square_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+
+/** \internal
+  * \brief Template functor to compute the cube of a scalar
+  * \sa class CwiseUnaryOp, Cwise::cube()
+  */
+template<typename Scalar>
+struct scalar_cube_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
+  inline Scalar operator() (const Scalar& a) const { return a*a*a; }
+  template<typename Packet>
+  inline const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a,pmul(a,a)); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_cube_op<Scalar> >
+{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+
+// default functor traits for STL functors:
+
+template<typename T>
+struct functor_traits<std::multiplies<T> >
+{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::divides<T> >
+{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::plus<T> >
+{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::minus<T> >
+{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::negate<T> >
+{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::logical_or<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::logical_and<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::logical_not<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::greater<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::less<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::greater_equal<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::less_equal<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::equal_to<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::not_equal_to<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::binder2nd<T> >
+{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::binder1st<T> >
+{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::unary_negate<T> >
+{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::binary_negate<T> >
+{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+
+#ifdef EIGEN_STDEXT_SUPPORT
+
+template<typename T0,typename T1>
+struct functor_traits<std::project1st<T0,T1> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::project2nd<T0,T1> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::select2nd<std::pair<T0,T1> > >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::select1st<std::pair<T0,T1> > >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::unary_compose<T0,T1> >
+{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost, PacketAccess = false }; };
+
+template<typename T0,typename T1,typename T2>
+struct functor_traits<std::binary_compose<T0,T1,T2> >
+{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost + functor_traits<T2>::Cost, PacketAccess = false }; };
+
+#endif // EIGEN_STDEXT_SUPPORT
+
+// allow to add new functors and specializations of functor_traits from outside Eigen.
+// this macro is really needed because functor_traits must be specialized after it is declared but before it is used...
+#ifdef EIGEN_FUNCTORS_PLUGIN
+#include EIGEN_FUNCTORS_PLUGIN
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_FUNCTORS_H
diff --git a/include/eigen3/Eigen/src/Core/Fuzzy.h b/include/eigen3/Eigen/src/Core/Fuzzy.h
new file mode 100644
index 0000000..fe63bd2
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Fuzzy.h
@@ -0,0 +1,150 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FUZZY_H
+#define EIGEN_FUZZY_H
+
+namespace Eigen { 
+
+namespace internal
+{
+
+template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
+struct isApprox_selector
+{
+  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
+  {
+    using std::min;
+    typename internal::nested<Derived,2>::type nested(x);
+    typename internal::nested<OtherDerived,2>::type otherNested(y);
+    return (nested - otherNested).cwiseAbs2().sum() <= prec * prec * (min)(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
+  }
+};
+
+template<typename Derived, typename OtherDerived>
+struct isApprox_selector<Derived, OtherDerived, true>
+{
+  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar&)
+  {
+    return x.matrix() == y.matrix();
+  }
+};
+
+template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
+struct isMuchSmallerThan_object_selector
+{
+  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
+  {
+    return x.cwiseAbs2().sum() <= numext::abs2(prec) * y.cwiseAbs2().sum();
+  }
+};
+
+template<typename Derived, typename OtherDerived>
+struct isMuchSmallerThan_object_selector<Derived, OtherDerived, true>
+{
+  static bool run(const Derived& x, const OtherDerived&, const typename Derived::RealScalar&)
+  {
+    return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
+  }
+};
+
+template<typename Derived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
+struct isMuchSmallerThan_scalar_selector
+{
+  static bool run(const Derived& x, const typename Derived::RealScalar& y, const typename Derived::RealScalar& prec)
+  {
+    return x.cwiseAbs2().sum() <= numext::abs2(prec * y);
+  }
+};
+
+template<typename Derived>
+struct isMuchSmallerThan_scalar_selector<Derived, true>
+{
+  static bool run(const Derived& x, const typename Derived::RealScalar&, const typename Derived::RealScalar&)
+  {
+    return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
+  }
+};
+
+} // end namespace internal
+
+
+/** \returns \c true if \c *this is approximately equal to \a other, within the precision
+  * determined by \a prec.
+  *
+  * \note The fuzzy compares are done multiplicatively. Two vectors \f$ v \f$ and \f$ w \f$
+  * are considered to be approximately equal within precision \f$ p \f$ if
+  * \f[ \Vert v - w \Vert \leqslant p\,\min(\Vert v\Vert, \Vert w\Vert). \f]
+  * For matrices, the comparison is done using the Hilbert-Schmidt norm (aka Frobenius norm
+  * L2 norm).
+  *
+  * \note Because of the multiplicativeness of this comparison, one can't use this function
+  * to check whether \c *this is approximately equal to the zero matrix or vector.
+  * Indeed, \c isApprox(zero) returns false unless \c *this itself is exactly the zero matrix
+  * or vector. If you want to test whether \c *this is zero, use internal::isMuchSmallerThan(const
+  * RealScalar&, RealScalar) instead.
+  *
+  * \sa internal::isMuchSmallerThan(const RealScalar&, RealScalar) const
+  */
+template<typename Derived>
+template<typename OtherDerived>
+bool DenseBase<Derived>::isApprox(
+  const DenseBase<OtherDerived>& other,
+  const RealScalar& prec
+) const
+{
+  return internal::isApprox_selector<Derived, OtherDerived>::run(derived(), other.derived(), prec);
+}
+
+/** \returns \c true if the norm of \c *this is much smaller than \a other,
+  * within the precision determined by \a prec.
+  *
+  * \note The fuzzy compares are done multiplicatively. A vector \f$ v \f$ is
+  * considered to be much smaller than \f$ x \f$ within precision \f$ p \f$ if
+  * \f[ \Vert v \Vert \leqslant p\,\vert x\vert. \f]
+  *
+  * For matrices, the comparison is done using the Hilbert-Schmidt norm. For this reason,
+  * the value of the reference scalar \a other should come from the Hilbert-Schmidt norm
+  * of a reference matrix of same dimensions.
+  *
+  * \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
+  */
+template<typename Derived>
+bool DenseBase<Derived>::isMuchSmallerThan(
+  const typename NumTraits<Scalar>::Real& other,
+  const RealScalar& prec
+) const
+{
+  return internal::isMuchSmallerThan_scalar_selector<Derived>::run(derived(), other, prec);
+}
+
+/** \returns \c true if the norm of \c *this is much smaller than the norm of \a other,
+  * within the precision determined by \a prec.
+  *
+  * \note The fuzzy compares are done multiplicatively. A vector \f$ v \f$ is
+  * considered to be much smaller than a vector \f$ w \f$ within precision \f$ p \f$ if
+  * \f[ \Vert v \Vert \leqslant p\,\Vert w\Vert. \f]
+  * For matrices, the comparison is done using the Hilbert-Schmidt norm.
+  *
+  * \sa isApprox(), isMuchSmallerThan(const RealScalar&, RealScalar) const
+  */
+template<typename Derived>
+template<typename OtherDerived>
+bool DenseBase<Derived>::isMuchSmallerThan(
+  const DenseBase<OtherDerived>& other,
+  const RealScalar& prec
+) const
+{
+  return internal::isMuchSmallerThan_object_selector<Derived, OtherDerived>::run(derived(), other.derived(), prec);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_FUZZY_H
diff --git a/include/eigen3/Eigen/src/Core/GeneralProduct.h b/include/eigen3/Eigen/src/Core/GeneralProduct.h
new file mode 100644
index 0000000..9e805a8
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/GeneralProduct.h
@@ -0,0 +1,635 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_PRODUCT_H
+#define EIGEN_GENERAL_PRODUCT_H
+
+namespace Eigen { 
+
+/** \class GeneralProduct
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the product of two general matrices or vectors
+  *
+  * \param LhsNested the type used to store the left-hand side
+  * \param RhsNested the type used to store the right-hand side
+  * \param ProductMode the type of the product
+  *
+  * This class represents an expression of the product of two general matrices.
+  * We call a general matrix, a dense matrix with full storage. For instance,
+  * This excludes triangular, selfadjoint, and sparse matrices.
+  * It is the return type of the operator* between general matrices. Its template
+  * arguments are determined automatically by ProductReturnType. Therefore,
+  * GeneralProduct should never be used direclty. To determine the result type of a
+  * function which involves a matrix product, use ProductReturnType::Type.
+  *
+  * \sa ProductReturnType, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
+  */
+template<typename Lhs, typename Rhs, int ProductType = internal::product_type<Lhs,Rhs>::value>
+class GeneralProduct;
+
+enum {
+  Large = 2,
+  Small = 3
+};
+
+namespace internal {
+
+template<int Rows, int Cols, int Depth> struct product_type_selector;
+
+template<int Size, int MaxSize> struct product_size_category
+{
+  enum { is_large = MaxSize == Dynamic ||
+                    Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD,
+         value = is_large  ? Large
+               : Size == 1 ? 1
+                           : Small
+  };
+};
+
+template<typename Lhs, typename Rhs> struct product_type
+{
+  typedef typename remove_all<Lhs>::type _Lhs;
+  typedef typename remove_all<Rhs>::type _Rhs;
+  enum {
+    MaxRows  = _Lhs::MaxRowsAtCompileTime,
+    Rows  = _Lhs::RowsAtCompileTime,
+    MaxCols  = _Rhs::MaxColsAtCompileTime,
+    Cols  = _Rhs::ColsAtCompileTime,
+    MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime,
+                                           _Rhs::MaxRowsAtCompileTime),
+    Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime,
+                                        _Rhs::RowsAtCompileTime),
+    LargeThreshold = EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+  };
+
+  // the splitting into different lines of code here, introducing the _select enums and the typedef below,
+  // is to work around an internal compiler error with gcc 4.1 and 4.2.
+private:
+  enum {
+    rows_select = product_size_category<Rows,MaxRows>::value,
+    cols_select = product_size_category<Cols,MaxCols>::value,
+    depth_select = product_size_category<Depth,MaxDepth>::value
+  };
+  typedef product_type_selector<rows_select, cols_select, depth_select> selector;
+
+public:
+  enum {
+    value = selector::ret
+  };
+#ifdef EIGEN_DEBUG_PRODUCT
+  static void debug()
+  {
+      EIGEN_DEBUG_VAR(Rows);
+      EIGEN_DEBUG_VAR(Cols);
+      EIGEN_DEBUG_VAR(Depth);
+      EIGEN_DEBUG_VAR(rows_select);
+      EIGEN_DEBUG_VAR(cols_select);
+      EIGEN_DEBUG_VAR(depth_select);
+      EIGEN_DEBUG_VAR(value);
+  }
+#endif
+};
+
+
+/* The following allows to select the kind of product at compile time
+ * based on the three dimensions of the product.
+ * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
+// FIXME I'm not sure the current mapping is the ideal one.
+template<int M, int N>  struct product_type_selector<M,N,1>              { enum { ret = OuterProduct }; };
+template<int Depth>     struct product_type_selector<1,    1,    Depth>  { enum { ret = InnerProduct }; };
+template<>              struct product_type_selector<1,    1,    1>      { enum { ret = InnerProduct }; };
+template<>              struct product_type_selector<Small,1,    Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<1,    Small,Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Small,Small,Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Small, Small, 1>    { enum { ret = LazyCoeffBasedProductMode }; };
+template<>              struct product_type_selector<Small, Large, 1>    { enum { ret = LazyCoeffBasedProductMode }; };
+template<>              struct product_type_selector<Large, Small, 1>    { enum { ret = LazyCoeffBasedProductMode }; };
+template<>              struct product_type_selector<1,    Large,Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<1,    Large,Large>  { enum { ret = GemvProduct }; };
+template<>              struct product_type_selector<1,    Small,Large>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Large,1,    Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Large,1,    Large>  { enum { ret = GemvProduct }; };
+template<>              struct product_type_selector<Small,1,    Large>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Small,Small,Large>  { enum { ret = GemmProduct }; };
+template<>              struct product_type_selector<Large,Small,Large>  { enum { ret = GemmProduct }; };
+template<>              struct product_type_selector<Small,Large,Large>  { enum { ret = GemmProduct }; };
+template<>              struct product_type_selector<Large,Large,Large>  { enum { ret = GemmProduct }; };
+template<>              struct product_type_selector<Large,Small,Small>  { enum { ret = GemmProduct }; };
+template<>              struct product_type_selector<Small,Large,Small>  { enum { ret = GemmProduct }; };
+template<>              struct product_type_selector<Large,Large,Small>  { enum { ret = GemmProduct }; };
+
+} // end namespace internal
+
+/** \class ProductReturnType
+  * \ingroup Core_Module
+  *
+  * \brief Helper class to get the correct and optimized returned type of operator*
+  *
+  * \param Lhs the type of the left-hand side
+  * \param Rhs the type of the right-hand side
+  * \param ProductMode the type of the product (determined automatically by internal::product_mode)
+  *
+  * This class defines the typename Type representing the optimized product expression
+  * between two matrix expressions. In practice, using ProductReturnType<Lhs,Rhs>::Type
+  * is the recommended way to define the result type of a function returning an expression
+  * which involve a matrix product. The class Product should never be
+  * used directly.
+  *
+  * \sa class Product, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
+  */
+template<typename Lhs, typename Rhs, int ProductType>
+struct ProductReturnType
+{
+  // TODO use the nested type to reduce instanciations ????
+//   typedef typename internal::nested<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
+//   typedef typename internal::nested<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+
+  typedef GeneralProduct<Lhs/*Nested*/, Rhs/*Nested*/, ProductType> Type;
+};
+
+template<typename Lhs, typename Rhs>
+struct ProductReturnType<Lhs,Rhs,CoeffBasedProductMode>
+{
+  typedef typename internal::nested<Lhs, Rhs::ColsAtCompileTime, typename internal::plain_matrix_type<Lhs>::type >::type LhsNested;
+  typedef typename internal::nested<Rhs, Lhs::RowsAtCompileTime, typename internal::plain_matrix_type<Rhs>::type >::type RhsNested;
+  typedef CoeffBasedProduct<LhsNested, RhsNested, EvalBeforeAssigningBit | EvalBeforeNestingBit> Type;
+};
+
+template<typename Lhs, typename Rhs>
+struct ProductReturnType<Lhs,Rhs,LazyCoeffBasedProductMode>
+{
+  typedef typename internal::nested<Lhs, Rhs::ColsAtCompileTime, typename internal::plain_matrix_type<Lhs>::type >::type LhsNested;
+  typedef typename internal::nested<Rhs, Lhs::RowsAtCompileTime, typename internal::plain_matrix_type<Rhs>::type >::type RhsNested;
+  typedef CoeffBasedProduct<LhsNested, RhsNested, NestByRefBit> Type;
+};
+
+// this is a workaround for sun CC
+template<typename Lhs, typename Rhs>
+struct LazyProductReturnType : public ProductReturnType<Lhs,Rhs,LazyCoeffBasedProductMode>
+{};
+
+/***********************************************************************
+*  Implementation of Inner Vector Vector Product
+***********************************************************************/
+
+// FIXME : maybe the "inner product" could return a Scalar
+// instead of a 1x1 matrix ??
+// Pro: more natural for the user
+// Cons: this could be a problem if in a meta unrolled algorithm a matrix-matrix
+// product ends up to a row-vector times col-vector product... To tackle this use
+// case, we could have a specialization for Block<MatrixType,1,1> with: operator=(Scalar x);
+
+namespace internal {
+
+template<typename Lhs, typename Rhs>
+struct traits<GeneralProduct<Lhs,Rhs,InnerProduct> >
+ : traits<Matrix<typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1> >
+{};
+
+}
+
+template<typename Lhs, typename Rhs>
+class GeneralProduct<Lhs, Rhs, InnerProduct>
+  : internal::no_assignment_operator,
+    public Matrix<typename internal::scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1>
+{
+    typedef Matrix<typename internal::scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1> Base;
+  public:
+    GeneralProduct(const Lhs& lhs, const Rhs& rhs)
+    {
+      EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+      Base::coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
+    }
+
+    /** Convertion to scalar */
+    operator const typename Base::Scalar() const {
+      return Base::coeff(0,0);
+    }
+};
+
+/***********************************************************************
+*  Implementation of Outer Vector Vector Product
+***********************************************************************/
+
+namespace internal {
+
+// Column major
+template<typename ProductType, typename Dest, typename Func>
+EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const false_type&)
+{
+  typedef typename Dest::Index Index;
+  // FIXME make sure lhs is sequentially stored
+  // FIXME not very good if rhs is real and lhs complex while alpha is real too
+  const Index cols = dest.cols();
+  for (Index j=0; j<cols; ++j)
+    func(dest.col(j), prod.rhs().coeff(0,j) * prod.lhs());
+}
+
+// Row major
+template<typename ProductType, typename Dest, typename Func>
+EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const true_type&) {
+  typedef typename Dest::Index Index;
+  // FIXME make sure rhs is sequentially stored
+  // FIXME not very good if lhs is real and rhs complex while alpha is real too
+  const Index rows = dest.rows();
+  for (Index i=0; i<rows; ++i)
+    func(dest.row(i), prod.lhs().coeff(i,0) * prod.rhs());
+}
+
+template<typename Lhs, typename Rhs>
+struct traits<GeneralProduct<Lhs,Rhs,OuterProduct> >
+ : traits<ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs> >
+{};
+
+}
+
+template<typename Lhs, typename Rhs>
+class GeneralProduct<Lhs, Rhs, OuterProduct>
+  : public ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs>
+{
+    template<typename T> struct IsRowMajor : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
+    
+  public:
+    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
+
+    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+    {
+      EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+    }
+    
+    struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
+    struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
+    struct sub  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
+    struct adds {
+      Scalar m_scale;
+      adds(const Scalar& s) : m_scale(s) {}
+      template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
+        dst.const_cast_derived() += m_scale * src;
+      }
+    };
+    
+    template<typename Dest>
+    inline void evalTo(Dest& dest) const {
+      internal::outer_product_selector_run(*this, dest, set(), IsRowMajor<Dest>());
+    }
+    
+    template<typename Dest>
+    inline void addTo(Dest& dest) const {
+      internal::outer_product_selector_run(*this, dest, add(), IsRowMajor<Dest>());
+    }
+
+    template<typename Dest>
+    inline void subTo(Dest& dest) const {
+      internal::outer_product_selector_run(*this, dest, sub(), IsRowMajor<Dest>());
+    }
+
+    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+    {
+      internal::outer_product_selector_run(*this, dest, adds(alpha), IsRowMajor<Dest>());
+    }
+};
+
+/***********************************************************************
+*  Implementation of General Matrix Vector Product
+***********************************************************************/
+
+/*  According to the shape/flags of the matrix we have to distinghish 3 different cases:
+ *   1 - the matrix is col-major, BLAS compatible and M is large => call fast BLAS-like colmajor routine
+ *   2 - the matrix is row-major, BLAS compatible and N is large => call fast BLAS-like rowmajor routine
+ *   3 - all other cases are handled using a simple loop along the outer-storage direction.
+ *  Therefore we need a lower level meta selector.
+ *  Furthermore, if the matrix is the rhs, then the product has to be transposed.
+ */
+namespace internal {
+
+template<typename Lhs, typename Rhs>
+struct traits<GeneralProduct<Lhs,Rhs,GemvProduct> >
+ : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemvProduct>, Lhs, Rhs> >
+{};
+
+template<int Side, int StorageOrder, bool BlasCompatible>
+struct gemv_selector;
+
+} // end namespace internal
+
+template<typename Lhs, typename Rhs>
+class GeneralProduct<Lhs, Rhs, GemvProduct>
+  : public ProductBase<GeneralProduct<Lhs,Rhs,GemvProduct>, Lhs, Rhs>
+{
+  public:
+    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
+
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Rhs::Scalar RhsScalar;
+
+    GeneralProduct(const Lhs& a_lhs, const Rhs& a_rhs) : Base(a_lhs,a_rhs)
+    {
+//       EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::Scalar, typename Rhs::Scalar>::value),
+//         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+    }
+
+    enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
+    typedef typename internal::conditional<int(Side)==OnTheRight,_LhsNested,_RhsNested>::type MatrixType;
+
+    template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+    {
+      eigen_assert(m_lhs.rows() == dst.rows() && m_rhs.cols() == dst.cols());
+      internal::gemv_selector<Side,(int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
+                       bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)>::run(*this, dst, alpha);
+    }
+};
+
+namespace internal {
+
+// The vector is on the left => transposition
+template<int StorageOrder, bool BlasCompatible>
+struct gemv_selector<OnTheLeft,StorageOrder,BlasCompatible>
+{
+  template<typename ProductType, typename Dest>
+  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  {
+    Transpose<Dest> destT(dest);
+    enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
+    gemv_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
+      ::run(GeneralProduct<Transpose<const typename ProductType::_RhsNested>,Transpose<const typename ProductType::_LhsNested>, GemvProduct>
+        (prod.rhs().transpose(), prod.lhs().transpose()), destT, alpha);
+  }
+};
+
+template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vector_if;
+
+template<typename Scalar,int Size,int MaxSize>
+struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
+{
+  EIGEN_STRONG_INLINE  Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
+};
+
+template<typename Scalar,int Size>
+struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
+{
+  EIGEN_STRONG_INLINE Scalar* data() { return 0; }
+};
+
+template<typename Scalar,int Size,int MaxSize>
+struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
+{
+  #if EIGEN_ALIGN_STATICALLY
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
+  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
+  #else
+  // Some architectures cannot align on the stack,
+  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
+  enum {
+    ForceAlignment  = internal::packet_traits<Scalar>::Vectorizable,
+    PacketSize      = internal::packet_traits<Scalar>::size
+  };
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
+  EIGEN_STRONG_INLINE Scalar* data() {
+    return ForceAlignment
+            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(15))) + 16)
+            : m_data.array;
+  }
+  #endif
+};
+
+template<> struct gemv_selector<OnTheRight,ColMajor,true>
+{
+  template<typename ProductType, typename Dest>
+  static inline void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  {
+    typedef typename ProductType::Index Index;
+    typedef typename ProductType::LhsScalar   LhsScalar;
+    typedef typename ProductType::RhsScalar   RhsScalar;
+    typedef typename ProductType::Scalar      ResScalar;
+    typedef typename ProductType::RealScalar  RealScalar;
+    typedef typename ProductType::ActualLhsType ActualLhsType;
+    typedef typename ProductType::ActualRhsType ActualRhsType;
+    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
+    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+
+    ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs());
+    ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
+                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+
+    enum {
+      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
+      // on, the other hand it is good for the cache to pack the vector anyways...
+      EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
+      ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
+      MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal
+    };
+
+    gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
+
+    bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+    bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+    
+    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
+
+    ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+                                                  evalToDest ? dest.data() : static_dest.data());
+    
+    if(!evalToDest)
+    {
+      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      int size = dest.size();
+      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      #endif
+      if(!alphaIsCompatible)
+      {
+        MappedDest(actualDestPtr, dest.size()).setZero();
+        compatibleAlpha = RhsScalar(1);
+      }
+      else
+        MappedDest(actualDestPtr, dest.size()) = dest;
+    }
+
+    general_matrix_vector_product
+      <Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
+        actualLhs.rows(), actualLhs.cols(),
+        actualLhs.data(), actualLhs.outerStride(),
+        actualRhs.data(), actualRhs.innerStride(),
+        actualDestPtr, 1,
+        compatibleAlpha);
+
+    if (!evalToDest)
+    {
+      if(!alphaIsCompatible)
+        dest += actualAlpha * MappedDest(actualDestPtr, dest.size());
+      else
+        dest = MappedDest(actualDestPtr, dest.size());
+    }
+  }
+};
+
+template<> struct gemv_selector<OnTheRight,RowMajor,true>
+{
+  template<typename ProductType, typename Dest>
+  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  {
+    typedef typename ProductType::LhsScalar LhsScalar;
+    typedef typename ProductType::RhsScalar RhsScalar;
+    typedef typename ProductType::Scalar    ResScalar;
+    typedef typename ProductType::Index Index;
+    typedef typename ProductType::ActualLhsType ActualLhsType;
+    typedef typename ProductType::ActualRhsType ActualRhsType;
+    typedef typename ProductType::_ActualRhsType _ActualRhsType;
+    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
+    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+
+    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
+                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+
+    enum {
+      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
+      // on, the other hand it is good for the cache to pack the vector anyways...
+      DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
+    };
+
+    gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+
+    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
+        DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
+
+    if(!DirectlyUseRhs)
+    {
+      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      int size = actualRhs.size();
+      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      #endif
+      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+    }
+
+    general_matrix_vector_product
+      <Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
+        actualLhs.rows(), actualLhs.cols(),
+        actualLhs.data(), actualLhs.outerStride(),
+        actualRhsPtr, 1,
+        dest.data(), dest.innerStride(),
+        actualAlpha);
+  }
+};
+
+template<> struct gemv_selector<OnTheRight,ColMajor,false>
+{
+  template<typename ProductType, typename Dest>
+  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  {
+    typedef typename Dest::Index Index;
+    // TODO makes sure dest is sequentially stored in memory, otherwise use a temp
+    const Index size = prod.rhs().rows();
+    for(Index k=0; k<size; ++k)
+      dest += (alpha*prod.rhs().coeff(k)) * prod.lhs().col(k);
+  }
+};
+
+template<> struct gemv_selector<OnTheRight,RowMajor,false>
+{
+  template<typename ProductType, typename Dest>
+  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  {
+    typedef typename Dest::Index Index;
+    // TODO makes sure rhs is sequentially stored in memory, otherwise use a temp
+    const Index rows = prod.rows();
+    for(Index i=0; i<rows; ++i)
+      dest.coeffRef(i) += alpha * (prod.lhs().row(i).cwiseProduct(prod.rhs().transpose())).sum();
+  }
+};
+
+} // end namespace internal
+
+/***************************************************************************
+* Implementation of matrix base methods
+***************************************************************************/
+
+/** \returns the matrix product of \c *this and \a other.
+  *
+  * \note If instead of the matrix product you want the coefficient-wise product, see Cwise::operator*().
+  *
+  * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline const typename ProductReturnType<Derived, OtherDerived>::Type
+MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
+{
+  // A note regarding the function declaration: In MSVC, this function will sometimes
+  // not be inlined since DenseStorage is an unwindable object for dynamic
+  // matrices and product types are holding a member to store the result.
+  // Thus it does not help tagging this function with EIGEN_STRONG_INLINE.
+  enum {
+    ProductIsValid =  Derived::ColsAtCompileTime==Dynamic
+                   || OtherDerived::RowsAtCompileTime==Dynamic
+                   || int(Derived::ColsAtCompileTime)==int(OtherDerived::RowsAtCompileTime),
+    AreVectors = Derived::IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime,
+    SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(Derived,OtherDerived)
+  };
+  // note to the lost user:
+  //    * for a dot product use: v1.dot(v2)
+  //    * for a coeff-wise product use: v1.cwiseProduct(v2)
+  EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
+    INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
+  EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
+    INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
+  EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
+#ifdef EIGEN_DEBUG_PRODUCT
+  internal::product_type<Derived,OtherDerived>::debug();
+#endif
+  return typename ProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
+}
+
+/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
+  *
+  * The returned product will behave like any other expressions: the coefficients of the product will be
+  * computed once at a time as requested. This might be useful in some extremely rare cases when only
+  * a small and no coherent fraction of the result's coefficients have to be computed.
+  *
+  * \warning This version of the matrix product can be much much slower. So use it only if you know
+  * what you are doing and that you measured a true speed improvement.
+  *
+  * \sa operator*(const MatrixBase&)
+  */
+template<typename Derived>
+template<typename OtherDerived>
+const typename LazyProductReturnType<Derived,OtherDerived>::Type
+MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
+{
+  enum {
+    ProductIsValid =  Derived::ColsAtCompileTime==Dynamic
+                   || OtherDerived::RowsAtCompileTime==Dynamic
+                   || int(Derived::ColsAtCompileTime)==int(OtherDerived::RowsAtCompileTime),
+    AreVectors = Derived::IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime,
+    SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(Derived,OtherDerived)
+  };
+  // note to the lost user:
+  //    * for a dot product use: v1.dot(v2)
+  //    * for a coeff-wise product use: v1.cwiseProduct(v2)
+  EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
+    INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
+  EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
+    INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
+  EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
+
+  return typename LazyProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_PRODUCT_H
diff --git a/include/eigen3/Eigen/src/Core/GenericPacketMath.h b/include/eigen3/Eigen/src/Core/GenericPacketMath.h
new file mode 100644
index 0000000..5f783eb
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/GenericPacketMath.h
@@ -0,0 +1,350 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERIC_PACKET_MATH_H
+#define EIGEN_GENERIC_PACKET_MATH_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+  * \file GenericPacketMath.h
+  *
+  * Default implementation for types not supported by the vectorization.
+  * In practice these functions are provided to make easier the writing
+  * of generic vectorized code.
+  */
+
+#ifndef EIGEN_DEBUG_ALIGNED_LOAD
+#define EIGEN_DEBUG_ALIGNED_LOAD
+#endif
+
+#ifndef EIGEN_DEBUG_UNALIGNED_LOAD
+#define EIGEN_DEBUG_UNALIGNED_LOAD
+#endif
+
+#ifndef EIGEN_DEBUG_ALIGNED_STORE
+#define EIGEN_DEBUG_ALIGNED_STORE
+#endif
+
+#ifndef EIGEN_DEBUG_UNALIGNED_STORE
+#define EIGEN_DEBUG_UNALIGNED_STORE
+#endif
+
+struct default_packet_traits
+{
+  enum {
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasAbs2   = 1,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 1,
+    HasSetLinear = 1,
+
+    HasDiv    = 0,
+    HasSqrt   = 0,
+    HasExp    = 0,
+    HasLog    = 0,
+    HasPow    = 0,
+
+    HasSin    = 0,
+    HasCos    = 0,
+    HasTan    = 0,
+    HasASin   = 0,
+    HasACos   = 0,
+    HasATan   = 0
+  };
+};
+
+template<typename T> struct packet_traits : default_packet_traits
+{
+  typedef T type;
+  enum {
+    Vectorizable = 0,
+    size = 1,
+    AlignedOnScalar = 0
+  };
+  enum {
+    HasAdd    = 0,
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0
+  };
+};
+
+/** \internal \returns a + b (coeff-wise) */
+template<typename Packet> inline Packet
+padd(const Packet& a,
+        const Packet& b) { return a+b; }
+
+/** \internal \returns a - b (coeff-wise) */
+template<typename Packet> inline Packet
+psub(const Packet& a,
+        const Packet& b) { return a-b; }
+
+/** \internal \returns -a (coeff-wise) */
+template<typename Packet> inline Packet
+pnegate(const Packet& a) { return -a; }
+
+/** \internal \returns conj(a) (coeff-wise) */
+template<typename Packet> inline Packet
+pconj(const Packet& a) { return numext::conj(a); }
+
+/** \internal \returns a * b (coeff-wise) */
+template<typename Packet> inline Packet
+pmul(const Packet& a,
+        const Packet& b) { return a*b; }
+
+/** \internal \returns a / b (coeff-wise) */
+template<typename Packet> inline Packet
+pdiv(const Packet& a,
+        const Packet& b) { return a/b; }
+
+/** \internal \returns the min of \a a and \a b  (coeff-wise) */
+template<typename Packet> inline Packet
+pmin(const Packet& a,
+        const Packet& b) { using std::min; return (min)(a, b); }
+
+/** \internal \returns the max of \a a and \a b  (coeff-wise) */
+template<typename Packet> inline Packet
+pmax(const Packet& a,
+        const Packet& b) { using std::max; return (max)(a, b); }
+
+/** \internal \returns the absolute value of \a a */
+template<typename Packet> inline Packet
+pabs(const Packet& a) { using std::abs; return abs(a); }
+
+/** \internal \returns the bitwise and of \a a and \a b */
+template<typename Packet> inline Packet
+pand(const Packet& a, const Packet& b) { return a & b; }
+
+/** \internal \returns the bitwise or of \a a and \a b */
+template<typename Packet> inline Packet
+por(const Packet& a, const Packet& b) { return a | b; }
+
+/** \internal \returns the bitwise xor of \a a and \a b */
+template<typename Packet> inline Packet
+pxor(const Packet& a, const Packet& b) { return a ^ b; }
+
+/** \internal \returns the bitwise andnot of \a a and \a b */
+template<typename Packet> inline Packet
+pandnot(const Packet& a, const Packet& b) { return a & (!b); }
+
+/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
+template<typename Packet> inline Packet
+pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
+
+/** \internal \returns a packet version of \a *from, (un-aligned load) */
+template<typename Packet> inline Packet
+ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
+
+/** \internal \returns a packet with elements of \a *from duplicated.
+  * For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and
+  * duplicated to form: {from[0],from[0],from[1],from[1],,from[2],from[2],,from[3],from[3]}
+  * Currently, this function is only used for scalar * complex products.
+ */
+template<typename Packet> inline Packet
+ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
+
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
+template<typename Packet> inline Packet
+pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+
+/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
+template<typename Scalar> inline typename packet_traits<Scalar>::type
+plset(const Scalar& a) { return a; }
+
+/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
+template<typename Scalar, typename Packet> inline void pstore(Scalar* to, const Packet& from)
+{ (*to) = from; }
+
+/** \internal copy the packet \a from to \a *to, (un-aligned store) */
+template<typename Scalar, typename Packet> inline void pstoreu(Scalar* to, const Packet& from)
+{ (*to) = from; }
+
+/** \internal tries to do cache prefetching of \a addr */
+template<typename Scalar> inline void prefetch(const Scalar* addr)
+{
+#if !defined(_MSC_VER)
+__builtin_prefetch(addr);
+#endif
+}
+
+/** \internal \returns the first element of a packet */
+template<typename Packet> inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
+{ return a; }
+
+/** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */
+template<typename Packet> inline Packet
+preduxp(const Packet* vecs) { return vecs[0]; }
+
+/** \internal \returns the sum of the elements of \a a*/
+template<typename Packet> inline typename unpacket_traits<Packet>::type predux(const Packet& a)
+{ return a; }
+
+/** \internal \returns the product of the elements of \a a*/
+template<typename Packet> inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
+{ return a; }
+
+/** \internal \returns the min of the elements of \a a*/
+template<typename Packet> inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
+{ return a; }
+
+/** \internal \returns the max of the elements of \a a*/
+template<typename Packet> inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
+{ return a; }
+
+/** \internal \returns the reversed elements of \a a*/
+template<typename Packet> inline Packet preverse(const Packet& a)
+{ return a; }
+
+
+/** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
+template<typename Packet> inline Packet pcplxflip(const Packet& a)
+{
+  // FIXME: uncomment the following in case we drop the internal imag and real functions.
+//   using std::imag;
+//   using std::real;
+  return Packet(imag(a),real(a));
+}
+
+/**************************
+* Special math functions
+***************************/
+
+/** \internal \returns the sine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet psin(const Packet& a) { using std::sin; return sin(a); }
+
+/** \internal \returns the cosine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pcos(const Packet& a) { using std::cos; return cos(a); }
+
+/** \internal \returns the tan of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ptan(const Packet& a) { using std::tan; return tan(a); }
+
+/** \internal \returns the arc sine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pasin(const Packet& a) { using std::asin; return asin(a); }
+
+/** \internal \returns the arc cosine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pacos(const Packet& a) { using std::acos; return acos(a); }
+
+/** \internal \returns the exp of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pexp(const Packet& a) { using std::exp; return exp(a); }
+
+/** \internal \returns the log of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog(const Packet& a) { using std::log; return log(a); }
+
+/** \internal \returns the square-root of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }
+
+/***************************************************************************
+* The following functions might not have to be overwritten for vectorized types
+***************************************************************************/
+
+/** \internal copy a packet with constant coeficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */
+// NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type)
+template<typename Packet>
+inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename unpacket_traits<Packet>::type& a)
+{
+  pstore(to, pset1<Packet>(a));
+}
+
+/** \internal \returns a * b + c (coeff-wise) */
+template<typename Packet> inline Packet
+pmadd(const Packet&  a,
+         const Packet&  b,
+         const Packet&  c)
+{ return padd(pmul(a, b),c); }
+
+/** \internal \returns a packet version of \a *from.
+  * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */
+template<typename Packet, int LoadMode>
+inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
+{
+  if(LoadMode == Aligned)
+    return pload<Packet>(from);
+  else
+    return ploadu<Packet>(from);
+}
+
+/** \internal copy the packet \a from to \a *to.
+  * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */
+template<typename Scalar, typename Packet, int LoadMode>
+inline void pstoret(Scalar* to, const Packet& from)
+{
+  if(LoadMode == Aligned)
+    pstore(to, from);
+  else
+    pstoreu(to, from);
+}
+
+/** \internal default implementation of palign() allowing partial specialization */
+template<int Offset,typename PacketType>
+struct palign_impl
+{
+  // by default data are aligned, so there is nothing to be done :)
+  static inline void run(PacketType&, const PacketType&) {}
+};
+
+/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements
+  * of \a first and \a Offset first elements of \a second.
+  * 
+  * This function is currently only used to optimize matrix-vector products on unligned matrices.
+  * It takes 2 packets that represent a contiguous memory array, and returns a packet starting
+  * at the position \a Offset. For instance, for packets of 4 elements, we have:
+  *  Input:
+  *  - first = {f0,f1,f2,f3}
+  *  - second = {s0,s1,s2,s3}
+  * Output: 
+  *   - if Offset==0 then {f0,f1,f2,f3}
+  *   - if Offset==1 then {f1,f2,f3,s0}
+  *   - if Offset==2 then {f2,f3,s0,s1}
+  *   - if Offset==3 then {f3,s0,s1,s3}
+  */
+template<int Offset,typename PacketType>
+inline void palign(PacketType& first, const PacketType& second)
+{
+  palign_impl<Offset,PacketType>::run(first,second);
+}
+
+/***************************************************************************
+* Fast complex products (GCC generates a function call which is very slow)
+***************************************************************************/
+
+template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
+{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
+
+template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
+{ return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERIC_PACKET_MATH_H
+
diff --git a/include/eigen3/Eigen/src/Core/GlobalFunctions.h b/include/eigen3/Eigen/src/Core/GlobalFunctions.h
new file mode 100644
index 0000000..2acf977
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/GlobalFunctions.h
@@ -0,0 +1,92 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010-2012 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GLOBAL_FUNCTIONS_H
+#define EIGEN_GLOBAL_FUNCTIONS_H
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR) \
+  template<typename Derived> \
+  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
+  NAME(const Eigen::ArrayBase<Derived>& x) { \
+    return x.derived(); \
+  }
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(NAME,FUNCTOR) \
+  \
+  template<typename Derived> \
+  struct NAME##_retval<ArrayBase<Derived> > \
+  { \
+    typedef const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> type; \
+  }; \
+  template<typename Derived> \
+  struct NAME##_impl<ArrayBase<Derived> > \
+  { \
+    static inline typename NAME##_retval<ArrayBase<Derived> >::type run(const Eigen::ArrayBase<Derived>& x) \
+    { \
+      return x.derived(); \
+    } \
+  };
+
+
+namespace Eigen
+{
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op)
+  
+  template<typename Derived>
+  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar>, const Derived>
+  pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
+    return x.derived().pow(exponent);
+  }
+
+  template<typename Derived>
+  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const Derived, const Derived>
+  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<Derived>& exponents) 
+  {
+    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const Derived, const Derived>(
+      x.derived(),
+      exponents.derived()
+    );
+  }
+  
+  /**
+  * \brief Component-wise division of a scalar by array elements.
+  **/
+  template <typename Derived>
+  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>, const Derived>
+    operator/(const typename Derived::Scalar& s, const Eigen::ArrayBase<Derived>& a)
+  {
+    return Eigen::CwiseUnaryOp<Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>, const Derived>(
+      a.derived(),
+      Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>(s)  
+    );
+  }
+
+  namespace internal
+  {
+    EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(real,scalar_real_op)
+    EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(imag,scalar_imag_op)
+    EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(abs2,scalar_abs2_op)
+  }
+}
+
+// TODO: cleanly disable those functions that are not supported on Array (numext::real_ref, internal::random, internal::isApprox...)
+
+#endif // EIGEN_GLOBAL_FUNCTIONS_H
diff --git a/include/eigen3/Eigen/src/Core/IO.h b/include/eigen3/Eigen/src/Core/IO.h
new file mode 100644
index 0000000..8d4bc59
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/IO.h
@@ -0,0 +1,250 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_IO_H
+#define EIGEN_IO_H
+
+namespace Eigen { 
+
+enum { DontAlignCols = 1 };
+enum { StreamPrecision = -1,
+       FullPrecision = -2 };
+
+namespace internal {
+template<typename Derived>
+std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt);
+}
+
+/** \class IOFormat
+  * \ingroup Core_Module
+  *
+  * \brief Stores a set of parameters controlling the way matrices are printed
+  *
+  * List of available parameters:
+  *  - \b precision number of digits for floating point values, or one of the special constants \c StreamPrecision and \c FullPrecision.
+  *                 The default is the special value \c StreamPrecision which means to use the
+  *                 stream's own precision setting, as set for instance using \c cout.precision(3). The other special value
+  *                 \c FullPrecision means that the number of digits will be computed to match the full precision of each floating-point
+  *                 type.
+  *  - \b flags an OR-ed combination of flags, the default value is 0, the only currently available flag is \c DontAlignCols which
+  *             allows to disable the alignment of columns, resulting in faster code.
+  *  - \b coeffSeparator string printed between two coefficients of the same row
+  *  - \b rowSeparator string printed between two rows
+  *  - \b rowPrefix string printed at the beginning of each row
+  *  - \b rowSuffix string printed at the end of each row
+  *  - \b matPrefix string printed at the beginning of the matrix
+  *  - \b matSuffix string printed at the end of the matrix
+  *
+  * Example: \include IOFormat.cpp
+  * Output: \verbinclude IOFormat.out
+  *
+  * \sa DenseBase::format(), class WithFormat
+  */
+struct IOFormat
+{
+  /** Default contructor, see class IOFormat for the meaning of the parameters */
+  IOFormat(int _precision = StreamPrecision, int _flags = 0,
+    const std::string& _coeffSeparator = " ",
+    const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
+    const std::string& _matPrefix="", const std::string& _matSuffix="")
+  : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
+    rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
+  {
+    int i = int(matSuffix.length())-1;
+    while (i>=0 && matSuffix[i]!='\n')
+    {
+      rowSpacer += ' ';
+      i--;
+    }
+  }
+  std::string matPrefix, matSuffix;
+  std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer;
+  std::string coeffSeparator;
+  int precision;
+  int flags;
+};
+
+/** \class WithFormat
+  * \ingroup Core_Module
+  *
+  * \brief Pseudo expression providing matrix output with given format
+  *
+  * \param ExpressionType the type of the object on which IO stream operations are performed
+  *
+  * This class represents an expression with stream operators controlled by a given IOFormat.
+  * It is the return type of DenseBase::format()
+  * and most of the time this is the only way it is used.
+  *
+  * See class IOFormat for some examples.
+  *
+  * \sa DenseBase::format(), class IOFormat
+  */
+template<typename ExpressionType>
+class WithFormat
+{
+  public:
+
+    WithFormat(const ExpressionType& matrix, const IOFormat& format)
+      : m_matrix(matrix), m_format(format)
+    {}
+
+    friend std::ostream & operator << (std::ostream & s, const WithFormat& wf)
+    {
+      return internal::print_matrix(s, wf.m_matrix.eval(), wf.m_format);
+    }
+
+  protected:
+    const typename ExpressionType::Nested m_matrix;
+    IOFormat m_format;
+};
+
+/** \returns a WithFormat proxy object allowing to print a matrix the with given
+  * format \a fmt.
+  *
+  * See class IOFormat for some examples.
+  *
+  * \sa class IOFormat, class WithFormat
+  */
+template<typename Derived>
+inline const WithFormat<Derived>
+DenseBase<Derived>::format(const IOFormat& fmt) const
+{
+  return WithFormat<Derived>(derived(), fmt);
+}
+
+namespace internal {
+
+template<typename Scalar, bool IsInteger>
+struct significant_decimals_default_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline int run()
+  {
+    using std::ceil;
+    using std::log;
+    return cast<RealScalar,int>(ceil(-log(NumTraits<RealScalar>::epsilon())/log(RealScalar(10))));
+  }
+};
+
+template<typename Scalar>
+struct significant_decimals_default_impl<Scalar, true>
+{
+  static inline int run()
+  {
+    return 0;
+  }
+};
+
+template<typename Scalar>
+struct significant_decimals_impl
+  : significant_decimals_default_impl<Scalar, NumTraits<Scalar>::IsInteger>
+{};
+
+/** \internal
+  * print the matrix \a _m to the output stream \a s using the output format \a fmt */
+template<typename Derived>
+std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt)
+{
+  if(_m.size() == 0)
+  {
+    s << fmt.matPrefix << fmt.matSuffix;
+    return s;
+  }
+  
+  typename Derived::Nested m = _m;
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Index Index;
+
+  Index width = 0;
+
+  std::streamsize explicit_precision;
+  if(fmt.precision == StreamPrecision)
+  {
+    explicit_precision = 0;
+  }
+  else if(fmt.precision == FullPrecision)
+  {
+    if (NumTraits<Scalar>::IsInteger)
+    {
+      explicit_precision = 0;
+    }
+    else
+    {
+      explicit_precision = significant_decimals_impl<Scalar>::run();
+    }
+  }
+  else
+  {
+    explicit_precision = fmt.precision;
+  }
+
+  std::streamsize old_precision = 0;
+  if(explicit_precision) old_precision = s.precision(explicit_precision);
+
+  bool align_cols = !(fmt.flags & DontAlignCols);
+  if(align_cols)
+  {
+    // compute the largest width
+    for(Index j = 0; j < m.cols(); ++j)
+      for(Index i = 0; i < m.rows(); ++i)
+      {
+        std::stringstream sstr;
+        sstr.copyfmt(s);
+        sstr << m.coeff(i,j);
+        width = std::max<Index>(width, Index(sstr.str().length()));
+      }
+  }
+  s << fmt.matPrefix;
+  for(Index i = 0; i < m.rows(); ++i)
+  {
+    if (i)
+      s << fmt.rowSpacer;
+    s << fmt.rowPrefix;
+    if(width) s.width(width);
+    s << m.coeff(i, 0);
+    for(Index j = 1; j < m.cols(); ++j)
+    {
+      s << fmt.coeffSeparator;
+      if (width) s.width(width);
+      s << m.coeff(i, j);
+    }
+    s << fmt.rowSuffix;
+    if( i < m.rows() - 1)
+      s << fmt.rowSeparator;
+  }
+  s << fmt.matSuffix;
+  if(explicit_precision) s.precision(old_precision);
+  return s;
+}
+
+} // end namespace internal
+
+/** \relates DenseBase
+  *
+  * Outputs the matrix, to the given stream.
+  *
+  * If you wish to print the matrix with a format different than the default, use DenseBase::format().
+  *
+  * It is also possible to change the default format by defining EIGEN_DEFAULT_IO_FORMAT before including Eigen headers.
+  * If not defined, this will automatically be defined to Eigen::IOFormat(), that is the Eigen::IOFormat with default parameters.
+  *
+  * \sa DenseBase::format()
+  */
+template<typename Derived>
+std::ostream & operator <<
+(std::ostream & s,
+ const DenseBase<Derived> & m)
+{
+  return internal::print_matrix(s, m.eval(), EIGEN_DEFAULT_IO_FORMAT);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_IO_H
diff --git a/include/eigen3/Eigen/src/Core/Map.h b/include/eigen3/Eigen/src/Core/Map.h
new file mode 100644
index 0000000..f804c89
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Map.h
@@ -0,0 +1,192 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007-2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MAP_H
+#define EIGEN_MAP_H
+
+namespace Eigen { 
+
+/** \class Map
+  * \ingroup Core_Module
+  *
+  * \brief A matrix or vector expression mapping an existing array of data.
+  *
+  * \tparam PlainObjectType the equivalent matrix type of the mapped data
+  * \tparam MapOptions specifies whether the pointer is \c #Aligned, or \c #Unaligned.
+  *                The default is \c #Unaligned.
+  * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout
+  *                   of an ordinary, contiguous array. This can be overridden by specifying strides.
+  *                   The type passed here must be a specialization of the Stride template, see examples below.
+  *
+  * This class represents a matrix or vector expression mapping an existing array of data.
+  * It can be used to let Eigen interface without any overhead with non-Eigen data structures,
+  * such as plain C arrays or structures from other libraries. By default, it assumes that the
+  * data is laid out contiguously in memory. You can however override this by explicitly specifying
+  * inner and outer strides.
+  *
+  * Here's an example of simply mapping a contiguous array as a \ref TopicStorageOrders "column-major" matrix:
+  * \include Map_simple.cpp
+  * Output: \verbinclude Map_simple.out
+  *
+  * If you need to map non-contiguous arrays, you can do so by specifying strides:
+  *
+  * Here's an example of mapping an array as a vector, specifying an inner stride, that is, the pointer
+  * increment between two consecutive coefficients. Here, we're specifying the inner stride as a compile-time
+  * fixed value.
+  * \include Map_inner_stride.cpp
+  * Output: \verbinclude Map_inner_stride.out
+  *
+  * Here's an example of mapping an array while specifying an outer stride. Here, since we're mapping
+  * as a column-major matrix, 'outer stride' means the pointer increment between two consecutive columns.
+  * Here, we're specifying the outer stride as a runtime parameter. Note that here \c OuterStride<> is
+  * a short version of \c OuterStride<Dynamic> because the default template parameter of OuterStride
+  * is  \c Dynamic
+  * \include Map_outer_stride.cpp
+  * Output: \verbinclude Map_outer_stride.out
+  *
+  * For more details and for an example of specifying both an inner and an outer stride, see class Stride.
+  *
+  * \b Tip: to change the array of data mapped by a Map object, you can use the C++
+  * placement new syntax:
+  *
+  * Example: \include Map_placement_new.cpp
+  * Output: \verbinclude Map_placement_new.out
+  *
+  * This class is the return type of PlainObjectBase::Map() but can also be used directly.
+  *
+  * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
+  */
+
+namespace internal {
+template<typename PlainObjectType, int MapOptions, typename StrideType>
+struct traits<Map<PlainObjectType, MapOptions, StrideType> >
+  : public traits<PlainObjectType>
+{
+  typedef traits<PlainObjectType> TraitsBase;
+  typedef typename PlainObjectType::Index Index;
+  typedef typename PlainObjectType::Scalar Scalar;
+  enum {
+    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
+                             ? int(PlainObjectType::InnerStrideAtCompileTime)
+                             : int(StrideType::InnerStrideAtCompileTime),
+    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
+                             ? int(PlainObjectType::OuterStrideAtCompileTime)
+                             : int(StrideType::OuterStrideAtCompileTime),
+    HasNoInnerStride = InnerStrideAtCompileTime == 1,
+    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
+    HasNoStride = HasNoInnerStride && HasNoOuterStride,
+    IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned),
+    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
+    KeepsPacketAccess = bool(HasNoInnerStride)
+                        && ( bool(IsDynamicSize)
+                           || HasNoOuterStride
+                           || ( OuterStrideAtCompileTime!=Dynamic
+                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ),
+    Flags0 = TraitsBase::Flags & (~NestByRefBit),
+    Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
+    Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
+           ? int(Flags1) : int(Flags1 & ~LinearAccessBit),
+    Flags3 = is_lvalue<PlainObjectType>::value ? int(Flags2) : (int(Flags2) & ~LvalueBit),
+    Flags = KeepsPacketAccess ? int(Flags3) : (int(Flags3) & ~PacketAccessBit)
+  };
+private:
+  enum { Options }; // Expressions don't have Options
+};
+}
+
+template<typename PlainObjectType, int MapOptions, typename StrideType> class Map
+  : public MapBase<Map<PlainObjectType, MapOptions, StrideType> >
+{
+  public:
+
+    typedef MapBase<Map> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Map)
+
+    typedef typename Base::PointerType PointerType;
+#if EIGEN2_SUPPORT_STAGE <= STAGE30_FULL_EIGEN3_API
+    typedef const Scalar* PointerArgType;
+    inline PointerType cast_to_pointer_type(PointerArgType ptr) { return const_cast<PointerType>(ptr); }
+#else
+    typedef PointerType PointerArgType;
+    inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
+#endif
+
+    inline Index innerStride() const
+    {
+      return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
+    }
+
+    inline Index outerStride() const
+    {
+      return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
+           : IsVectorAtCompileTime ? this->size()
+           : int(Flags)&RowMajorBit ? this->cols()
+           : this->rows();
+    }
+
+    /** Constructor in the fixed-size case.
+      *
+      * \param dataPtr pointer to the array to map
+      * \param a_stride optional Stride object, passing the strides.
+      */
+    inline Map(PointerArgType dataPtr, const StrideType& a_stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr)), m_stride(a_stride)
+    {
+      PlainObjectType::Base::_check_template_params();
+    }
+
+    /** Constructor in the dynamic-size vector case.
+      *
+      * \param dataPtr pointer to the array to map
+      * \param a_size the size of the vector expression
+      * \param a_stride optional Stride object, passing the strides.
+      */
+    inline Map(PointerArgType dataPtr, Index a_size, const StrideType& a_stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr), a_size), m_stride(a_stride)
+    {
+      PlainObjectType::Base::_check_template_params();
+    }
+
+    /** Constructor in the dynamic-size matrix case.
+      *
+      * \param dataPtr pointer to the array to map
+      * \param nbRows the number of rows of the matrix expression
+      * \param nbCols the number of columns of the matrix expression
+      * \param a_stride optional Stride object, passing the strides.
+      */
+    inline Map(PointerArgType dataPtr, Index nbRows, Index nbCols, const StrideType& a_stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr), nbRows, nbCols), m_stride(a_stride)
+    {
+      PlainObjectType::Base::_check_template_params();
+    }
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
+
+  protected:
+    StrideType m_stride;
+};
+
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+inline Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>
+  ::Array(const Scalar *data)
+{
+  this->_set_noalias(Eigen::Map<const Array>(data));
+}
+
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+inline Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>
+  ::Matrix(const Scalar *data)
+{
+  this->_set_noalias(Eigen::Map<const Matrix>(data));
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MAP_H
diff --git a/include/eigen3/Eigen/src/Core/MapBase.h b/include/eigen3/Eigen/src/Core/MapBase.h
new file mode 100644
index 0000000..cebed2b
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/MapBase.h
@@ -0,0 +1,247 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007-2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MAPBASE_H
+#define EIGEN_MAPBASE_H
+
+#define EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived) \
+      EIGEN_STATIC_ASSERT((int(internal::traits<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
+                          YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT)
+
+namespace Eigen { 
+
+/** \class MapBase
+  * \ingroup Core_Module
+  *
+  * \brief Base class for Map and Block expression with direct access
+  *
+  * \sa class Map, class Block
+  */
+template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
+  : public internal::dense_xpr_base<Derived>::type
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<Derived>::type Base;
+    enum {
+      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+      SizeAtCompileTime = Base::SizeAtCompileTime
+    };
+
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef typename internal::conditional<
+                         bool(internal::is_lvalue<Derived>::value),
+                         Scalar *,
+                         const Scalar *>::type
+                     PointerType;
+
+    using Base::derived;
+//    using Base::RowsAtCompileTime;
+//    using Base::ColsAtCompileTime;
+//    using Base::SizeAtCompileTime;
+    using Base::MaxRowsAtCompileTime;
+    using Base::MaxColsAtCompileTime;
+    using Base::MaxSizeAtCompileTime;
+    using Base::IsVectorAtCompileTime;
+    using Base::Flags;
+    using Base::IsRowMajor;
+
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::coeff;
+    using Base::coeffRef;
+    using Base::lazyAssign;
+    using Base::eval;
+
+    using Base::innerStride;
+    using Base::outerStride;
+    using Base::rowStride;
+    using Base::colStride;
+
+    // bug 217 - compile error on ICC 11.1
+    using Base::operator=;
+
+    typedef typename Base::CoeffReturnType CoeffReturnType;
+
+    inline Index rows() const { return m_rows.value(); }
+    inline Index cols() const { return m_cols.value(); }
+
+    /** Returns a pointer to the first coefficient of the matrix or vector.
+      *
+      * \note When addressing this data, make sure to honor the strides returned by innerStride() and outerStride().
+      *
+      * \sa innerStride(), outerStride()
+      */
+    inline const Scalar* data() const { return m_data; }
+
+    inline const Scalar& coeff(Index rowId, Index colId) const
+    {
+      return m_data[colId * colStride() + rowId * rowStride()];
+    }
+
+    inline const Scalar& coeff(Index index) const
+    {
+      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+      return m_data[index * innerStride()];
+    }
+
+    inline const Scalar& coeffRef(Index rowId, Index colId) const
+    {
+      return this->m_data[colId * colStride() + rowId * rowStride()];
+    }
+
+    inline const Scalar& coeffRef(Index index) const
+    {
+      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+      return this->m_data[index * innerStride()];
+    }
+
+    template<int LoadMode>
+    inline PacketScalar packet(Index rowId, Index colId) const
+    {
+      return internal::ploadt<PacketScalar, LoadMode>
+               (m_data + (colId * colStride() + rowId * rowStride()));
+    }
+
+    template<int LoadMode>
+    inline PacketScalar packet(Index index) const
+    {
+      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+      return internal::ploadt<PacketScalar, LoadMode>(m_data + index * innerStride());
+    }
+
+    inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
+    {
+      EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
+      checkSanity();
+    }
+
+    inline MapBase(PointerType dataPtr, Index vecSize)
+            : m_data(dataPtr),
+              m_rows(RowsAtCompileTime == Dynamic ? vecSize : Index(RowsAtCompileTime)),
+              m_cols(ColsAtCompileTime == Dynamic ? vecSize : Index(ColsAtCompileTime))
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+      eigen_assert(vecSize >= 0);
+      eigen_assert(dataPtr == 0 || SizeAtCompileTime == Dynamic || SizeAtCompileTime == vecSize);
+      checkSanity();
+    }
+
+    inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols)
+            : m_data(dataPtr), m_rows(nbRows), m_cols(nbCols)
+    {
+      eigen_assert( (dataPtr == 0)
+              || (   nbRows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == nbRows)
+                  && nbCols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols)));
+      checkSanity();
+    }
+
+  protected:
+
+    void checkSanity() const
+    {
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
+                                        internal::inner_stride_at_compile_time<Derived>::ret==1),
+                          PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
+      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0)
+                   && "data is not aligned");
+    }
+
+    PointerType m_data;
+    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_rows;
+    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols;
+};
+
+template<typename Derived> class MapBase<Derived, WriteAccessors>
+  : public MapBase<Derived, ReadOnlyAccessors>
+{
+    typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
+  public:
+
+    typedef MapBase<Derived, ReadOnlyAccessors> Base;
+
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::PacketScalar PacketScalar;
+    typedef typename Base::Index Index;
+    typedef typename Base::PointerType PointerType;
+
+    using Base::derived;
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::coeff;
+    using Base::coeffRef;
+
+    using Base::innerStride;
+    using Base::outerStride;
+    using Base::rowStride;
+    using Base::colStride;
+
+    typedef typename internal::conditional<
+                    internal::is_lvalue<Derived>::value,
+                    Scalar,
+                    const Scalar
+                  >::type ScalarWithConstIfNotLvalue;
+
+    inline const Scalar* data() const { return this->m_data; }
+    inline ScalarWithConstIfNotLvalue* data() { return this->m_data; } // no const-cast here so non-const-correct code will give a compile error
+
+    inline ScalarWithConstIfNotLvalue& coeffRef(Index row, Index col)
+    {
+      return this->m_data[col * colStride() + row * rowStride()];
+    }
+
+    inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
+    {
+      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+      return this->m_data[index * innerStride()];
+    }
+
+    template<int StoreMode>
+    inline void writePacket(Index row, Index col, const PacketScalar& val)
+    {
+      internal::pstoret<Scalar, PacketScalar, StoreMode>
+               (this->m_data + (col * colStride() + row * rowStride()), val);
+    }
+
+    template<int StoreMode>
+    inline void writePacket(Index index, const PacketScalar& val)
+    {
+      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+      internal::pstoret<Scalar, PacketScalar, StoreMode>
+                (this->m_data + index * innerStride(), val);
+    }
+
+    explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
+    inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
+    inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols) : Base(dataPtr, nbRows, nbCols) {}
+
+    Derived& operator=(const MapBase& other)
+    {
+      ReadOnlyMapBase::Base::operator=(other);
+      return derived();
+    }
+
+    // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base,
+    // see bugs 821 and 920.
+    using ReadOnlyMapBase::Base::operator=;
+};
+
+#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
+
+} // end namespace Eigen
+
+#endif // EIGEN_MAPBASE_H
diff --git a/include/eigen3/Eigen/src/Core/MathFunctions.h b/include/eigen3/Eigen/src/Core/MathFunctions.h
new file mode 100644
index 0000000..2bfc5eb
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/MathFunctions.h
@@ -0,0 +1,768 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATHFUNCTIONS_H
+#define EIGEN_MATHFUNCTIONS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \struct global_math_functions_filtering_base
+  *
+  * What it does:
+  * Defines a typedef 'type' as follows:
+  * - if type T has a member typedef Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl, then
+  *   global_math_functions_filtering_base<T>::type is a typedef for it.
+  * - otherwise, global_math_functions_filtering_base<T>::type is a typedef for T.
+  *
+  * How it's used:
+  * To allow to defined the global math functions (like sin...) in certain cases, like the Array expressions.
+  * When you do sin(array1+array2), the object array1+array2 has a complicated expression type, all what you want to know
+  * is that it inherits ArrayBase. So we implement a partial specialization of sin_impl for ArrayBase<Derived>.
+  * So we must make sure to use sin_impl<ArrayBase<Derived> > and not sin_impl<Derived>, otherwise our partial specialization
+  * won't be used. How does sin know that? That's exactly what global_math_functions_filtering_base tells it.
+  *
+  * How it's implemented:
+  * SFINAE in the style of enable_if. Highly susceptible of breaking compilers. With GCC, it sure does work, but if you replace
+  * the typename dummy by an integer template parameter, it doesn't work anymore!
+  */
+
+template<typename T, typename dummy = void>
+struct global_math_functions_filtering_base
+{
+  typedef T type;
+};
+
+template<typename T> struct always_void { typedef void type; };
+
+template<typename T>
+struct global_math_functions_filtering_base
+  <T,
+   typename always_void<typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl>::type
+  >
+{
+  typedef typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl type;
+};
+
+#define EIGEN_MATHFUNC_IMPL(func, scalar) Eigen::internal::func##_impl<typename Eigen::internal::global_math_functions_filtering_base<scalar>::type>
+#define EIGEN_MATHFUNC_RETVAL(func, scalar) typename Eigen::internal::func##_retval<typename Eigen::internal::global_math_functions_filtering_base<scalar>::type>::type
+
+/****************************************************************************
+* Implementation of real                                                 *
+****************************************************************************/
+
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct real_default_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline RealScalar run(const Scalar& x)
+  {
+    return x;
+  }
+};
+
+template<typename Scalar>
+struct real_default_impl<Scalar,true>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline RealScalar run(const Scalar& x)
+  {
+    using std::real;
+    return real(x);
+  }
+};
+
+template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
+
+template<typename Scalar>
+struct real_retval
+{
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+
+/****************************************************************************
+* Implementation of imag                                                 *
+****************************************************************************/
+
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct imag_default_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline RealScalar run(const Scalar&)
+  {
+    return RealScalar(0);
+  }
+};
+
+template<typename Scalar>
+struct imag_default_impl<Scalar,true>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline RealScalar run(const Scalar& x)
+  {
+    using std::imag;
+    return imag(x);
+  }
+};
+
+template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
+
+template<typename Scalar>
+struct imag_retval
+{
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of real_ref                                             *
+****************************************************************************/
+
+template<typename Scalar>
+struct real_ref_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline RealScalar& run(Scalar& x)
+  {
+    return reinterpret_cast<RealScalar*>(&x)[0];
+  }
+  static inline const RealScalar& run(const Scalar& x)
+  {
+    return reinterpret_cast<const RealScalar*>(&x)[0];
+  }
+};
+
+template<typename Scalar>
+struct real_ref_retval
+{
+  typedef typename NumTraits<Scalar>::Real & type;
+};
+
+/****************************************************************************
+* Implementation of imag_ref                                             *
+****************************************************************************/
+
+template<typename Scalar, bool IsComplex>
+struct imag_ref_default_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline RealScalar& run(Scalar& x)
+  {
+    return reinterpret_cast<RealScalar*>(&x)[1];
+  }
+  static inline const RealScalar& run(const Scalar& x)
+  {
+    return reinterpret_cast<RealScalar*>(&x)[1];
+  }
+};
+
+template<typename Scalar>
+struct imag_ref_default_impl<Scalar, false>
+{
+  static inline Scalar run(Scalar&)
+  {
+    return Scalar(0);
+  }
+  static inline const Scalar run(const Scalar&)
+  {
+    return Scalar(0);
+  }
+};
+
+template<typename Scalar>
+struct imag_ref_impl : imag_ref_default_impl<Scalar, NumTraits<Scalar>::IsComplex> {};
+
+template<typename Scalar>
+struct imag_ref_retval
+{
+  typedef typename NumTraits<Scalar>::Real & type;
+};
+
+/****************************************************************************
+* Implementation of conj                                                 *
+****************************************************************************/
+
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct conj_impl
+{
+  static inline Scalar run(const Scalar& x)
+  {
+    return x;
+  }
+};
+
+template<typename Scalar>
+struct conj_impl<Scalar,true>
+{
+  static inline Scalar run(const Scalar& x)
+  {
+    using std::conj;
+    return conj(x);
+  }
+};
+
+template<typename Scalar>
+struct conj_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of abs2                                                 *
+****************************************************************************/
+
+template<typename Scalar>
+struct abs2_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline RealScalar run(const Scalar& x)
+  {
+    return x*x;
+  }
+};
+
+template<typename RealScalar>
+struct abs2_impl<std::complex<RealScalar> >
+{
+  static inline RealScalar run(const std::complex<RealScalar>& x)
+  {
+    return real(x)*real(x) + imag(x)*imag(x);
+  }
+};
+
+template<typename Scalar>
+struct abs2_retval
+{
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of norm1                                                *
+****************************************************************************/
+
+template<typename Scalar, bool IsComplex>
+struct norm1_default_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline RealScalar run(const Scalar& x)
+  {
+    using std::abs;
+    return abs(real(x)) + abs(imag(x));
+  }
+};
+
+template<typename Scalar>
+struct norm1_default_impl<Scalar, false>
+{
+  static inline Scalar run(const Scalar& x)
+  {
+    using std::abs;
+    return abs(x);
+  }
+};
+
+template<typename Scalar>
+struct norm1_impl : norm1_default_impl<Scalar, NumTraits<Scalar>::IsComplex> {};
+
+template<typename Scalar>
+struct norm1_retval
+{
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of hypot                                                *
+****************************************************************************/
+
+template<typename Scalar>
+struct hypot_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline RealScalar run(const Scalar& x, const Scalar& y)
+  {
+    using std::max;
+    using std::min;
+    using std::abs;
+    using std::sqrt;
+    RealScalar _x = abs(x);
+    RealScalar _y = abs(y);
+    RealScalar p = (max)(_x, _y);
+    if(p==RealScalar(0)) return 0;
+    RealScalar q = (min)(_x, _y);
+    RealScalar qp = q/p;
+    return p * sqrt(RealScalar(1) + qp*qp);
+  }
+};
+
+template<typename Scalar>
+struct hypot_retval
+{
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of cast                                                 *
+****************************************************************************/
+
+template<typename OldType, typename NewType>
+struct cast_impl
+{
+  static inline NewType run(const OldType& x)
+  {
+    return static_cast<NewType>(x);
+  }
+};
+
+// here, for once, we're plainly returning NewType: we don't want cast to do weird things.
+
+template<typename OldType, typename NewType>
+inline NewType cast(const OldType& x)
+{
+  return cast_impl<OldType, NewType>::run(x);
+}
+
+/****************************************************************************
+* Implementation of atanh2                                                *
+****************************************************************************/
+
+template<typename Scalar, bool IsInteger>
+struct atanh2_default_impl
+{
+  typedef Scalar retval;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline Scalar run(const Scalar& x, const Scalar& y)
+  {
+    using std::abs;
+    using std::log;
+    using std::sqrt;
+    Scalar z = x / y;
+    if (y == Scalar(0) || abs(z) > sqrt(NumTraits<RealScalar>::epsilon()))
+      return RealScalar(0.5) * log((y + x) / (y - x));
+    else
+      return z + z*z*z / RealScalar(3);
+  }
+};
+
+template<typename Scalar>
+struct atanh2_default_impl<Scalar, true>
+{
+  static inline Scalar run(const Scalar&, const Scalar&)
+  {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    return Scalar(0);
+  }
+};
+
+template<typename Scalar>
+struct atanh2_impl : atanh2_default_impl<Scalar, NumTraits<Scalar>::IsInteger> {};
+
+template<typename Scalar>
+struct atanh2_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of pow                                                  *
+****************************************************************************/
+
+template<typename Scalar, bool IsInteger>
+struct pow_default_impl
+{
+  typedef Scalar retval;
+  static inline Scalar run(const Scalar& x, const Scalar& y)
+  {
+    using std::pow;
+    return pow(x, y);
+  }
+};
+
+template<typename Scalar>
+struct pow_default_impl<Scalar, true>
+{
+  static inline Scalar run(Scalar x, Scalar y)
+  {
+    Scalar res(1);
+    eigen_assert(!NumTraits<Scalar>::IsSigned || y >= 0);
+    if(y & 1) res *= x;
+    y >>= 1;
+    while(y)
+    {
+      x *= x;
+      if(y&1) res *= x;
+      y >>= 1;
+    }
+    return res;
+  }
+};
+
+template<typename Scalar>
+struct pow_impl : pow_default_impl<Scalar, NumTraits<Scalar>::IsInteger> {};
+
+template<typename Scalar>
+struct pow_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of random                                               *
+****************************************************************************/
+
+template<typename Scalar,
+         bool IsComplex,
+         bool IsInteger>
+struct random_default_impl {};
+
+template<typename Scalar>
+struct random_impl : random_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};
+
+template<typename Scalar>
+struct random_retval
+{
+  typedef Scalar type;
+};
+
+template<typename Scalar> inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y);
+template<typename Scalar> inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random();
+
+template<typename Scalar>
+struct random_default_impl<Scalar, false, false>
+{
+  static inline Scalar run(const Scalar& x, const Scalar& y)
+  {
+    return x + (y-x) * Scalar(std::rand()) / Scalar(RAND_MAX);
+  }
+  static inline Scalar run()
+  {
+    return run(Scalar(NumTraits<Scalar>::IsSigned ? -1 : 0), Scalar(1));
+  }
+};
+
+enum {
+  floor_log2_terminate,
+  floor_log2_move_up,
+  floor_log2_move_down,
+  floor_log2_bogus
+};
+
+template<unsigned int n, int lower, int upper> struct floor_log2_selector
+{
+  enum { middle = (lower + upper) / 2,
+         value = (upper <= lower + 1) ? int(floor_log2_terminate)
+               : (n < (1 << middle)) ? int(floor_log2_move_down)
+               : (n==0) ? int(floor_log2_bogus)
+               : int(floor_log2_move_up)
+  };
+};
+
+template<unsigned int n,
+         int lower = 0,
+         int upper = sizeof(unsigned int) * CHAR_BIT - 1,
+         int selector = floor_log2_selector<n, lower, upper>::value>
+struct floor_log2 {};
+
+template<unsigned int n, int lower, int upper>
+struct floor_log2<n, lower, upper, floor_log2_move_down>
+{
+  enum { value = floor_log2<n, lower, floor_log2_selector<n, lower, upper>::middle>::value };
+};
+
+template<unsigned int n, int lower, int upper>
+struct floor_log2<n, lower, upper, floor_log2_move_up>
+{
+  enum { value = floor_log2<n, floor_log2_selector<n, lower, upper>::middle, upper>::value };
+};
+
+template<unsigned int n, int lower, int upper>
+struct floor_log2<n, lower, upper, floor_log2_terminate>
+{
+  enum { value = (n >= ((unsigned int)(1) << (lower+1))) ? lower+1 : lower };
+};
+
+template<unsigned int n, int lower, int upper>
+struct floor_log2<n, lower, upper, floor_log2_bogus>
+{
+  // no value, error at compile time
+};
+
+template<typename Scalar>
+struct random_default_impl<Scalar, false, true>
+{
+  typedef typename NumTraits<Scalar>::NonInteger NonInteger;
+
+  static inline Scalar run(const Scalar& x, const Scalar& y)
+  {
+    return x + Scalar((NonInteger(y)-x+1) * std::rand() / (RAND_MAX + NonInteger(1)));
+  }
+
+  static inline Scalar run()
+  {
+#ifdef EIGEN_MAKING_DOCS
+    return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
+#else
+    enum { rand_bits = floor_log2<(unsigned int)(RAND_MAX)+1>::value,
+           scalar_bits = sizeof(Scalar) * CHAR_BIT,
+           shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)),
+           offset = NumTraits<Scalar>::IsSigned ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0
+    };
+    return Scalar((std::rand() >> shift) - offset);
+#endif
+  }
+};
+
+template<typename Scalar>
+struct random_default_impl<Scalar, true, false>
+{
+  static inline Scalar run(const Scalar& x, const Scalar& y)
+  {
+    return Scalar(random(real(x), real(y)),
+                  random(imag(x), imag(y)));
+  }
+  static inline Scalar run()
+  {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    return Scalar(random<RealScalar>(), random<RealScalar>());
+  }
+};
+
+template<typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y)
+{
+  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(x, y);
+}
+
+template<typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
+{
+  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
+}
+
+} // end namespace internal
+
+/****************************************************************************
+* Generic math function                                                    *
+****************************************************************************/
+
+namespace numext {
+
+template<typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);
+}  
+
+template<typename Scalar>
+inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) >::type real_ref(const Scalar& x)
+{
+  return internal::real_ref_impl<Scalar>::run(x);
+}
+
+template<typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) real_ref(Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x);
+}
+
+template<typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
+}
+
+template<typename Scalar>
+inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) >::type imag_ref(const Scalar& x)
+{
+  return internal::imag_ref_impl<Scalar>::run(x);
+}
+
+template<typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) imag_ref(Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(imag_ref, Scalar)::run(x);
+}
+
+template<typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(conj, Scalar) conj(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x);
+}
+
+template<typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
+}
+
+template<typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(norm1, Scalar)::run(x);
+}
+
+template<typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& y)
+{
+  return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
+}
+
+template<typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y)
+{
+  return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y);
+}
+
+template<typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y)
+{
+  return EIGEN_MATHFUNC_IMPL(pow, Scalar)::run(x, y);
+}
+
+// std::isfinite is non standard, so let's define our own version,
+// even though it is not very efficient.
+template<typename T> bool (isfinite)(const T& x)
+{
+  return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
+}
+
+} // end namespace numext
+
+namespace internal {
+
+/****************************************************************************
+* Implementation of fuzzy comparisons                                       *
+****************************************************************************/
+
+template<typename Scalar,
+         bool IsComplex,
+         bool IsInteger>
+struct scalar_fuzzy_default_impl {};
+
+template<typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, false, false>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template<typename OtherScalar>
+  static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
+  {
+    using std::abs;
+    return abs(x) <= abs(y) * prec;
+  }
+  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
+  {
+    using std::min;
+    using std::abs;
+    return abs(x - y) <= (min)(abs(x), abs(y)) * prec;
+  }
+  static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec)
+  {
+    return x <= y || isApprox(x, y, prec);
+  }
+};
+
+template<typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, false, true>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template<typename OtherScalar>
+  static inline bool isMuchSmallerThan(const Scalar& x, const Scalar&, const RealScalar&)
+  {
+    return x == Scalar(0);
+  }
+  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar&)
+  {
+    return x == y;
+  }
+  static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar&)
+  {
+    return x <= y;
+  }
+};
+
+template<typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, true, false>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template<typename OtherScalar>
+  static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
+  {
+    return numext::abs2(x) <= numext::abs2(y) * prec * prec;
+  }
+  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
+  {
+    using std::min;
+    return numext::abs2(x - y) <= (min)(numext::abs2(x), numext::abs2(y)) * prec * prec;
+  }
+};
+
+template<typename Scalar>
+struct scalar_fuzzy_impl : scalar_fuzzy_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};
+
+template<typename Scalar, typename OtherScalar>
+inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
+                                   typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+{
+  return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(x, y, precision);
+}
+
+template<typename Scalar>
+inline bool isApprox(const Scalar& x, const Scalar& y,
+                          typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+{
+  return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
+}
+
+template<typename Scalar>
+inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y,
+                                    typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+{
+  return scalar_fuzzy_impl<Scalar>::isApproxOrLessThan(x, y, precision);
+}
+
+/******************************************
+***  The special case of the  bool type ***
+******************************************/
+
+template<> struct random_impl<bool>
+{
+  static inline bool run()
+  {
+    return random<int>(0,1)==0 ? false : true;
+  }
+};
+
+template<> struct scalar_fuzzy_impl<bool>
+{
+  typedef bool RealScalar;
+  
+  template<typename OtherScalar>
+  static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&)
+  {
+    return !x;
+  }
+  
+  static inline bool isApprox(bool x, bool y, bool)
+  {
+    return x == y;
+  }
+
+  static inline bool isApproxOrLessThan(const bool& x, const bool& y, const bool&)
+  {
+    return (!x) || y;
+  }
+  
+};
+
+  
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATHFUNCTIONS_H
diff --git a/include/eigen3/Eigen/src/Core/Matrix.h b/include/eigen3/Eigen/src/Core/Matrix.h
new file mode 100644
index 0000000..d7d0b5b
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Matrix.h
@@ -0,0 +1,405 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_H
+#define EIGEN_MATRIX_H
+
+namespace Eigen {
+
+/** \class Matrix
+  * \ingroup Core_Module
+  *
+  * \brief The matrix class, also used for vectors and row-vectors
+  *
+  * The %Matrix class is the work-horse for all \em dense (\ref dense "note") matrices and vectors within Eigen.
+  * Vectors are matrices with one column, and row-vectors are matrices with one row.
+  *
+  * The %Matrix class encompasses \em both fixed-size and dynamic-size objects (\ref fixedsize "note").
+  *
+  * The first three template parameters are required:
+  * \tparam _Scalar \anchor matrix_tparam_scalar Numeric type, e.g. float, double, int or std::complex<float>.
+  *                 User defined sclar types are supported as well (see \ref user_defined_scalars "here").
+  * \tparam _Rows Number of rows, or \b Dynamic
+  * \tparam _Cols Number of columns, or \b Dynamic
+  *
+  * The remaining template parameters are optional -- in most cases you don't have to worry about them.
+  * \tparam _Options \anchor matrix_tparam_options A combination of either \b #RowMajor or \b #ColMajor, and of either
+  *                 \b #AutoAlign or \b #DontAlign.
+  *                 The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required
+  *                 for vectorization. It defaults to aligning matrices except for fixed sizes that aren't a multiple of the packet size.
+  * \tparam _MaxRows Maximum number of rows. Defaults to \a _Rows (\ref maxrows "note").
+  * \tparam _MaxCols Maximum number of columns. Defaults to \a _Cols (\ref maxrows "note").
+  *
+  * Eigen provides a number of typedefs covering the usual cases. Here are some examples:
+  *
+  * \li \c Matrix2d is a 2x2 square matrix of doubles (\c Matrix<double, 2, 2>)
+  * \li \c Vector4f is a vector of 4 floats (\c Matrix<float, 4, 1>)
+  * \li \c RowVector3i is a row-vector of 3 ints (\c Matrix<int, 1, 3>)
+  *
+  * \li \c MatrixXf is a dynamic-size matrix of floats (\c Matrix<float, Dynamic, Dynamic>)
+  * \li \c VectorXf is a dynamic-size vector of floats (\c Matrix<float, Dynamic, 1>)
+  *
+  * \li \c Matrix2Xf is a partially fixed-size (dynamic-size) matrix of floats (\c Matrix<float, 2, Dynamic>)
+  * \li \c MatrixX3d is a partially dynamic-size (fixed-size) matrix of double (\c Matrix<double, Dynamic, 3>)
+  *
+  * See \link matrixtypedefs this page \endlink for a complete list of predefined \em %Matrix and \em Vector typedefs.
+  *
+  * You can access elements of vectors and matrices using normal subscripting:
+  *
+  * \code
+  * Eigen::VectorXd v(10);
+  * v[0] = 0.1;
+  * v[1] = 0.2;
+  * v(0) = 0.3;
+  * v(1) = 0.4;
+  *
+  * Eigen::MatrixXi m(10, 10);
+  * m(0, 1) = 1;
+  * m(0, 2) = 2;
+  * m(0, 3) = 3;
+  * \endcode
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIX_PLUGIN.
+  *
+  * <i><b>Some notes:</b></i>
+  *
+  * <dl>
+  * <dt><b>\anchor dense Dense versus sparse:</b></dt>
+  * <dd>This %Matrix class handles dense, not sparse matrices and vectors. For sparse matrices and vectors, see the Sparse module.
+  *
+  * Dense matrices and vectors are plain usual arrays of coefficients. All the coefficients are stored, in an ordinary contiguous array.
+  * This is unlike Sparse matrices and vectors where the coefficients are stored as a list of nonzero coefficients.</dd>
+  *
+  * <dt><b>\anchor fixedsize Fixed-size versus dynamic-size:</b></dt>
+  * <dd>Fixed-size means that the numbers of rows and columns are known are compile-time. In this case, Eigen allocates the array
+  * of coefficients as a fixed-size array, as a class member. This makes sense for very small matrices, typically up to 4x4, sometimes up
+  * to 16x16. Larger matrices should be declared as dynamic-size even if one happens to know their size at compile-time.
+  *
+  * Dynamic-size means that the numbers of rows or columns are not necessarily known at compile-time. In this case they are runtime
+  * variables, and the array of coefficients is allocated dynamically on the heap.
+  *
+  * Note that \em dense matrices, be they Fixed-size or Dynamic-size, <em>do not</em> expand dynamically in the sense of a std::map.
+  * If you want this behavior, see the Sparse module.</dd>
+  *
+  * <dt><b>\anchor maxrows _MaxRows and _MaxCols:</b></dt>
+  * <dd>In most cases, one just leaves these parameters to the default values.
+  * These parameters mean the maximum size of rows and columns that the matrix may have. They are useful in cases
+  * when the exact numbers of rows and columns are not known are compile-time, but it is known at compile-time that they cannot
+  * exceed a certain value. This happens when taking dynamic-size blocks inside fixed-size matrices: in this case _MaxRows and _MaxCols
+  * are the dimensions of the original matrix, while _Rows and _Cols are Dynamic.</dd>
+  * </dl>
+  *
+  * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, 
+  * \ref TopicStorageOrders 
+  */
+
+namespace internal {
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+  typedef _Scalar Scalar;
+  typedef Dense StorageKind;
+  typedef DenseIndex Index;
+  typedef MatrixXpr XprKind;
+  enum {
+    RowsAtCompileTime = _Rows,
+    ColsAtCompileTime = _Cols,
+    MaxRowsAtCompileTime = _MaxRows,
+    MaxColsAtCompileTime = _MaxCols,
+    Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Options = _Options,
+    InnerStrideAtCompileTime = 1,
+    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime
+  };
+};
+}
+
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+class Matrix
+  : public PlainObjectBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+  public:
+
+    /** \brief Base class typedef.
+      * \sa PlainObjectBase
+      */
+    typedef PlainObjectBase<Matrix> Base;
+
+    enum { Options = _Options };
+
+    EIGEN_DENSE_PUBLIC_INTERFACE(Matrix)
+
+    typedef typename Base::PlainObject PlainObject;
+
+    using Base::base;
+    using Base::coeffRef;
+
+    /**
+      * \brief Assigns matrices to each other.
+      *
+      * \note This is a special case of the templated operator=. Its purpose is
+      * to prevent a default operator= from hiding the templated operator=.
+      *
+      * \callgraph
+      */
+    EIGEN_STRONG_INLINE Matrix& operator=(const Matrix& other)
+    {
+      return Base::_set(other);
+    }
+
+    /** \internal
+      * \brief Copies the value of the expression \a other into \c *this with automatic resizing.
+      *
+      * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
+      * it will be initialized.
+      *
+      * Note that copying a row-vector into a vector (and conversely) is allowed.
+      * The resizing, if any, is then done in the appropriate way so that row-vectors
+      * remain row-vectors and vectors remain vectors.
+      */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Matrix& operator=(const MatrixBase<OtherDerived>& other)
+    {
+      return Base::_set(other);
+    }
+
+    /* Here, doxygen failed to copy the brief information when using \copydoc */
+
+    /**
+      * \brief Copies the generic expression \a other into *this.
+      * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
+      */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Matrix& operator=(const EigenBase<OtherDerived> &other)
+    {
+      return Base::operator=(other);
+    }
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Matrix& operator=(const ReturnByValue<OtherDerived>& func)
+    {
+      return Base::operator=(func);
+    }
+
+    /** \brief Default constructor.
+      *
+      * For fixed-size matrices, does nothing.
+      *
+      * For dynamic-size matrices, creates an empty matrix of size 0. Does not allocate any array. Such a matrix
+      * is called a null matrix. This constructor is the unique way to create null matrices: resizing
+      * a matrix to 0 is not supported.
+      *
+      * \sa resize(Index,Index)
+      */
+    EIGEN_STRONG_INLINE Matrix() : Base()
+    {
+      Base::_check_template_params();
+      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+
+    // FIXME is it still needed
+    Matrix(internal::constructor_without_unaligned_array_assert)
+      : Base(internal::constructor_without_unaligned_array_assert())
+    { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
+
+    /** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
+      *
+      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
+      * it is redundant to pass the dimension here, so it makes more sense to use the default
+      * constructor Matrix() instead.
+      */
+    EIGEN_STRONG_INLINE explicit Matrix(Index dim)
+      : Base(dim, RowsAtCompileTime == 1 ? 1 : dim, ColsAtCompileTime == 1 ? 1 : dim)
+    {
+      Base::_check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Matrix)
+      eigen_assert(dim >= 0);
+      eigen_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
+      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename T0, typename T1>
+    EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y)
+    {
+      Base::_check_template_params();
+      Base::template _init2<T0,T1>(x, y);
+    }
+    #else
+    /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
+      *
+      * This is useful for dynamic-size matrices. For fixed-size matrices,
+      * it is redundant to pass these parameters, so one should use the default constructor
+      * Matrix() instead. */
+    Matrix(Index rows, Index cols);
+    /** \brief Constructs an initialized 2D vector with given coefficients */
+    Matrix(const Scalar& x, const Scalar& y);
+    #endif
+
+    /** \brief Constructs an initialized 3D vector with given coefficients */
+    EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
+    {
+      Base::_check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 3)
+      m_storage.data()[0] = x;
+      m_storage.data()[1] = y;
+      m_storage.data()[2] = z;
+    }
+    /** \brief Constructs an initialized 4D vector with given coefficients */
+    EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
+    {
+      Base::_check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 4)
+      m_storage.data()[0] = x;
+      m_storage.data()[1] = y;
+      m_storage.data()[2] = z;
+      m_storage.data()[3] = w;
+    }
+
+    explicit Matrix(const Scalar *data);
+
+    /** \brief Constructor copying the value of the expression \a other */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Matrix(const MatrixBase<OtherDerived>& other)
+             : Base(other.rows() * other.cols(), other.rows(), other.cols())
+    {
+      // This test resides here, to bring the error messages closer to the user. Normally, these checks
+      // are performed deeply within the library, thus causing long and scary error traces.
+      EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+      Base::_check_template_params();
+      Base::_set_noalias(other);
+    }
+    /** \brief Copy constructor */
+    EIGEN_STRONG_INLINE Matrix(const Matrix& other)
+            : Base(other.rows() * other.cols(), other.rows(), other.cols())
+    {
+      Base::_check_template_params();
+      Base::_set_noalias(other);
+    }
+    /** \brief Copy constructor with in-place evaluation */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Matrix(const ReturnByValue<OtherDerived>& other)
+    {
+      Base::_check_template_params();
+      Base::resize(other.rows(), other.cols());
+      other.evalTo(*this);
+    }
+
+    /** \brief Copy constructor for generic expressions.
+      * \sa MatrixBase::operator=(const EigenBase<OtherDerived>&)
+      */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived> &other)
+      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
+    {
+      Base::_check_template_params();
+      Base::_resize_to_match(other);
+      // FIXME/CHECK: isn't *this = other.derived() more efficient. it allows to
+      //              go for pure _set() implementations, right?
+      *this = other;
+    }
+
+    /** \internal
+      * \brief Override MatrixBase::swap() since for dynamic-sized matrices
+      * of same type it is enough to swap the data pointers.
+      */
+    template<typename OtherDerived>
+    void swap(MatrixBase<OtherDerived> const & other)
+    { this->_swap(other.derived()); }
+
+    inline Index innerStride() const { return 1; }
+    inline Index outerStride() const { return this->innerSize(); }
+
+    /////////// Geometry module ///////////
+
+    template<typename OtherDerived>
+    explicit Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
+    template<typename OtherDerived>
+    Matrix& operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
+
+    #ifdef EIGEN2_SUPPORT
+    template<typename OtherDerived>
+    explicit Matrix(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
+    template<typename OtherDerived>
+    Matrix& operator=(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
+    #endif
+
+    // allow to extend Matrix outside Eigen
+    #ifdef EIGEN_MATRIX_PLUGIN
+    #include EIGEN_MATRIX_PLUGIN
+    #endif
+
+  protected:
+    template <typename Derived, typename OtherDerived, bool IsVector>
+    friend struct internal::conservative_resize_like_impl;
+
+    using Base::m_storage;
+};
+
+/** \defgroup matrixtypedefs Global matrix typedefs
+  *
+  * \ingroup Core_Module
+  *
+  * Eigen defines several typedef shortcuts for most common matrix and vector types.
+  *
+  * The general patterns are the following:
+  *
+  * \c MatrixSizeType where \c Size can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size,
+  * and where \c Type can be \c i for integer, \c f for float, \c d for double, \c cf for complex float, \c cd
+  * for complex double.
+  *
+  * For example, \c Matrix3d is a fixed-size 3x3 matrix type of doubles, and \c MatrixXf is a dynamic-size matrix of floats.
+  *
+  * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is
+  * a fixed-size vector of 4 complex floats.
+  *
+  * \sa class Matrix
+  */
+
+#define EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix)   \
+/** \ingroup matrixtypedefs */                                    \
+typedef Matrix<Type, Size, Size> Matrix##SizeSuffix##TypeSuffix;  \
+/** \ingroup matrixtypedefs */                                    \
+typedef Matrix<Type, Size, 1>    Vector##SizeSuffix##TypeSuffix;  \
+/** \ingroup matrixtypedefs */                                    \
+typedef Matrix<Type, 1, Size>    RowVector##SizeSuffix##TypeSuffix;
+
+#define EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, Size)         \
+/** \ingroup matrixtypedefs */                                    \
+typedef Matrix<Type, Size, Dynamic> Matrix##Size##X##TypeSuffix;  \
+/** \ingroup matrixtypedefs */                                    \
+typedef Matrix<Type, Dynamic, Size> Matrix##X##Size##TypeSuffix;
+
+#define EIGEN_MAKE_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \
+EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 2, 2) \
+EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 3, 3) \
+EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 4, 4) \
+EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Dynamic, X) \
+EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 2) \
+EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 3) \
+EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 4)
+
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(int,                  i)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(float,                f)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(double,               d)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex<float>,  cf)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
+
+#undef EIGEN_MAKE_TYPEDEFS_ALL_SIZES
+#undef EIGEN_MAKE_TYPEDEFS
+#undef EIGEN_MAKE_FIXED_TYPEDEFS
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_H
diff --git a/include/eigen3/Eigen/src/Core/MatrixBase.h b/include/eigen3/Eigen/src/Core/MatrixBase.h
new file mode 100644
index 0000000..cc32797
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/MatrixBase.h
@@ -0,0 +1,556 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIXBASE_H
+#define EIGEN_MATRIXBASE_H
+
+namespace Eigen {
+
+/** \class MatrixBase
+  * \ingroup Core_Module
+  *
+  * \brief Base class for all dense matrices, vectors, and expressions
+  *
+  * This class is the base that is inherited by all matrix, vector, and related expression
+  * types. Most of the Eigen API is contained in this class, and its base classes. Other important
+  * classes for the Eigen API are Matrix, and VectorwiseOp.
+  *
+  * Note that some methods are defined in other modules such as the \ref LU_Module LU module
+  * for all functions related to matrix inversions.
+  *
+  * \tparam Derived is the derived type, e.g. a matrix type, or an expression, etc.
+  *
+  * When writing a function taking Eigen objects as argument, if you want your function
+  * to take as argument any matrix, vector, or expression, just let it take a
+  * MatrixBase argument. As an example, here is a function printFirstRow which, given
+  * a matrix, vector, or expression \a x, prints the first row of \a x.
+  *
+  * \code
+    template<typename Derived>
+    void printFirstRow(const Eigen::MatrixBase<Derived>& x)
+    {
+      cout << x.row(0) << endl;
+    }
+  * \endcode
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN.
+  *
+  * \sa \ref TopicClassHierarchy
+  */
+template<typename Derived> class MatrixBase
+  : public DenseBase<Derived>
+{
+  public:
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef MatrixBase StorageBaseType;
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    typedef DenseBase<Derived> Base;
+    using Base::RowsAtCompileTime;
+    using Base::ColsAtCompileTime;
+    using Base::SizeAtCompileTime;
+    using Base::MaxRowsAtCompileTime;
+    using Base::MaxColsAtCompileTime;
+    using Base::MaxSizeAtCompileTime;
+    using Base::IsVectorAtCompileTime;
+    using Base::Flags;
+    using Base::CoeffReadCost;
+
+    using Base::derived;
+    using Base::const_cast_derived;
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::coeff;
+    using Base::coeffRef;
+    using Base::lazyAssign;
+    using Base::eval;
+    using Base::operator+=;
+    using Base::operator-=;
+    using Base::operator*=;
+    using Base::operator/=;
+
+    typedef typename Base::CoeffReturnType CoeffReturnType;
+    typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType;
+    typedef typename Base::RowXpr RowXpr;
+    typedef typename Base::ColXpr ColXpr;
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** type of the equivalent square matrix */
+    typedef Matrix<Scalar,EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime),
+                          EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime)> SquareMatrixType;
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+    /** \returns the size of the main diagonal, which is min(rows(),cols()).
+      * \sa rows(), cols(), SizeAtCompileTime. */
+    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }
+
+    /** \brief The plain matrix type corresponding to this expression.
+      *
+      * This is not necessarily exactly the return type of eval(). In the case of plain matrices,
+      * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed
+      * that the return type of eval() is either PlainObject or const PlainObject&.
+      */
+    typedef Matrix<typename internal::traits<Derived>::Scalar,
+                internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime,
+                internal::traits<Derived>::MaxColsAtCompileTime
+          > PlainObject;
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal Represents a matrix with all coefficients equal to one another*/
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    /** \internal the return type of MatrixBase::adjoint() */
+    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
+                        ConstTransposeReturnType
+                     >::type AdjointReturnType;
+    /** \internal Return type of eigenvalues() */
+    typedef Matrix<std::complex<RealScalar>, internal::traits<Derived>::ColsAtCompileTime, 1, ColMajor> EigenvaluesReturnType;
+    /** \internal the return type of identity */
+    typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>,Derived> IdentityReturnType;
+    /** \internal the return type of unit vectors */
+    typedef Block<const CwiseNullaryOp<internal::scalar_identity_op<Scalar>, SquareMatrixType>,
+                  internal::traits<Derived>::RowsAtCompileTime,
+                  internal::traits<Derived>::ColsAtCompileTime> BasisReturnType;
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+#define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
+#   include "../plugins/CommonCwiseUnaryOps.h"
+#   include "../plugins/CommonCwiseBinaryOps.h"
+#   include "../plugins/MatrixCwiseUnaryOps.h"
+#   include "../plugins/MatrixCwiseBinaryOps.h"
+#   ifdef EIGEN_MATRIXBASE_PLUGIN
+#     include EIGEN_MATRIXBASE_PLUGIN
+#   endif
+#undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+
+    /** Special case of the template operator=, in order to prevent the compiler
+      * from generating a default operator= (issue hit with g++ 4.1)
+      */
+    Derived& operator=(const MatrixBase& other);
+
+    // We cannot inherit here via Base::operator= since it is causing
+    // trouble with MSVC.
+
+    template <typename OtherDerived>
+    Derived& operator=(const DenseBase<OtherDerived>& other);
+
+    template <typename OtherDerived>
+    Derived& operator=(const EigenBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    Derived& operator=(const ReturnByValue<OtherDerived>& other);
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename ProductDerived, typename Lhs, typename Rhs>
+    Derived& lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other);
+
+    template<typename MatrixPower, typename Lhs, typename Rhs>
+    Derived& lazyAssign(const MatrixPowerProduct<MatrixPower, Lhs,Rhs>& other);
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+    template<typename OtherDerived>
+    Derived& operator+=(const MatrixBase<OtherDerived>& other);
+    template<typename OtherDerived>
+    Derived& operator-=(const MatrixBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    const typename ProductReturnType<Derived,OtherDerived>::Type
+    operator*(const MatrixBase<OtherDerived> &other) const;
+
+    template<typename OtherDerived>
+    const typename LazyProductReturnType<Derived,OtherDerived>::Type
+    lazyProduct(const MatrixBase<OtherDerived> &other) const;
+
+    template<typename OtherDerived>
+    Derived& operator*=(const EigenBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    void applyOnTheLeft(const EigenBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    void applyOnTheRight(const EigenBase<OtherDerived>& other);
+
+    template<typename DiagonalDerived>
+    const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
+    operator*(const DiagonalBase<DiagonalDerived> &diagonal) const;
+
+    template<typename OtherDerived>
+    typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+    dot(const MatrixBase<OtherDerived>& other) const;
+
+    #ifdef EIGEN2_SUPPORT
+      template<typename OtherDerived>
+      Scalar eigen2_dot(const MatrixBase<OtherDerived>& other) const;
+    #endif
+
+    RealScalar squaredNorm() const;
+    RealScalar norm() const;
+    RealScalar stableNorm() const;
+    RealScalar blueNorm() const;
+    RealScalar hypotNorm() const;
+    const PlainObject normalized() const;
+    void normalize();
+
+    const AdjointReturnType adjoint() const;
+    void adjointInPlace();
+
+    typedef Diagonal<Derived> DiagonalReturnType;
+    DiagonalReturnType diagonal();
+    typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
+    ConstDiagonalReturnType diagonal() const;
+
+    template<int Index> struct DiagonalIndexReturnType { typedef Diagonal<Derived,Index> Type; };
+    template<int Index> struct ConstDiagonalIndexReturnType { typedef const Diagonal<const Derived,Index> Type; };
+
+    template<int Index> typename DiagonalIndexReturnType<Index>::Type diagonal();
+    template<int Index> typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
+    
+    typedef Diagonal<Derived,DynamicIndex> DiagonalDynamicIndexReturnType;
+    typedef typename internal::add_const<Diagonal<const Derived,DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType;
+
+    DiagonalDynamicIndexReturnType diagonal(Index index);
+    ConstDiagonalDynamicIndexReturnType diagonal(Index index) const;
+
+    #ifdef EIGEN2_SUPPORT
+    template<unsigned int Mode> typename internal::eigen2_part_return_type<Derived, Mode>::type part();
+    template<unsigned int Mode> const typename internal::eigen2_part_return_type<Derived, Mode>::type part() const;
+    
+    // huuuge hack. make Eigen2's matrix.part<Diagonal>() work in eigen3. Problem: Diagonal is now a class template instead
+    // of an integer constant. Solution: overload the part() method template wrt template parameters list.
+    template<template<typename T, int N> class U>
+    const DiagonalWrapper<ConstDiagonalReturnType> part() const
+    { return diagonal().asDiagonal(); }
+    #endif // EIGEN2_SUPPORT
+
+    template<unsigned int Mode> struct TriangularViewReturnType { typedef TriangularView<Derived, Mode> Type; };
+    template<unsigned int Mode> struct ConstTriangularViewReturnType { typedef const TriangularView<const Derived, Mode> Type; };
+
+    template<unsigned int Mode> typename TriangularViewReturnType<Mode>::Type triangularView();
+    template<unsigned int Mode> typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;
+
+    template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SelfAdjointView<Derived, UpLo> Type; };
+    template<unsigned int UpLo> struct ConstSelfAdjointViewReturnType { typedef const SelfAdjointView<const Derived, UpLo> Type; };
+
+    template<unsigned int UpLo> typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
+    template<unsigned int UpLo> typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
+
+    const SparseView<Derived> sparseView(const Scalar& m_reference = Scalar(0),
+                                         const typename NumTraits<Scalar>::Real& m_epsilon = NumTraits<Scalar>::dummy_precision()) const;
+    static const IdentityReturnType Identity();
+    static const IdentityReturnType Identity(Index rows, Index cols);
+    static const BasisReturnType Unit(Index size, Index i);
+    static const BasisReturnType Unit(Index i);
+    static const BasisReturnType UnitX();
+    static const BasisReturnType UnitY();
+    static const BasisReturnType UnitZ();
+    static const BasisReturnType UnitW();
+
+    const DiagonalWrapper<const Derived> asDiagonal() const;
+    const PermutationWrapper<const Derived> asPermutation() const;
+
+    Derived& setIdentity();
+    Derived& setIdentity(Index rows, Index cols);
+
+    bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+    bool isUpperTriangular(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    bool isLowerTriangular(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+    template<typename OtherDerived>
+    bool isOrthogonal(const MatrixBase<OtherDerived>& other,
+                      const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    bool isUnitary(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+    /** \returns true if each coefficients of \c *this and \a other are all exactly equal.
+      * \warning When using floating point scalar values you probably should rather use a
+      *          fuzzy comparison such as isApprox()
+      * \sa isApprox(), operator!= */
+    template<typename OtherDerived>
+    inline bool operator==(const MatrixBase<OtherDerived>& other) const
+    { return cwiseEqual(other).all(); }
+
+    /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
+      * \warning When using floating point scalar values you probably should rather use a
+      *          fuzzy comparison such as isApprox()
+      * \sa isApprox(), operator== */
+    template<typename OtherDerived>
+    inline bool operator!=(const MatrixBase<OtherDerived>& other) const
+    { return cwiseNotEqual(other).any(); }
+
+    NoAlias<Derived,Eigen::MatrixBase > noalias();
+
+    inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
+    inline ForceAlignedAccess<Derived> forceAlignedAccess();
+    template<bool Enable> inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type forceAlignedAccessIf() const;
+    template<bool Enable> inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
+
+    Scalar trace() const;
+
+/////////// Array module ///////////
+
+    template<int p> RealScalar lpNorm() const;
+
+    MatrixBase<Derived>& matrix() { return *this; }
+    const MatrixBase<Derived>& matrix() const { return *this; }
+
+    /** \returns an \link Eigen::ArrayBase Array \endlink expression of this matrix
+      * \sa ArrayBase::matrix() */
+    ArrayWrapper<Derived> array() { return derived(); }
+    const ArrayWrapper<const Derived> array() const { return derived(); }
+
+/////////// LU module ///////////
+
+    const FullPivLU<PlainObject> fullPivLu() const;
+    const PartialPivLU<PlainObject> partialPivLu() const;
+
+    #if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
+    const LU<PlainObject> lu() const;
+    #endif
+
+    #ifdef EIGEN2_SUPPORT
+    const LU<PlainObject> eigen2_lu() const;
+    #endif
+
+    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
+    const PartialPivLU<PlainObject> lu() const;
+    #endif
+    
+    #ifdef EIGEN2_SUPPORT
+    template<typename ResultType>
+    void computeInverse(MatrixBase<ResultType> *result) const {
+      *result = this->inverse();
+    }
+    #endif
+
+    const internal::inverse_impl<Derived> inverse() const;
+    template<typename ResultType>
+    void computeInverseAndDetWithCheck(
+      ResultType& inverse,
+      typename ResultType::Scalar& determinant,
+      bool& invertible,
+      const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
+    ) const;
+    template<typename ResultType>
+    void computeInverseWithCheck(
+      ResultType& inverse,
+      bool& invertible,
+      const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
+    ) const;
+    Scalar determinant() const;
+
+/////////// Cholesky module ///////////
+
+    const LLT<PlainObject>  llt() const;
+    const LDLT<PlainObject> ldlt() const;
+
+/////////// QR module ///////////
+
+    const HouseholderQR<PlainObject> householderQr() const;
+    const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
+    const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
+    
+    #ifdef EIGEN2_SUPPORT
+    const QR<PlainObject> qr() const;
+    #endif
+
+    EigenvaluesReturnType eigenvalues() const;
+    RealScalar operatorNorm() const;
+
+/////////// SVD module ///////////
+
+    JacobiSVD<PlainObject> jacobiSvd(unsigned int computationOptions = 0) const;
+
+    #ifdef EIGEN2_SUPPORT
+    SVD<PlainObject> svd() const;
+    #endif
+
+/////////// Geometry module ///////////
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /// \internal helper struct to form the return type of the cross product
+    template<typename OtherDerived> struct cross_product_return_type {
+      typedef typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
+      typedef Matrix<Scalar,MatrixBase::RowsAtCompileTime,MatrixBase::ColsAtCompileTime> type;
+    };
+    #endif // EIGEN_PARSED_BY_DOXYGEN
+    template<typename OtherDerived>
+    typename cross_product_return_type<OtherDerived>::type
+    cross(const MatrixBase<OtherDerived>& other) const;
+    template<typename OtherDerived>
+    PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
+    PlainObject unitOrthogonal(void) const;
+    Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
+    
+    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
+    ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
+    // put this as separate enum value to work around possible GCC 4.3 bug (?)
+    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1?Vertical:Horizontal };
+    typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType;
+    HomogeneousReturnType homogeneous() const;
+    #endif
+    
+    enum {
+      SizeMinusOne = SizeAtCompileTime==Dynamic ? Dynamic : SizeAtCompileTime-1
+    };
+    typedef Block<const Derived,
+                  internal::traits<Derived>::ColsAtCompileTime==1 ? SizeMinusOne : 1,
+                  internal::traits<Derived>::ColsAtCompileTime==1 ? 1 : SizeMinusOne> ConstStartMinusOne;
+    typedef CwiseUnaryOp<internal::scalar_quotient1_op<typename internal::traits<Derived>::Scalar>,
+                const ConstStartMinusOne > HNormalizedReturnType;
+
+    const HNormalizedReturnType hnormalized() const;
+
+////////// Householder module ///////////
+
+    void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
+    template<typename EssentialPart>
+    void makeHouseholder(EssentialPart& essential,
+                         Scalar& tau, RealScalar& beta) const;
+    template<typename EssentialPart>
+    void applyHouseholderOnTheLeft(const EssentialPart& essential,
+                                   const Scalar& tau,
+                                   Scalar* workspace);
+    template<typename EssentialPart>
+    void applyHouseholderOnTheRight(const EssentialPart& essential,
+                                    const Scalar& tau,
+                                    Scalar* workspace);
+
+///////// Jacobi module /////////
+
+    template<typename OtherScalar>
+    void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
+    template<typename OtherScalar>
+    void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
+
+///////// MatrixFunctions module /////////
+
+    typedef typename internal::stem_function<Scalar>::type StemFunction;
+    const MatrixExponentialReturnValue<Derived> exp() const;
+    const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
+    const MatrixFunctionReturnValue<Derived> cosh() const;
+    const MatrixFunctionReturnValue<Derived> sinh() const;
+    const MatrixFunctionReturnValue<Derived> cos() const;
+    const MatrixFunctionReturnValue<Derived> sin() const;
+    const MatrixSquareRootReturnValue<Derived> sqrt() const;
+    const MatrixLogarithmReturnValue<Derived> log() const;
+    const MatrixPowerReturnValue<Derived> pow(const RealScalar& p) const;
+
+#ifdef EIGEN2_SUPPORT
+    template<typename ProductDerived, typename Lhs, typename Rhs>
+    Derived& operator+=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
+                                      EvalBeforeAssigningBit>& other);
+
+    template<typename ProductDerived, typename Lhs, typename Rhs>
+    Derived& operator-=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
+                                      EvalBeforeAssigningBit>& other);
+
+    /** \deprecated because .lazy() is deprecated
+      * Overloaded for cache friendly product evaluation */
+    template<typename OtherDerived>
+    Derived& lazyAssign(const Flagged<OtherDerived, 0, EvalBeforeAssigningBit>& other)
+    { return lazyAssign(other._expression()); }
+
+    template<unsigned int Added>
+    const Flagged<Derived, Added, 0> marked() const;
+    const Flagged<Derived, 0, EvalBeforeAssigningBit> lazy() const;
+
+    inline const Cwise<Derived> cwise() const;
+    inline Cwise<Derived> cwise();
+
+    VectorBlock<Derived> start(Index size);
+    const VectorBlock<const Derived> start(Index size) const;
+    VectorBlock<Derived> end(Index size);
+    const VectorBlock<const Derived> end(Index size) const;
+    template<int Size> VectorBlock<Derived,Size> start();
+    template<int Size> const VectorBlock<const Derived,Size> start() const;
+    template<int Size> VectorBlock<Derived,Size> end();
+    template<int Size> const VectorBlock<const Derived,Size> end() const;
+
+    Minor<Derived> minor(Index row, Index col);
+    const Minor<Derived> minor(Index row, Index col) const;
+#endif
+
+  protected:
+    MatrixBase() : Base() {}
+
+  private:
+    explicit MatrixBase(int);
+    MatrixBase(int,int);
+    template<typename OtherDerived> explicit MatrixBase(const MatrixBase<OtherDerived>&);
+  protected:
+    // mixing arrays and matrices is not legal
+    template<typename OtherDerived> Derived& operator+=(const ArrayBase<OtherDerived>& )
+    {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;}
+    // mixing arrays and matrices is not legal
+    template<typename OtherDerived> Derived& operator-=(const ArrayBase<OtherDerived>& )
+    {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;}
+};
+
+
+/***************************************************************************
+* Implementation of matrix base methods
+***************************************************************************/
+
+/** replaces \c *this by \c *this * \a other.
+  *
+  * \returns a reference to \c *this
+  *
+  * Example: \include MatrixBase_applyOnTheRight.cpp
+  * Output: \verbinclude MatrixBase_applyOnTheRight.out
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline Derived&
+MatrixBase<Derived>::operator*=(const EigenBase<OtherDerived> &other)
+{
+  other.derived().applyThisOnTheRight(derived());
+  return derived();
+}
+
+/** replaces \c *this by \c *this * \a other. It is equivalent to MatrixBase::operator*=().
+  *
+  * Example: \include MatrixBase_applyOnTheRight.cpp
+  * Output: \verbinclude MatrixBase_applyOnTheRight.out
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline void MatrixBase<Derived>::applyOnTheRight(const EigenBase<OtherDerived> &other)
+{
+  other.derived().applyThisOnTheRight(derived());
+}
+
+/** replaces \c *this by \a other * \c *this.
+  *
+  * Example: \include MatrixBase_applyOnTheLeft.cpp
+  * Output: \verbinclude MatrixBase_applyOnTheLeft.out
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline void MatrixBase<Derived>::applyOnTheLeft(const EigenBase<OtherDerived> &other)
+{
+  other.derived().applyThisOnTheLeft(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIXBASE_H
diff --git a/include/eigen3/Eigen/src/Core/NestByValue.h b/include/eigen3/Eigen/src/Core/NestByValue.h
new file mode 100644
index 0000000..a893b17
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/NestByValue.h
@@ -0,0 +1,111 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NESTBYVALUE_H
+#define EIGEN_NESTBYVALUE_H
+
+namespace Eigen {
+
+/** \class NestByValue
+  * \ingroup Core_Module
+  *
+  * \brief Expression which must be nested by value
+  *
+  * \param ExpressionType the type of the object of which we are requiring nesting-by-value
+  *
+  * This class is the return type of MatrixBase::nestByValue()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::nestByValue()
+  */
+
+namespace internal {
+template<typename ExpressionType>
+struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
+{};
+}
+
+template<typename ExpressionType> class NestByValue
+  : public internal::dense_xpr_base< NestByValue<ExpressionType> >::type
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<NestByValue>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(NestByValue)
+
+    inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
+
+    inline Index rows() const { return m_expression.rows(); }
+    inline Index cols() const { return m_expression.cols(); }
+    inline Index outerStride() const { return m_expression.outerStride(); }
+    inline Index innerStride() const { return m_expression.innerStride(); }
+
+    inline const CoeffReturnType coeff(Index row, Index col) const
+    {
+      return m_expression.coeff(row, col);
+    }
+
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      return m_expression.const_cast_derived().coeffRef(row, col);
+    }
+
+    inline const CoeffReturnType coeff(Index index) const
+    {
+      return m_expression.coeff(index);
+    }
+
+    inline Scalar& coeffRef(Index index)
+    {
+      return m_expression.const_cast_derived().coeffRef(index);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index row, Index col) const
+    {
+      return m_expression.template packet<LoadMode>(row, col);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index row, Index col, const PacketScalar& x)
+    {
+      m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index index) const
+    {
+      return m_expression.template packet<LoadMode>(index);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index index, const PacketScalar& x)
+    {
+      m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
+    }
+
+    operator const ExpressionType&() const { return m_expression; }
+
+  protected:
+    const ExpressionType m_expression;
+};
+
+/** \returns an expression of the temporary version of *this.
+  */
+template<typename Derived>
+inline const NestByValue<Derived>
+DenseBase<Derived>::nestByValue() const
+{
+  return NestByValue<Derived>(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_NESTBYVALUE_H
diff --git a/include/eigen3/Eigen/src/Core/NoAlias.h b/include/eigen3/Eigen/src/Core/NoAlias.h
new file mode 100644
index 0000000..768bfb1
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/NoAlias.h
@@ -0,0 +1,134 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NOALIAS_H
+#define EIGEN_NOALIAS_H
+
+namespace Eigen {
+
+/** \class NoAlias
+  * \ingroup Core_Module
+  *
+  * \brief Pseudo expression providing an operator = assuming no aliasing
+  *
+  * \param ExpressionType the type of the object on which to do the lazy assignment
+  *
+  * This class represents an expression with special assignment operators
+  * assuming no aliasing between the target expression and the source expression.
+  * More precisely it alloas to bypass the EvalBeforeAssignBit flag of the source expression.
+  * It is the return type of MatrixBase::noalias()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::noalias()
+  */
+template<typename ExpressionType, template <typename> class StorageBase>
+class NoAlias
+{
+    typedef typename ExpressionType::Scalar Scalar;
+  public:
+    NoAlias(ExpressionType& expression) : m_expression(expression) {}
+
+    /** Behaves like MatrixBase::lazyAssign(other)
+      * \sa MatrixBase::lazyAssign() */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other)
+    { return internal::assign_selector<ExpressionType,OtherDerived,false>::run(m_expression,other.derived()); }
+
+    /** \sa MatrixBase::operator+= */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase<OtherDerived>& other)
+    {
+      typedef SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
+      SelfAdder tmp(m_expression);
+      typedef typename internal::nested<OtherDerived>::type OtherDerivedNested;
+      typedef typename internal::remove_all<OtherDerivedNested>::type _OtherDerivedNested;
+      internal::assign_selector<SelfAdder,_OtherDerivedNested,false>::run(tmp,OtherDerivedNested(other.derived()));
+      return m_expression;
+    }
+
+    /** \sa MatrixBase::operator-= */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other)
+    {
+      typedef SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
+      SelfAdder tmp(m_expression);
+      typedef typename internal::nested<OtherDerived>::type OtherDerivedNested;
+      typedef typename internal::remove_all<OtherDerivedNested>::type _OtherDerivedNested;
+      internal::assign_selector<SelfAdder,_OtherDerivedNested,false>::run(tmp,OtherDerivedNested(other.derived()));
+      return m_expression;
+    }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_STRONG_INLINE ExpressionType& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
+    { other.derived().addTo(m_expression); return m_expression; }
+
+    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_STRONG_INLINE ExpressionType& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
+    { other.derived().subTo(m_expression); return m_expression; }
+
+    template<typename Lhs, typename Rhs, int NestingFlags>
+    EIGEN_STRONG_INLINE ExpressionType& operator+=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
+    { return m_expression.derived() += CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
+
+    template<typename Lhs, typename Rhs, int NestingFlags>
+    EIGEN_STRONG_INLINE ExpressionType& operator-=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
+    { return m_expression.derived() -= CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
+    
+    template<typename OtherDerived>
+    ExpressionType& operator=(const ReturnByValue<OtherDerived>& func)
+    { return m_expression = func; }
+#endif
+
+    ExpressionType& expression() const
+    {
+      return m_expression;
+    }
+
+  protected:
+    ExpressionType& m_expression;
+};
+
+/** \returns a pseudo expression of \c *this with an operator= assuming
+  * no aliasing between \c *this and the source expression.
+  *
+  * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag.
+  * Currently, even though several expressions may alias, only product
+  * expressions have this flag. Therefore, noalias() is only usefull when
+  * the source expression contains a matrix product.
+  *
+  * Here are some examples where noalias is usefull:
+  * \code
+  * D.noalias()  = A * B;
+  * D.noalias() += A.transpose() * B;
+  * D.noalias() -= 2 * A * B.adjoint();
+  * \endcode
+  *
+  * On the other hand the following example will lead to a \b wrong result:
+  * \code
+  * A.noalias() = A * B;
+  * \endcode
+  * because the result matrix A is also an operand of the matrix product. Therefore,
+  * there is no alternative than evaluating A * B in a temporary, that is the default
+  * behavior when you write:
+  * \code
+  * A = A * B;
+  * \endcode
+  *
+  * \sa class NoAlias
+  */
+template<typename Derived>
+NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
+{
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_NOALIAS_H
diff --git a/include/eigen3/Eigen/src/Core/NumTraits.h b/include/eigen3/Eigen/src/Core/NumTraits.h
new file mode 100644
index 0000000..bac9e50
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/NumTraits.h
@@ -0,0 +1,150 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NUMTRAITS_H
+#define EIGEN_NUMTRAITS_H
+
+namespace Eigen {
+
+/** \class NumTraits
+  * \ingroup Core_Module
+  *
+  * \brief Holds information about the various numeric (i.e. scalar) types allowed by Eigen.
+  *
+  * \param T the numeric type at hand
+  *
+  * This class stores enums, typedefs and static methods giving information about a numeric type.
+  *
+  * The provided data consists of:
+  * \li A typedef \a Real, giving the "real part" type of \a T. If \a T is already real,
+  *     then \a Real is just a typedef to \a T. If \a T is \c std::complex<U> then \a Real
+  *     is a typedef to \a U.
+  * \li A typedef \a NonInteger, giving the type that should be used for operations producing non-integral values,
+  *     such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
+  *     \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
+  *     take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
+  *     only intended as a helper for code that needs to explicitly promote types.
+  * \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
+  *     this means, just use \a T here.
+  * \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c std::complex
+  *     type, and to 0 otherwise.
+  * \li An enum value \a IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int,
+  *     and to \c 0 otherwise.
+  * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed
+  *     to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers.
+  *     Stay vague here. No need to do architecture-specific stuff.
+  * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
+  * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
+  *     be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
+  * \li An epsilon() function which, unlike std::numeric_limits::epsilon(), returns a \a Real instead of a \a T.
+  * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
+  *     value by the fuzzy comparison operators.
+  * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
+  */
+
+template<typename T> struct GenericNumTraits
+{
+  enum {
+    IsInteger = std::numeric_limits<T>::is_integer,
+    IsSigned = std::numeric_limits<T>::is_signed,
+    IsComplex = 0,
+    RequireInitialization = internal::is_arithmetic<T>::value ? 0 : 1,
+    ReadCost = 1,
+    AddCost = 1,
+    MulCost = 1
+  };
+
+  typedef T Real;
+  typedef typename internal::conditional<
+                     IsInteger,
+                     typename internal::conditional<sizeof(T)<=2, float, double>::type,
+                     T
+                   >::type NonInteger;
+  typedef T Nested;
+
+  static inline Real epsilon() { return std::numeric_limits<T>::epsilon(); }
+  static inline Real dummy_precision()
+  {
+    // make sure to override this for floating-point types
+    return Real(0);
+  }
+  static inline T highest() { return (std::numeric_limits<T>::max)(); }
+  static inline T lowest()  { return IsInteger ? (std::numeric_limits<T>::min)() : (-(std::numeric_limits<T>::max)()); }
+  
+#ifdef EIGEN2_SUPPORT
+  enum {
+    HasFloatingPoint = !IsInteger
+  };
+  typedef NonInteger FloatingPoint;
+#endif
+};
+
+template<typename T> struct NumTraits : GenericNumTraits<T>
+{};
+
+template<> struct NumTraits<float>
+  : GenericNumTraits<float>
+{
+  static inline float dummy_precision() { return 1e-5f; }
+};
+
+template<> struct NumTraits<double> : GenericNumTraits<double>
+{
+  static inline double dummy_precision() { return 1e-12; }
+};
+
+template<> struct NumTraits<long double>
+  : GenericNumTraits<long double>
+{
+  static inline long double dummy_precision() { return 1e-15l; }
+};
+
+template<typename _Real> struct NumTraits<std::complex<_Real> >
+  : GenericNumTraits<std::complex<_Real> >
+{
+  typedef _Real Real;
+  enum {
+    IsComplex = 1,
+    RequireInitialization = NumTraits<_Real>::RequireInitialization,
+    ReadCost = 2 * NumTraits<_Real>::ReadCost,
+    AddCost = 2 * NumTraits<Real>::AddCost,
+    MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
+  };
+
+  static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
+  static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
+};
+
+template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
+{
+  typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> ArrayType;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Array<RealScalar, Rows, Cols, Options, MaxRows, MaxCols> Real;
+  typedef typename NumTraits<Scalar>::NonInteger NonIntegerScalar;
+  typedef Array<NonIntegerScalar, Rows, Cols, Options, MaxRows, MaxCols> NonInteger;
+  typedef ArrayType & Nested;
+  
+  enum {
+    IsComplex = NumTraits<Scalar>::IsComplex,
+    IsInteger = NumTraits<Scalar>::IsInteger,
+    IsSigned  = NumTraits<Scalar>::IsSigned,
+    RequireInitialization = 1,
+    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::ReadCost,
+    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost,
+    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost
+  };
+  
+  static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
+  static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_NUMTRAITS_H
diff --git a/include/eigen3/Eigen/src/Core/PermutationMatrix.h b/include/eigen3/Eigen/src/Core/PermutationMatrix.h
new file mode 100644
index 0000000..f26f3e5
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/PermutationMatrix.h
@@ -0,0 +1,692 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2009-2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PERMUTATIONMATRIX_H
+#define EIGEN_PERMUTATIONMATRIX_H
+
+namespace Eigen { 
+
+template<int RowCol,typename IndicesType,typename MatrixType, typename StorageKind> class PermutedImpl;
+
+/** \class PermutationBase
+  * \ingroup Core_Module
+  *
+  * \brief Base class for permutations
+  *
+  * \param Derived the derived class
+  *
+  * This class is the base class for all expressions representing a permutation matrix,
+  * internally stored as a vector of integers.
+  * The convention followed here is that if \f$ \sigma \f$ is a permutation, the corresponding permutation matrix
+  * \f$ P_\sigma \f$ is such that if \f$ (e_1,\ldots,e_p) \f$ is the canonical basis, we have:
+  *  \f[ P_\sigma(e_i) = e_{\sigma(i)}. \f]
+  * This convention ensures that for any two permutations \f$ \sigma, \tau \f$, we have:
+  *  \f[ P_{\sigma\circ\tau} = P_\sigma P_\tau. \f]
+  *
+  * Permutation matrices are square and invertible.
+  *
+  * Notice that in addition to the member functions and operators listed here, there also are non-member
+  * operator* to multiply any kind of permutation object with any kind of matrix expression (MatrixBase)
+  * on either side.
+  *
+  * \sa class PermutationMatrix, class PermutationWrapper
+  */
+
+namespace internal {
+
+template<typename PermutationType, typename MatrixType, int Side, bool Transposed=false>
+struct permut_matrix_product_retval;
+template<typename PermutationType, typename MatrixType, int Side, bool Transposed=false>
+struct permut_sparsematrix_product_retval;
+enum PermPermProduct_t {PermPermProduct};
+
+} // end namespace internal
+
+template<typename Derived>
+class PermutationBase : public EigenBase<Derived>
+{
+    typedef internal::traits<Derived> Traits;
+    typedef EigenBase<Derived> Base;
+  public:
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef typename Traits::IndicesType IndicesType;
+    enum {
+      Flags = Traits::Flags,
+      CoeffReadCost = Traits::CoeffReadCost,
+      RowsAtCompileTime = Traits::RowsAtCompileTime,
+      ColsAtCompileTime = Traits::ColsAtCompileTime,
+      MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
+    };
+    typedef typename Traits::Scalar Scalar;
+    typedef typename Traits::Index Index;
+    typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
+            DenseMatrixType;
+    typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,Index>
+            PlainPermutationType;
+    using Base::derived;
+    #endif
+
+    /** Copies the other permutation into *this */
+    template<typename OtherDerived>
+    Derived& operator=(const PermutationBase<OtherDerived>& other)
+    {
+      indices() = other.indices();
+      return derived();
+    }
+
+    /** Assignment from the Transpositions \a tr */
+    template<typename OtherDerived>
+    Derived& operator=(const TranspositionsBase<OtherDerived>& tr)
+    {
+      setIdentity(tr.size());
+      for(Index k=size()-1; k>=0; --k)
+        applyTranspositionOnTheRight(k,tr.coeff(k));
+      return derived();
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    Derived& operator=(const PermutationBase& other)
+    {
+      indices() = other.indices();
+      return derived();
+    }
+    #endif
+
+    /** \returns the number of rows */
+    inline Index rows() const { return Index(indices().size()); }
+
+    /** \returns the number of columns */
+    inline Index cols() const { return Index(indices().size()); }
+
+    /** \returns the size of a side of the respective square matrix, i.e., the number of indices */
+    inline Index size() const { return Index(indices().size()); }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename DenseDerived>
+    void evalTo(MatrixBase<DenseDerived>& other) const
+    {
+      other.setZero();
+      for (int i=0; i<rows();++i)
+        other.coeffRef(indices().coeff(i),i) = typename DenseDerived::Scalar(1);
+    }
+    #endif
+
+    /** \returns a Matrix object initialized from this permutation matrix. Notice that it
+      * is inefficient to return this Matrix object by value. For efficiency, favor using
+      * the Matrix constructor taking EigenBase objects.
+      */
+    DenseMatrixType toDenseMatrix() const
+    {
+      return derived();
+    }
+
+    /** const version of indices(). */
+    const IndicesType& indices() const { return derived().indices(); }
+    /** \returns a reference to the stored array representing the permutation. */
+    IndicesType& indices() { return derived().indices(); }
+
+    /** Resizes to given size.
+      */
+    inline void resize(Index newSize)
+    {
+      indices().resize(newSize);
+    }
+
+    /** Sets *this to be the identity permutation matrix */
+    void setIdentity()
+    {
+      for(Index i = 0; i < size(); ++i)
+        indices().coeffRef(i) = i;
+    }
+
+    /** Sets *this to be the identity permutation matrix of given size.
+      */
+    void setIdentity(Index newSize)
+    {
+      resize(newSize);
+      setIdentity();
+    }
+
+    /** Multiplies *this by the transposition \f$(ij)\f$ on the left.
+      *
+      * \returns a reference to *this.
+      *
+      * \warning This is much slower than applyTranspositionOnTheRight(int,int):
+      * this has linear complexity and requires a lot of branching.
+      *
+      * \sa applyTranspositionOnTheRight(int,int)
+      */
+    Derived& applyTranspositionOnTheLeft(Index i, Index j)
+    {
+      eigen_assert(i>=0 && j>=0 && i<size() && j<size());
+      for(Index k = 0; k < size(); ++k)
+      {
+        if(indices().coeff(k) == i) indices().coeffRef(k) = j;
+        else if(indices().coeff(k) == j) indices().coeffRef(k) = i;
+      }
+      return derived();
+    }
+
+    /** Multiplies *this by the transposition \f$(ij)\f$ on the right.
+      *
+      * \returns a reference to *this.
+      *
+      * This is a fast operation, it only consists in swapping two indices.
+      *
+      * \sa applyTranspositionOnTheLeft(int,int)
+      */
+    Derived& applyTranspositionOnTheRight(Index i, Index j)
+    {
+      eigen_assert(i>=0 && j>=0 && i<size() && j<size());
+      std::swap(indices().coeffRef(i), indices().coeffRef(j));
+      return derived();
+    }
+
+    /** \returns the inverse permutation matrix.
+      *
+      * \note \note_try_to_help_rvo
+      */
+    inline Transpose<PermutationBase> inverse() const
+    { return derived(); }
+    /** \returns the tranpose permutation matrix.
+      *
+      * \note \note_try_to_help_rvo
+      */
+    inline Transpose<PermutationBase> transpose() const
+    { return derived(); }
+
+    /**** multiplication helpers to hopefully get RVO ****/
+
+  
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  protected:
+    template<typename OtherDerived>
+    void assignTranspose(const PermutationBase<OtherDerived>& other)
+    {
+      for (int i=0; i<rows();++i) indices().coeffRef(other.indices().coeff(i)) = i;
+    }
+    template<typename Lhs,typename Rhs>
+    void assignProduct(const Lhs& lhs, const Rhs& rhs)
+    {
+      eigen_assert(lhs.cols() == rhs.rows());
+      for (int i=0; i<rows();++i) indices().coeffRef(i) = lhs.indices().coeff(rhs.indices().coeff(i));
+    }
+#endif
+
+  public:
+
+    /** \returns the product permutation matrix.
+      *
+      * \note \note_try_to_help_rvo
+      */
+    template<typename Other>
+    inline PlainPermutationType operator*(const PermutationBase<Other>& other) const
+    { return PlainPermutationType(internal::PermPermProduct, derived(), other.derived()); }
+
+    /** \returns the product of a permutation with another inverse permutation.
+      *
+      * \note \note_try_to_help_rvo
+      */
+    template<typename Other>
+    inline PlainPermutationType operator*(const Transpose<PermutationBase<Other> >& other) const
+    { return PlainPermutationType(internal::PermPermProduct, *this, other.eval()); }
+
+    /** \returns the product of an inverse permutation with another permutation.
+      *
+      * \note \note_try_to_help_rvo
+      */
+    template<typename Other> friend
+    inline PlainPermutationType operator*(const Transpose<PermutationBase<Other> >& other, const PermutationBase& perm)
+    { return PlainPermutationType(internal::PermPermProduct, other.eval(), perm); }
+
+  protected:
+
+};
+
+/** \class PermutationMatrix
+  * \ingroup Core_Module
+  *
+  * \brief Permutation matrix
+  *
+  * \param SizeAtCompileTime the number of rows/cols, or Dynamic
+  * \param MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
+  * \param IndexType the interger type of the indices
+  *
+  * This class represents a permutation matrix, internally stored as a vector of integers.
+  *
+  * \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix
+  */
+
+namespace internal {
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
+struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
+ : traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+{
+  typedef IndexType Index;
+  typedef Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+};
+}
+
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
+class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
+{
+    typedef PermutationBase<PermutationMatrix> Base;
+    typedef internal::traits<PermutationMatrix> Traits;
+  public:
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef typename Traits::IndicesType IndicesType;
+    #endif
+
+    inline PermutationMatrix()
+    {}
+
+    /** Constructs an uninitialized permutation matrix of given size.
+      */
+    inline PermutationMatrix(int size) : m_indices(size)
+    {}
+
+    /** Copy constructor. */
+    template<typename OtherDerived>
+    inline PermutationMatrix(const PermutationBase<OtherDerived>& other)
+      : m_indices(other.indices()) {}
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** Standard copy constructor. Defined only to prevent a default copy constructor
+      * from hiding the other templated constructor */
+    inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {}
+    #endif
+
+    /** Generic constructor from expression of the indices. The indices
+      * array has the meaning that the permutations sends each integer i to indices[i].
+      *
+      * \warning It is your responsibility to check that the indices array that you passes actually
+      * describes a permutation, i.e., each value between 0 and n-1 occurs exactly once, where n is the
+      * array's size.
+      */
+    template<typename Other>
+    explicit inline PermutationMatrix(const MatrixBase<Other>& a_indices) : m_indices(a_indices)
+    {}
+
+    /** Convert the Transpositions \a tr to a permutation matrix */
+    template<typename Other>
+    explicit PermutationMatrix(const TranspositionsBase<Other>& tr)
+      : m_indices(tr.size())
+    {
+      *this = tr;
+    }
+
+    /** Copies the other permutation into *this */
+    template<typename Other>
+    PermutationMatrix& operator=(const PermutationBase<Other>& other)
+    {
+      m_indices = other.indices();
+      return *this;
+    }
+
+    /** Assignment from the Transpositions \a tr */
+    template<typename Other>
+    PermutationMatrix& operator=(const TranspositionsBase<Other>& tr)
+    {
+      return Base::operator=(tr.derived());
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    PermutationMatrix& operator=(const PermutationMatrix& other)
+    {
+      m_indices = other.m_indices;
+      return *this;
+    }
+    #endif
+
+    /** const version of indices(). */
+    const IndicesType& indices() const { return m_indices; }
+    /** \returns a reference to the stored array representing the permutation. */
+    IndicesType& indices() { return m_indices; }
+
+
+    /**** multiplication helpers to hopefully get RVO ****/
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename Other>
+    PermutationMatrix(const Transpose<PermutationBase<Other> >& other)
+      : m_indices(other.nestedPermutation().size())
+    {
+      for (int i=0; i<m_indices.size();++i) m_indices.coeffRef(other.nestedPermutation().indices().coeff(i)) = i;
+    }
+    template<typename Lhs,typename Rhs>
+    PermutationMatrix(internal::PermPermProduct_t, const Lhs& lhs, const Rhs& rhs)
+      : m_indices(lhs.indices().size())
+    {
+      Base::assignProduct(lhs,rhs);
+    }
+#endif
+
+  protected:
+
+    IndicesType m_indices;
+};
+
+
+namespace internal {
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
+struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
+ : traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+{
+  typedef IndexType Index;
+  typedef Map<const Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
+};
+}
+
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
+class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess>
+  : public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
+{
+    typedef PermutationBase<Map> Base;
+    typedef internal::traits<Map> Traits;
+  public:
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef typename Traits::IndicesType IndicesType;
+    typedef typename IndicesType::Scalar Index;
+    #endif
+
+    inline Map(const Index* indicesPtr)
+      : m_indices(indicesPtr)
+    {}
+
+    inline Map(const Index* indicesPtr, Index size)
+      : m_indices(indicesPtr,size)
+    {}
+
+    /** Copies the other permutation into *this */
+    template<typename Other>
+    Map& operator=(const PermutationBase<Other>& other)
+    { return Base::operator=(other.derived()); }
+
+    /** Assignment from the Transpositions \a tr */
+    template<typename Other>
+    Map& operator=(const TranspositionsBase<Other>& tr)
+    { return Base::operator=(tr.derived()); }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    Map& operator=(const Map& other)
+    {
+      m_indices = other.m_indices;
+      return *this;
+    }
+    #endif
+
+    /** const version of indices(). */
+    const IndicesType& indices() const { return m_indices; }
+    /** \returns a reference to the stored array representing the permutation. */
+    IndicesType& indices() { return m_indices; }
+
+  protected:
+
+    IndicesType m_indices;
+};
+
+/** \class PermutationWrapper
+  * \ingroup Core_Module
+  *
+  * \brief Class to view a vector of integers as a permutation matrix
+  *
+  * \param _IndicesType the type of the vector of integer (can be any compatible expression)
+  *
+  * This class allows to view any vector expression of integers as a permutation matrix.
+  *
+  * \sa class PermutationBase, class PermutationMatrix
+  */
+
+struct PermutationStorage {};
+
+template<typename _IndicesType> class TranspositionsWrapper;
+namespace internal {
+template<typename _IndicesType>
+struct traits<PermutationWrapper<_IndicesType> >
+{
+  typedef PermutationStorage StorageKind;
+  typedef typename _IndicesType::Scalar Scalar;
+  typedef typename _IndicesType::Scalar Index;
+  typedef _IndicesType IndicesType;
+  enum {
+    RowsAtCompileTime = _IndicesType::SizeAtCompileTime,
+    ColsAtCompileTime = _IndicesType::SizeAtCompileTime,
+    MaxRowsAtCompileTime = IndicesType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = IndicesType::MaxColsAtCompileTime,
+    Flags = 0,
+    CoeffReadCost = _IndicesType::CoeffReadCost
+  };
+};
+}
+
+template<typename _IndicesType>
+class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesType> >
+{
+    typedef PermutationBase<PermutationWrapper> Base;
+    typedef internal::traits<PermutationWrapper> Traits;
+  public:
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef typename Traits::IndicesType IndicesType;
+    #endif
+
+    inline PermutationWrapper(const IndicesType& a_indices)
+      : m_indices(a_indices)
+    {}
+
+    /** const version of indices(). */
+    const typename internal::remove_all<typename IndicesType::Nested>::type&
+    indices() const { return m_indices; }
+
+  protected:
+
+    typename IndicesType::Nested m_indices;
+};
+
+/** \returns the matrix with the permutation applied to the columns.
+  */
+template<typename Derived, typename PermutationDerived>
+inline const internal::permut_matrix_product_retval<PermutationDerived, Derived, OnTheRight>
+operator*(const MatrixBase<Derived>& matrix,
+          const PermutationBase<PermutationDerived> &permutation)
+{
+  return internal::permut_matrix_product_retval
+           <PermutationDerived, Derived, OnTheRight>
+           (permutation.derived(), matrix.derived());
+}
+
+/** \returns the matrix with the permutation applied to the rows.
+  */
+template<typename Derived, typename PermutationDerived>
+inline const internal::permut_matrix_product_retval
+               <PermutationDerived, Derived, OnTheLeft>
+operator*(const PermutationBase<PermutationDerived> &permutation,
+          const MatrixBase<Derived>& matrix)
+{
+  return internal::permut_matrix_product_retval
+           <PermutationDerived, Derived, OnTheLeft>
+           (permutation.derived(), matrix.derived());
+}
+
+namespace internal {
+
+template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
+struct traits<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
+{
+  typedef typename MatrixType::PlainObject ReturnType;
+};
+
+template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
+struct permut_matrix_product_retval
+ : public ReturnByValue<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
+{
+    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
+    typedef typename MatrixType::Index Index;
+
+    permut_matrix_product_retval(const PermutationType& perm, const MatrixType& matrix)
+      : m_permutation(perm), m_matrix(matrix)
+    {}
+
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }
+
+    template<typename Dest> inline void evalTo(Dest& dst) const
+    {
+      const Index n = Side==OnTheLeft ? rows() : cols();
+      // FIXME we need an is_same for expression that is not sensitive to constness. For instance
+      // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
+      if(    is_same<MatrixTypeNestedCleaned,Dest>::value
+          && blas_traits<MatrixTypeNestedCleaned>::HasUsableDirectAccess
+          && blas_traits<Dest>::HasUsableDirectAccess
+          && extract_data(dst) == extract_data(m_matrix))
+      {
+        // apply the permutation inplace
+        Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(m_permutation.size());
+        mask.fill(false);
+        Index r = 0;
+        while(r < m_permutation.size())
+        {
+          // search for the next seed
+          while(r<m_permutation.size() && mask[r]) r++;
+          if(r>=m_permutation.size())
+            break;
+          // we got one, let's follow it until we are back to the seed
+          Index k0 = r++;
+          Index kPrev = k0;
+          mask.coeffRef(k0) = true;
+          for(Index k=m_permutation.indices().coeff(k0); k!=k0; k=m_permutation.indices().coeff(k))
+          {
+                  Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k)
+            .swap(Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
+                       (dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev));
+
+            mask.coeffRef(k) = true;
+            kPrev = k;
+          }
+        }
+      }
+      else
+      {
+        for(int i = 0; i < n; ++i)
+        {
+          Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
+               (dst, ((Side==OnTheLeft) ^ Transposed) ? m_permutation.indices().coeff(i) : i)
+
+          =
+
+          Block<const MatrixTypeNestedCleaned,Side==OnTheLeft ? 1 : MatrixType::RowsAtCompileTime,Side==OnTheRight ? 1 : MatrixType::ColsAtCompileTime>
+               (m_matrix, ((Side==OnTheRight) ^ Transposed) ? m_permutation.indices().coeff(i) : i);
+        }
+      }
+    }
+
+  protected:
+    const PermutationType& m_permutation;
+    typename MatrixType::Nested m_matrix;
+};
+
+/* Template partial specialization for transposed/inverse permutations */
+
+template<typename Derived>
+struct traits<Transpose<PermutationBase<Derived> > >
+ : traits<Derived>
+{};
+
+} // end namespace internal
+
+template<typename Derived>
+class Transpose<PermutationBase<Derived> >
+  : public EigenBase<Transpose<PermutationBase<Derived> > >
+{
+    typedef Derived PermutationType;
+    typedef typename PermutationType::IndicesType IndicesType;
+    typedef typename PermutationType::PlainPermutationType PlainPermutationType;
+  public:
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef internal::traits<PermutationType> Traits;
+    typedef typename Derived::DenseMatrixType DenseMatrixType;
+    enum {
+      Flags = Traits::Flags,
+      CoeffReadCost = Traits::CoeffReadCost,
+      RowsAtCompileTime = Traits::RowsAtCompileTime,
+      ColsAtCompileTime = Traits::ColsAtCompileTime,
+      MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
+    };
+    typedef typename Traits::Scalar Scalar;
+    #endif
+
+    Transpose(const PermutationType& p) : m_permutation(p) {}
+
+    inline int rows() const { return m_permutation.rows(); }
+    inline int cols() const { return m_permutation.cols(); }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename DenseDerived>
+    void evalTo(MatrixBase<DenseDerived>& other) const
+    {
+      other.setZero();
+      for (int i=0; i<rows();++i)
+        other.coeffRef(i, m_permutation.indices().coeff(i)) = typename DenseDerived::Scalar(1);
+    }
+    #endif
+
+    /** \return the equivalent permutation matrix */
+    PlainPermutationType eval() const { return *this; }
+
+    DenseMatrixType toDenseMatrix() const { return *this; }
+
+    /** \returns the matrix with the inverse permutation applied to the columns.
+      */
+    template<typename OtherDerived> friend
+    inline const internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheRight, true>
+    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trPerm)
+    {
+      return internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheRight, true>(trPerm.m_permutation, matrix.derived());
+    }
+
+    /** \returns the matrix with the inverse permutation applied to the rows.
+      */
+    template<typename OtherDerived>
+    inline const internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheLeft, true>
+    operator*(const MatrixBase<OtherDerived>& matrix) const
+    {
+      return internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheLeft, true>(m_permutation, matrix.derived());
+    }
+
+    const PermutationType& nestedPermutation() const { return m_permutation; }
+
+  protected:
+    const PermutationType& m_permutation;
+};
+
+template<typename Derived>
+const PermutationWrapper<const Derived> MatrixBase<Derived>::asPermutation() const
+{
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_PERMUTATIONMATRIX_H
diff --git a/include/eigen3/Eigen/src/Core/PlainObjectBase.h b/include/eigen3/Eigen/src/Core/PlainObjectBase.h
new file mode 100644
index 0000000..dd34b59
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/PlainObjectBase.h
@@ -0,0 +1,790 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DENSESTORAGEBASE_H
+#define EIGEN_DENSESTORAGEBASE_H
+
+#if defined(EIGEN_INITIALIZE_MATRICES_BY_ZERO)
+# define EIGEN_INITIALIZE_COEFFS
+# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i<base().size();++i) coeffRef(i)=Scalar(0);
+#elif defined(EIGEN_INITIALIZE_MATRICES_BY_NAN)
+# define EIGEN_INITIALIZE_COEFFS
+# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i<base().size();++i) coeffRef(i)=std::numeric_limits<Scalar>::quiet_NaN();
+#else
+# undef EIGEN_INITIALIZE_COEFFS
+# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+template<int MaxSizeAtCompileTime> struct check_rows_cols_for_overflow {
+  template<typename Index>
+  static EIGEN_ALWAYS_INLINE void run(Index, Index)
+  {
+  }
+};
+
+template<> struct check_rows_cols_for_overflow<Dynamic> {
+  template<typename Index>
+  static EIGEN_ALWAYS_INLINE void run(Index rows, Index cols)
+  {
+    // http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
+    // we assume Index is signed
+    Index max_index = (size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
+    bool error = (rows == 0 || cols == 0) ? false
+               : (rows > max_index / cols);
+    if (error)
+      throw_std_bad_alloc();
+  }
+};
+
+template <typename Derived,
+          typename OtherDerived = Derived,
+          bool IsVector = bool(Derived::IsVectorAtCompileTime) && bool(OtherDerived::IsVectorAtCompileTime)>
+struct conservative_resize_like_impl;
+
+template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct matrix_swap_impl;
+
+} // end namespace internal
+
+/** \class PlainObjectBase
+  * \brief %Dense storage base class for matrices and arrays.
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
+  *
+  * \sa \ref TopicClassHierarchy
+  */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+namespace internal {
+
+// this is a warkaround to doxygen not being able to understand the inheritence logic
+// when it is hidden by the dense_xpr_base helper struct.
+template<typename Derived> struct dense_xpr_base_dispatcher_for_doxygen;// : public MatrixBase<Derived> {};
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct dense_xpr_base_dispatcher_for_doxygen<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+    : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct dense_xpr_base_dispatcher_for_doxygen<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+    : public ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
+
+} // namespace internal
+
+template<typename Derived>
+class PlainObjectBase : public internal::dense_xpr_base_dispatcher_for_doxygen<Derived>
+#else
+template<typename Derived>
+class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
+#endif
+{
+  public:
+    enum { Options = internal::traits<Derived>::Options };
+    typedef typename internal::dense_xpr_base<Derived>::type Base;
+
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef Derived DenseType;
+
+    using Base::RowsAtCompileTime;
+    using Base::ColsAtCompileTime;
+    using Base::SizeAtCompileTime;
+    using Base::MaxRowsAtCompileTime;
+    using Base::MaxColsAtCompileTime;
+    using Base::MaxSizeAtCompileTime;
+    using Base::IsVectorAtCompileTime;
+    using Base::Flags;
+
+    template<typename PlainObjectType, int MapOptions, typename StrideType> friend class Eigen::Map;
+    friend  class Eigen::Map<Derived, Unaligned>;
+    typedef Eigen::Map<Derived, Unaligned>  MapType;
+    friend  class Eigen::Map<const Derived, Unaligned>;
+    typedef const Eigen::Map<const Derived, Unaligned> ConstMapType;
+    friend  class Eigen::Map<Derived, Aligned>;
+    typedef Eigen::Map<Derived, Aligned> AlignedMapType;
+    friend  class Eigen::Map<const Derived, Aligned>;
+    typedef const Eigen::Map<const Derived, Aligned> ConstAlignedMapType;
+    template<typename StrideType> struct StridedMapType { typedef Eigen::Map<Derived, Unaligned, StrideType> type; };
+    template<typename StrideType> struct StridedConstMapType { typedef Eigen::Map<const Derived, Unaligned, StrideType> type; };
+    template<typename StrideType> struct StridedAlignedMapType { typedef Eigen::Map<Derived, Aligned, StrideType> type; };
+    template<typename StrideType> struct StridedConstAlignedMapType { typedef Eigen::Map<const Derived, Aligned, StrideType> type; };
+
+  protected:
+    DenseStorage<Scalar, Base::MaxSizeAtCompileTime, Base::RowsAtCompileTime, Base::ColsAtCompileTime, Options> m_storage;
+
+  public:
+    enum { NeedsToAlign = SizeAtCompileTime != Dynamic && (internal::traits<Derived>::Flags & AlignedBit) != 0 };
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
+
+    Base& base() { return *static_cast<Base*>(this); }
+    const Base& base() const { return *static_cast<const Base*>(this); }
+
+    EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); }
+
+    EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const
+    {
+      if(Flags & RowMajorBit)
+        return m_storage.data()[colId + rowId * m_storage.cols()];
+      else // column-major
+        return m_storage.data()[rowId + colId * m_storage.rows()];
+    }
+
+    EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
+    {
+      return m_storage.data()[index];
+    }
+
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId)
+    {
+      if(Flags & RowMajorBit)
+        return m_storage.data()[colId + rowId * m_storage.cols()];
+      else // column-major
+        return m_storage.data()[rowId + colId * m_storage.rows()];
+    }
+
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+    {
+      return m_storage.data()[index];
+    }
+
+    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const
+    {
+      if(Flags & RowMajorBit)
+        return m_storage.data()[colId + rowId * m_storage.cols()];
+      else // column-major
+        return m_storage.data()[rowId + colId * m_storage.rows()];
+    }
+
+    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const
+    {
+      return m_storage.data()[index];
+    }
+
+    /** \internal */
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
+    {
+      return internal::ploadt<PacketScalar, LoadMode>
+               (m_storage.data() + (Flags & RowMajorBit
+                                   ? colId + rowId * m_storage.cols()
+                                   : rowId + colId * m_storage.rows()));
+    }
+
+    /** \internal */
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
+    {
+      return internal::ploadt<PacketScalar, LoadMode>(m_storage.data() + index);
+    }
+
+    /** \internal */
+    template<int StoreMode>
+    EIGEN_STRONG_INLINE void writePacket(Index rowId, Index colId, const PacketScalar& val)
+    {
+      internal::pstoret<Scalar, PacketScalar, StoreMode>
+              (m_storage.data() + (Flags & RowMajorBit
+                                   ? colId + rowId * m_storage.cols()
+                                   : rowId + colId * m_storage.rows()), val);
+    }
+
+    /** \internal */
+    template<int StoreMode>
+    EIGEN_STRONG_INLINE void writePacket(Index index, const PacketScalar& val)
+    {
+      internal::pstoret<Scalar, PacketScalar, StoreMode>(m_storage.data() + index, val);
+    }
+
+    /** \returns a const pointer to the data array of this matrix */
+    EIGEN_STRONG_INLINE const Scalar *data() const
+    { return m_storage.data(); }
+
+    /** \returns a pointer to the data array of this matrix */
+    EIGEN_STRONG_INLINE Scalar *data()
+    { return m_storage.data(); }
+
+    /** Resizes \c *this to a \a rows x \a cols matrix.
+      *
+      * This method is intended for dynamic-size matrices, although it is legal to call it on any
+      * matrix as long as fixed dimensions are left unchanged. If you only want to change the number
+      * of rows and/or of columns, you can use resize(NoChange_t, Index), resize(Index, NoChange_t).
+      *
+      * If the current number of coefficients of \c *this exactly matches the
+      * product \a rows * \a cols, then no memory allocation is performed and
+      * the current values are left unchanged. In all other cases, including
+      * shrinking, the data is reallocated and all previous values are lost.
+      *
+      * Example: \include Matrix_resize_int_int.cpp
+      * Output: \verbinclude Matrix_resize_int_int.out
+      *
+      * \sa resize(Index) for vectors, resize(NoChange_t, Index), resize(Index, NoChange_t)
+      */
+    EIGEN_STRONG_INLINE void resize(Index nbRows, Index nbCols)
+    {
+      eigen_assert(   EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,nbRows==RowsAtCompileTime)
+                   && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,nbCols==ColsAtCompileTime)
+                   && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,nbRows<=MaxRowsAtCompileTime)
+                   && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,nbCols<=MaxColsAtCompileTime)
+                   && nbRows>=0 && nbCols>=0 && "Invalid sizes when resizing a matrix or array.");
+      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(nbRows, nbCols);
+      #ifdef EIGEN_INITIALIZE_COEFFS
+        Index size = nbRows*nbCols;
+        bool size_changed = size != this->size();
+        m_storage.resize(size, nbRows, nbCols);
+        if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+      #else
+        internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(nbRows, nbCols);
+        m_storage.resize(nbRows*nbCols, nbRows, nbCols);
+      #endif
+    }
+
+    /** Resizes \c *this to a vector of length \a size
+      *
+      * \only_for_vectors. This method does not work for
+      * partially dynamic matrices when the static dimension is anything other
+      * than 1. For example it will not work with Matrix<double, 2, Dynamic>.
+      *
+      * Example: \include Matrix_resize_int.cpp
+      * Output: \verbinclude Matrix_resize_int.out
+      *
+      * \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t)
+      */
+    inline void resize(Index size)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase)
+      eigen_assert(((SizeAtCompileTime == Dynamic && (MaxSizeAtCompileTime==Dynamic || size<=MaxSizeAtCompileTime)) || SizeAtCompileTime == size) && size>=0);
+      #ifdef EIGEN_INITIALIZE_COEFFS
+        bool size_changed = size != this->size();
+      #endif
+      if(RowsAtCompileTime == 1)
+        m_storage.resize(size, 1, size);
+      else
+        m_storage.resize(size, size, 1);
+      #ifdef EIGEN_INITIALIZE_COEFFS
+        if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+      #endif
+    }
+
+    /** Resizes the matrix, changing only the number of columns. For the parameter of type NoChange_t, just pass the special value \c NoChange
+      * as in the example below.
+      *
+      * Example: \include Matrix_resize_NoChange_int.cpp
+      * Output: \verbinclude Matrix_resize_NoChange_int.out
+      *
+      * \sa resize(Index,Index)
+      */
+    inline void resize(NoChange_t, Index nbCols)
+    {
+      resize(rows(), nbCols);
+    }
+
+    /** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special value \c NoChange
+      * as in the example below.
+      *
+      * Example: \include Matrix_resize_int_NoChange.cpp
+      * Output: \verbinclude Matrix_resize_int_NoChange.out
+      *
+      * \sa resize(Index,Index)
+      */
+    inline void resize(Index nbRows, NoChange_t)
+    {
+      resize(nbRows, cols());
+    }
+
+    /** Resizes \c *this to have the same dimensions as \a other.
+      * Takes care of doing all the checking that's needed.
+      *
+      * Note that copying a row-vector into a vector (and conversely) is allowed.
+      * The resizing, if any, is then done in the appropriate way so that row-vectors
+      * remain row-vectors and vectors remain vectors.
+      */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other)
+    {
+      const OtherDerived& other = _other.derived();
+      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(other.rows(), other.cols());
+      const Index othersize = other.rows()*other.cols();
+      if(RowsAtCompileTime == 1)
+      {
+        eigen_assert(other.rows() == 1 || other.cols() == 1);
+        resize(1, othersize);
+      }
+      else if(ColsAtCompileTime == 1)
+      {
+        eigen_assert(other.rows() == 1 || other.cols() == 1);
+        resize(othersize, 1);
+      }
+      else resize(other.rows(), other.cols());
+    }
+
+    /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
+      *
+      * The method is intended for matrices of dynamic size. If you only want to change the number
+      * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
+      * conservativeResize(Index, NoChange_t).
+      *
+      * Matrices are resized relative to the top-left element. In case values need to be 
+      * appended to the matrix they will be uninitialized.
+      */
+    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, Index nbCols)
+    {
+      internal::conservative_resize_like_impl<Derived>::run(*this, nbRows, nbCols);
+    }
+
+    /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
+      *
+      * As opposed to conservativeResize(Index rows, Index cols), this version leaves
+      * the number of columns unchanged.
+      *
+      * In case the matrix is growing, new rows will be uninitialized.
+      */
+    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, NoChange_t)
+    {
+      // Note: see the comment in conservativeResize(Index,Index)
+      conservativeResize(nbRows, cols());
+    }
+
+    /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
+      *
+      * As opposed to conservativeResize(Index rows, Index cols), this version leaves
+      * the number of rows unchanged.
+      *
+      * In case the matrix is growing, new columns will be uninitialized.
+      */
+    EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index nbCols)
+    {
+      // Note: see the comment in conservativeResize(Index,Index)
+      conservativeResize(rows(), nbCols);
+    }
+
+    /** Resizes the vector to \a size while retaining old values.
+      *
+      * \only_for_vectors. This method does not work for
+      * partially dynamic matrices when the static dimension is anything other
+      * than 1. For example it will not work with Matrix<double, 2, Dynamic>.
+      *
+      * When values are appended, they will be uninitialized.
+      */
+    EIGEN_STRONG_INLINE void conservativeResize(Index size)
+    {
+      internal::conservative_resize_like_impl<Derived>::run(*this, size);
+    }
+
+    /** Resizes the matrix to \a rows x \a cols of \c other, while leaving old values untouched.
+      *
+      * The method is intended for matrices of dynamic size. If you only want to change the number
+      * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
+      * conservativeResize(Index, NoChange_t).
+      *
+      * Matrices are resized relative to the top-left element. In case values need to be 
+      * appended to the matrix they will copied from \c other.
+      */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE void conservativeResizeLike(const DenseBase<OtherDerived>& other)
+    {
+      internal::conservative_resize_like_impl<Derived,OtherDerived>::run(*this, other);
+    }
+
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    EIGEN_STRONG_INLINE Derived& operator=(const PlainObjectBase& other)
+    {
+      return _set(other);
+    }
+
+    /** \sa MatrixBase::lazyAssign() */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Derived& lazyAssign(const DenseBase<OtherDerived>& other)
+    {
+      _resize_to_match(other);
+      return Base::lazyAssign(other.derived());
+    }
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Derived& operator=(const ReturnByValue<OtherDerived>& func)
+    {
+      resize(func.rows(), func.cols());
+      return Base::operator=(func);
+    }
+
+    EIGEN_STRONG_INLINE PlainObjectBase() : m_storage()
+    {
+//       _check_template_params();
+//       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    // FIXME is it still needed ?
+    /** \internal */
+    PlainObjectBase(internal::constructor_without_unaligned_array_assert)
+      : m_storage(internal::constructor_without_unaligned_array_assert())
+    {
+//       _check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+#endif
+
+    EIGEN_STRONG_INLINE PlainObjectBase(Index a_size, Index nbRows, Index nbCols)
+      : m_storage(a_size, nbRows, nbCols)
+    {
+//       _check_template_params();
+//       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+
+    /** \copydoc MatrixBase::operator=(const EigenBase<OtherDerived>&)
+      */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
+    {
+      _resize_to_match(other);
+      Base::operator=(other.derived());
+      return this->derived();
+    }
+
+    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived> &other)
+      : m_storage(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
+    {
+      _check_template_params();
+      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(other.derived().rows(), other.derived().cols());
+      Base::operator=(other.derived());
+    }
+
+    /** \name Map
+      * These are convenience functions returning Map objects. The Map() static functions return unaligned Map objects,
+      * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
+      * \a data pointers.
+      *
+      * \see class Map
+      */
+    //@{
+    static inline ConstMapType Map(const Scalar* data)
+    { return ConstMapType(data); }
+    static inline MapType Map(Scalar* data)
+    { return MapType(data); }
+    static inline ConstMapType Map(const Scalar* data, Index size)
+    { return ConstMapType(data, size); }
+    static inline MapType Map(Scalar* data, Index size)
+    { return MapType(data, size); }
+    static inline ConstMapType Map(const Scalar* data, Index rows, Index cols)
+    { return ConstMapType(data, rows, cols); }
+    static inline MapType Map(Scalar* data, Index rows, Index cols)
+    { return MapType(data, rows, cols); }
+
+    static inline ConstAlignedMapType MapAligned(const Scalar* data)
+    { return ConstAlignedMapType(data); }
+    static inline AlignedMapType MapAligned(Scalar* data)
+    { return AlignedMapType(data); }
+    static inline ConstAlignedMapType MapAligned(const Scalar* data, Index size)
+    { return ConstAlignedMapType(data, size); }
+    static inline AlignedMapType MapAligned(Scalar* data, Index size)
+    { return AlignedMapType(data, size); }
+    static inline ConstAlignedMapType MapAligned(const Scalar* data, Index rows, Index cols)
+    { return ConstAlignedMapType(data, rows, cols); }
+    static inline AlignedMapType MapAligned(Scalar* data, Index rows, Index cols)
+    { return AlignedMapType(data, rows, cols); }
+
+    template<int Outer, int Inner>
+    static inline typename StridedConstMapType<Stride<Outer, Inner> >::type Map(const Scalar* data, const Stride<Outer, Inner>& stride)
+    { return typename StridedConstMapType<Stride<Outer, Inner> >::type(data, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedMapType<Stride<Outer, Inner> >::type Map(Scalar* data, const Stride<Outer, Inner>& stride)
+    { return typename StridedMapType<Stride<Outer, Inner> >::type(data, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedConstMapType<Stride<Outer, Inner> >::type Map(const Scalar* data, Index size, const Stride<Outer, Inner>& stride)
+    { return typename StridedConstMapType<Stride<Outer, Inner> >::type(data, size, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedMapType<Stride<Outer, Inner> >::type Map(Scalar* data, Index size, const Stride<Outer, Inner>& stride)
+    { return typename StridedMapType<Stride<Outer, Inner> >::type(data, size, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedConstMapType<Stride<Outer, Inner> >::type Map(const Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride)
+    { return typename StridedConstMapType<Stride<Outer, Inner> >::type(data, rows, cols, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedMapType<Stride<Outer, Inner> >::type Map(Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride)
+    { return typename StridedMapType<Stride<Outer, Inner> >::type(data, rows, cols, stride); }
+
+    template<int Outer, int Inner>
+    static inline typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type MapAligned(const Scalar* data, const Stride<Outer, Inner>& stride)
+    { return typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type(data, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedAlignedMapType<Stride<Outer, Inner> >::type MapAligned(Scalar* data, const Stride<Outer, Inner>& stride)
+    { return typename StridedAlignedMapType<Stride<Outer, Inner> >::type(data, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type MapAligned(const Scalar* data, Index size, const Stride<Outer, Inner>& stride)
+    { return typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type(data, size, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedAlignedMapType<Stride<Outer, Inner> >::type MapAligned(Scalar* data, Index size, const Stride<Outer, Inner>& stride)
+    { return typename StridedAlignedMapType<Stride<Outer, Inner> >::type(data, size, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type MapAligned(const Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride)
+    { return typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type(data, rows, cols, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedAlignedMapType<Stride<Outer, Inner> >::type MapAligned(Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride)
+    { return typename StridedAlignedMapType<Stride<Outer, Inner> >::type(data, rows, cols, stride); }
+    //@}
+
+    using Base::setConstant;
+    Derived& setConstant(Index size, const Scalar& value);
+    Derived& setConstant(Index rows, Index cols, const Scalar& value);
+
+    using Base::setZero;
+    Derived& setZero(Index size);
+    Derived& setZero(Index rows, Index cols);
+
+    using Base::setOnes;
+    Derived& setOnes(Index size);
+    Derived& setOnes(Index rows, Index cols);
+
+    using Base::setRandom;
+    Derived& setRandom(Index size);
+    Derived& setRandom(Index rows, Index cols);
+
+    #ifdef EIGEN_PLAINOBJECTBASE_PLUGIN
+    #include EIGEN_PLAINOBJECTBASE_PLUGIN
+    #endif
+
+  protected:
+    /** \internal Resizes *this in preparation for assigning \a other to it.
+      * Takes care of doing all the checking that's needed.
+      *
+      * Note that copying a row-vector into a vector (and conversely) is allowed.
+      * The resizing, if any, is then done in the appropriate way so that row-vectors
+      * remain row-vectors and vectors remain vectors.
+      */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase<OtherDerived>& other)
+    {
+      #ifdef EIGEN_NO_AUTOMATIC_RESIZING
+      eigen_assert((this->size()==0 || (IsVectorAtCompileTime ? (this->size() == other.size())
+                 : (rows() == other.rows() && cols() == other.cols())))
+        && "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
+      EIGEN_ONLY_USED_FOR_DEBUG(other);
+      #else
+      resizeLike(other);
+      #endif
+    }
+
+    /**
+      * \brief Copies the value of the expression \a other into \c *this with automatic resizing.
+      *
+      * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
+      * it will be initialized.
+      *
+      * Note that copying a row-vector into a vector (and conversely) is allowed.
+      * The resizing, if any, is then done in the appropriate way so that row-vectors
+      * remain row-vectors and vectors remain vectors.
+      *
+      * \sa operator=(const MatrixBase<OtherDerived>&), _set_noalias()
+      *
+      * \internal
+      */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Derived& _set(const DenseBase<OtherDerived>& other)
+    {
+      _set_selector(other.derived(), typename internal::conditional<static_cast<bool>(int(OtherDerived::Flags) & EvalBeforeAssigningBit), internal::true_type, internal::false_type>::type());
+      return this->derived();
+    }
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::true_type&) { _set_noalias(other.eval()); }
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::false_type&) { _set_noalias(other); }
+
+    /** \internal Like _set() but additionally makes the assumption that no aliasing effect can happen (which
+      * is the case when creating a new matrix) so one can enforce lazy evaluation.
+      *
+      * \sa operator=(const MatrixBase<OtherDerived>&), _set()
+      */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase<OtherDerived>& other)
+    {
+      // I don't think we need this resize call since the lazyAssign will anyways resize
+      // and lazyAssign will be called by the assign selector.
+      //_resize_to_match(other);
+      // the 'false' below means to enforce lazy evaluation. We don't use lazyAssign() because
+      // it wouldn't allow to copy a row-vector into a column-vector.
+      return internal::assign_selector<Derived,OtherDerived,false>::run(this->derived(), other.derived());
+    }
+
+    template<typename T0, typename T1>
+    EIGEN_STRONG_INLINE void _init2(Index nbRows, Index nbCols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
+                          bool(NumTraits<T1>::IsInteger),
+                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
+      resize(nbRows,nbCols);
+    }
+    template<typename T0, typename T1>
+    EIGEN_STRONG_INLINE void _init2(const Scalar& val0, const Scalar& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
+      m_storage.data()[0] = val0;
+      m_storage.data()[1] = val1;
+    }
+
+    template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
+    friend struct internal::matrix_swap_impl;
+
+    /** \internal generic implementation of swap for dense storage since for dynamic-sized matrices of same type it is enough to swap the
+      * data pointers.
+      */
+    template<typename OtherDerived>
+    void _swap(DenseBase<OtherDerived> const & other)
+    {
+      enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
+      internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.const_cast_derived());
+    }
+
+  public:
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    static EIGEN_STRONG_INLINE void _check_template_params()
+    {
+      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor)
+                        && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (Options&RowMajor)==0)
+                        && ((RowsAtCompileTime == Dynamic) || (RowsAtCompileTime >= 0))
+                        && ((ColsAtCompileTime == Dynamic) || (ColsAtCompileTime >= 0))
+                        && ((MaxRowsAtCompileTime == Dynamic) || (MaxRowsAtCompileTime >= 0))
+                        && ((MaxColsAtCompileTime == Dynamic) || (MaxColsAtCompileTime >= 0))
+                        && (MaxRowsAtCompileTime == RowsAtCompileTime || RowsAtCompileTime==Dynamic)
+                        && (MaxColsAtCompileTime == ColsAtCompileTime || ColsAtCompileTime==Dynamic)
+                        && (Options & (DontAlign|RowMajor)) == Options),
+        INVALID_MATRIX_TEMPLATE_PARAMETERS)
+    }
+#endif
+
+private:
+    enum { ThisConstantIsPrivateInPlainObjectBase };
+};
+
+namespace internal {
+
+template <typename Derived, typename OtherDerived, bool IsVector>
+struct conservative_resize_like_impl
+{
+  typedef typename Derived::Index Index;
+  static void run(DenseBase<Derived>& _this, Index rows, Index cols)
+  {
+    if (_this.rows() == rows && _this.cols() == cols) return;
+    EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
+
+    if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
+         (!Derived::IsRowMajor && _this.rows() == rows) )  // column-major and we change only the number of columns
+    {
+      internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols);
+      _this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
+    }
+    else
+    {
+      // The storage order does not allow us to use reallocation.
+      typename Derived::PlainObject tmp(rows,cols);
+      const Index common_rows = (std::min)(rows, _this.rows());
+      const Index common_cols = (std::min)(cols, _this.cols());
+      tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
+      _this.derived().swap(tmp);
+    }
+  }
+
+  static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
+  {
+    if (_this.rows() == other.rows() && _this.cols() == other.cols()) return;
+
+    // Note: Here is space for improvement. Basically, for conservativeResize(Index,Index),
+    // neither RowsAtCompileTime or ColsAtCompileTime must be Dynamic. If only one of the
+    // dimensions is dynamic, one could use either conservativeResize(Index rows, NoChange_t) or
+    // conservativeResize(NoChange_t, Index cols). For these methods new static asserts like
+    // EIGEN_STATIC_ASSERT_DYNAMIC_ROWS and EIGEN_STATIC_ASSERT_DYNAMIC_COLS would be good.
+    EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
+    EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)
+
+    if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
+         (!Derived::IsRowMajor && _this.rows() == other.rows()) )  // column-major and we change only the number of columns
+    {
+      const Index new_rows = other.rows() - _this.rows();
+      const Index new_cols = other.cols() - _this.cols();
+      _this.derived().m_storage.conservativeResize(other.size(),other.rows(),other.cols());
+      if (new_rows>0)
+        _this.bottomRightCorner(new_rows, other.cols()) = other.bottomRows(new_rows);
+      else if (new_cols>0)
+        _this.bottomRightCorner(other.rows(), new_cols) = other.rightCols(new_cols);
+    }
+    else
+    {
+      // The storage order does not allow us to use reallocation.
+      typename Derived::PlainObject tmp(other);
+      const Index common_rows = (std::min)(tmp.rows(), _this.rows());
+      const Index common_cols = (std::min)(tmp.cols(), _this.cols());
+      tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
+      _this.derived().swap(tmp);
+    }
+  }
+};
+
+// Here, the specialization for vectors inherits from the general matrix case
+// to allow calling .conservativeResize(rows,cols) on vectors.
+template <typename Derived, typename OtherDerived>
+struct conservative_resize_like_impl<Derived,OtherDerived,true>
+  : conservative_resize_like_impl<Derived,OtherDerived,false>
+{
+  using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
+  
+  typedef typename Derived::Index Index;
+  static void run(DenseBase<Derived>& _this, Index size)
+  {
+    const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
+    const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
+    _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
+  }
+
+  static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
+  {
+    if (_this.rows() == other.rows() && _this.cols() == other.cols()) return;
+
+    const Index num_new_elements = other.size() - _this.size();
+
+    const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
+    const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
+    _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
+
+    if (num_new_elements > 0)
+      _this.tail(num_new_elements) = other.tail(num_new_elements);
+  }
+};
+
+template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
+struct matrix_swap_impl
+{
+  static inline void run(MatrixTypeA& a, MatrixTypeB& b)
+  {
+    a.base().swap(b);
+  }
+};
+
+template<typename MatrixTypeA, typename MatrixTypeB>
+struct matrix_swap_impl<MatrixTypeA, MatrixTypeB, true>
+{
+  static inline void run(MatrixTypeA& a, MatrixTypeB& b)
+  {
+    static_cast<typename MatrixTypeA::Base&>(a).m_storage.swap(static_cast<typename MatrixTypeB::Base&>(b).m_storage);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_DENSESTORAGEBASE_H
diff --git a/include/eigen3/Eigen/src/Core/ProductBase.h b/include/eigen3/Eigen/src/Core/ProductBase.h
new file mode 100644
index 0000000..cf74470
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/ProductBase.h
@@ -0,0 +1,290 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PRODUCTBASE_H
+#define EIGEN_PRODUCTBASE_H
+
+namespace Eigen { 
+
+/** \class ProductBase
+  * \ingroup Core_Module
+  *
+  */
+
+namespace internal {
+template<typename Derived, typename _Lhs, typename _Rhs>
+struct traits<ProductBase<Derived,_Lhs,_Rhs> >
+{
+  typedef MatrixXpr XprKind;
+  typedef typename remove_all<_Lhs>::type Lhs;
+  typedef typename remove_all<_Rhs>::type Rhs;
+  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind,
+                                           typename traits<Rhs>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<Lhs>::Index,
+                                         typename traits<Rhs>::Index>::type Index;
+  enum {
+    RowsAtCompileTime = traits<Lhs>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<Rhs>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<Lhs>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<Rhs>::MaxColsAtCompileTime,
+    Flags = (MaxRowsAtCompileTime==1 ? RowMajorBit : 0)
+          | EvalBeforeNestingBit | EvalBeforeAssigningBit | NestByRefBit,
+                  // Note that EvalBeforeNestingBit and NestByRefBit
+                  // are not used in practice because nested is overloaded for products
+    CoeffReadCost = 0 // FIXME why is it needed ?
+  };
+};
+}
+
+#define EIGEN_PRODUCT_PUBLIC_INTERFACE(Derived) \
+  typedef ProductBase<Derived, Lhs, Rhs > Base; \
+  EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
+  typedef typename Base::LhsNested LhsNested; \
+  typedef typename Base::_LhsNested _LhsNested; \
+  typedef typename Base::LhsBlasTraits LhsBlasTraits; \
+  typedef typename Base::ActualLhsType ActualLhsType; \
+  typedef typename Base::_ActualLhsType _ActualLhsType; \
+  typedef typename Base::RhsNested RhsNested; \
+  typedef typename Base::_RhsNested _RhsNested; \
+  typedef typename Base::RhsBlasTraits RhsBlasTraits; \
+  typedef typename Base::ActualRhsType ActualRhsType; \
+  typedef typename Base::_ActualRhsType _ActualRhsType; \
+  using Base::m_lhs; \
+  using Base::m_rhs;
+
+template<typename Derived, typename Lhs, typename Rhs>
+class ProductBase : public MatrixBase<Derived>
+{
+  public:
+    typedef MatrixBase<Derived> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(ProductBase)
+    
+    typedef typename Lhs::Nested LhsNested;
+    typedef typename internal::remove_all<LhsNested>::type _LhsNested;
+    typedef internal::blas_traits<_LhsNested> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef typename internal::remove_all<ActualLhsType>::type _ActualLhsType;
+    typedef typename internal::traits<Lhs>::Scalar LhsScalar;
+
+    typedef typename Rhs::Nested RhsNested;
+    typedef typename internal::remove_all<RhsNested>::type _RhsNested;
+    typedef internal::blas_traits<_RhsNested> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type _ActualRhsType;
+    typedef typename internal::traits<Rhs>::Scalar RhsScalar;
+
+    // Diagonal of a product: no need to evaluate the arguments because they are going to be evaluated only once
+    typedef CoeffBasedProduct<LhsNested, RhsNested, 0> FullyLazyCoeffBaseProductType;
+
+  public:
+
+#ifndef EIGEN_NO_MALLOC
+    typedef typename Base::PlainObject BasePlainObject;
+    typedef Matrix<Scalar,RowsAtCompileTime==1?1:Dynamic,ColsAtCompileTime==1?1:Dynamic,BasePlainObject::Options> DynPlainObject;
+    typedef typename internal::conditional<(BasePlainObject::SizeAtCompileTime==Dynamic) || (BasePlainObject::SizeAtCompileTime*int(sizeof(Scalar)) < int(EIGEN_STACK_ALLOCATION_LIMIT)),
+                                           BasePlainObject, DynPlainObject>::type PlainObject;
+#else
+    typedef typename Base::PlainObject PlainObject;
+#endif
+
+    ProductBase(const Lhs& a_lhs, const Rhs& a_rhs)
+      : m_lhs(a_lhs), m_rhs(a_rhs)
+    {
+      eigen_assert(a_lhs.cols() == a_rhs.rows()
+        && "invalid matrix product"
+        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
+    }
+
+    inline Index rows() const { return m_lhs.rows(); }
+    inline Index cols() const { return m_rhs.cols(); }
+
+    template<typename Dest>
+    inline void evalTo(Dest& dst) const { dst.setZero(); scaleAndAddTo(dst,Scalar(1)); }
+
+    template<typename Dest>
+    inline void addTo(Dest& dst) const { scaleAndAddTo(dst,Scalar(1)); }
+
+    template<typename Dest>
+    inline void subTo(Dest& dst) const { scaleAndAddTo(dst,Scalar(-1)); }
+
+    template<typename Dest>
+    inline void scaleAndAddTo(Dest& dst, const Scalar& alpha) const { derived().scaleAndAddTo(dst,alpha); }
+
+    const _LhsNested& lhs() const { return m_lhs; }
+    const _RhsNested& rhs() const { return m_rhs; }
+
+    // Implicit conversion to the nested type (trigger the evaluation of the product)
+    operator const PlainObject& () const
+    {
+      m_result.resize(m_lhs.rows(), m_rhs.cols());
+      derived().evalTo(m_result);
+      return m_result;
+    }
+
+    const Diagonal<const FullyLazyCoeffBaseProductType,0> diagonal() const
+    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs); }
+
+    template<int Index>
+    const Diagonal<FullyLazyCoeffBaseProductType,Index> diagonal() const
+    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs); }
+
+    const Diagonal<FullyLazyCoeffBaseProductType,Dynamic> diagonal(Index index) const
+    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs).diagonal(index); }
+
+    // restrict coeff accessors to 1x1 expressions. No need to care about mutators here since this isnt a Lvalue expression
+    typename Base::CoeffReturnType coeff(Index row, Index col) const
+    {
+#ifdef EIGEN2_SUPPORT
+      return lhs().row(row).cwiseProduct(rhs().col(col).transpose()).sum();
+#else
+      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
+      eigen_assert(this->rows() == 1 && this->cols() == 1);
+      Matrix<Scalar,1,1> result = *this;
+      return result.coeff(row,col);
+#endif
+    }
+
+    typename Base::CoeffReturnType coeff(Index i) const
+    {
+      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
+      eigen_assert(this->rows() == 1 && this->cols() == 1);
+      Matrix<Scalar,1,1> result = *this;
+      return result.coeff(i);
+    }
+
+    const Scalar& coeffRef(Index row, Index col) const
+    {
+      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
+      eigen_assert(this->rows() == 1 && this->cols() == 1);
+      return derived().coeffRef(row,col);
+    }
+
+    const Scalar& coeffRef(Index i) const
+    {
+      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
+      eigen_assert(this->rows() == 1 && this->cols() == 1);
+      return derived().coeffRef(i);
+    }
+
+  protected:
+
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+
+    mutable PlainObject m_result;
+};
+
+// here we need to overload the nested rule for products
+// such that the nested type is a const reference to a plain matrix
+namespace internal {
+template<typename Lhs, typename Rhs, int Mode, int N, typename PlainObject>
+struct nested<GeneralProduct<Lhs,Rhs,Mode>, N, PlainObject>
+{
+  typedef typename GeneralProduct<Lhs,Rhs,Mode>::PlainObject const& type;
+};
+template<typename Lhs, typename Rhs, int Mode, int N, typename PlainObject>
+struct nested<const GeneralProduct<Lhs,Rhs,Mode>, N, PlainObject>
+{
+  typedef typename GeneralProduct<Lhs,Rhs,Mode>::PlainObject const& type;
+};
+}
+
+template<typename NestedProduct>
+class ScaledProduct;
+
+// Note that these two operator* functions are not defined as member
+// functions of ProductBase, because, otherwise we would have to
+// define all overloads defined in MatrixBase. Furthermore, Using
+// "using Base::operator*" would not work with MSVC.
+//
+// Also note that here we accept any compatible scalar types
+template<typename Derived,typename Lhs,typename Rhs>
+const ScaledProduct<Derived>
+operator*(const ProductBase<Derived,Lhs,Rhs>& prod, const typename Derived::Scalar& x)
+{ return ScaledProduct<Derived>(prod.derived(), x); }
+
+template<typename Derived,typename Lhs,typename Rhs>
+typename internal::enable_if<!internal::is_same<typename Derived::Scalar,typename Derived::RealScalar>::value,
+                      const ScaledProduct<Derived> >::type
+operator*(const ProductBase<Derived,Lhs,Rhs>& prod, const typename Derived::RealScalar& x)
+{ return ScaledProduct<Derived>(prod.derived(), x); }
+
+
+template<typename Derived,typename Lhs,typename Rhs>
+const ScaledProduct<Derived>
+operator*(const typename Derived::Scalar& x,const ProductBase<Derived,Lhs,Rhs>& prod)
+{ return ScaledProduct<Derived>(prod.derived(), x); }
+
+template<typename Derived,typename Lhs,typename Rhs>
+typename internal::enable_if<!internal::is_same<typename Derived::Scalar,typename Derived::RealScalar>::value,
+                      const ScaledProduct<Derived> >::type
+operator*(const typename Derived::RealScalar& x,const ProductBase<Derived,Lhs,Rhs>& prod)
+{ return ScaledProduct<Derived>(prod.derived(), x); }
+
+namespace internal {
+template<typename NestedProduct>
+struct traits<ScaledProduct<NestedProduct> >
+ : traits<ProductBase<ScaledProduct<NestedProduct>,
+                         typename NestedProduct::_LhsNested,
+                         typename NestedProduct::_RhsNested> >
+{
+  typedef typename traits<NestedProduct>::StorageKind StorageKind;
+};
+}
+
+template<typename NestedProduct>
+class ScaledProduct
+  : public ProductBase<ScaledProduct<NestedProduct>,
+                       typename NestedProduct::_LhsNested,
+                       typename NestedProduct::_RhsNested>
+{
+  public:
+    typedef ProductBase<ScaledProduct<NestedProduct>,
+                       typename NestedProduct::_LhsNested,
+                       typename NestedProduct::_RhsNested> Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::PlainObject PlainObject;
+//     EIGEN_PRODUCT_PUBLIC_INTERFACE(ScaledProduct)
+
+    ScaledProduct(const NestedProduct& prod, const Scalar& x)
+    : Base(prod.lhs(),prod.rhs()), m_prod(prod), m_alpha(x) {}
+
+    template<typename Dest>
+    inline void evalTo(Dest& dst) const { dst.setZero(); scaleAndAddTo(dst, Scalar(1)); }
+
+    template<typename Dest>
+    inline void addTo(Dest& dst) const { scaleAndAddTo(dst, Scalar(1)); }
+
+    template<typename Dest>
+    inline void subTo(Dest& dst) const { scaleAndAddTo(dst, Scalar(-1)); }
+
+    template<typename Dest>
+    inline void scaleAndAddTo(Dest& dst, const Scalar& a_alpha) const { m_prod.derived().scaleAndAddTo(dst,a_alpha * m_alpha); }
+
+    const Scalar& alpha() const { return m_alpha; }
+    
+  protected:
+    const NestedProduct& m_prod;
+    Scalar m_alpha;
+};
+
+/** \internal
+  * Overloaded to perform an efficient C = (A*B).lazy() */
+template<typename Derived>
+template<typename ProductDerived, typename Lhs, typename Rhs>
+Derived& MatrixBase<Derived>::lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other)
+{
+  other.derived().evalTo(derived());
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_PRODUCTBASE_H
diff --git a/include/eigen3/Eigen/src/Core/Random.h b/include/eigen3/Eigen/src/Core/Random.h
new file mode 100644
index 0000000..480fea4
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Random.h
@@ -0,0 +1,152 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RANDOM_H
+#define EIGEN_RANDOM_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Scalar> struct scalar_random_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_random_op)
+  template<typename Index>
+  inline const Scalar operator() (Index, Index = 0) const { return random<Scalar>(); }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_random_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false, IsRepeatable = false }; };
+
+} // end namespace internal
+
+/** \returns a random matrix expression
+  *
+  * The parameters \a rows and \a cols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this MatrixBase type.
+  *
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so Random() should be used
+  * instead.
+  *
+  * Example: \include MatrixBase_random_int_int.cpp
+  * Output: \verbinclude MatrixBase_random_int_int.out
+  *
+  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
+  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
+  * behavior with expressions involving random matrices.
+  *
+  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index), MatrixBase::Random()
+  */
+template<typename Derived>
+inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+DenseBase<Derived>::Random(Index rows, Index cols)
+{
+  return NullaryExpr(rows, cols, internal::scalar_random_op<Scalar>());
+}
+
+/** \returns a random vector expression
+  *
+  * The parameter \a size is the size of the returned vector.
+  * Must be compatible with this MatrixBase type.
+  *
+  * \only_for_vectors
+  *
+  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+  * it is redundant to pass \a size as argument, so Random() should be used
+  * instead.
+  *
+  * Example: \include MatrixBase_random_int.cpp
+  * Output: \verbinclude MatrixBase_random_int.out
+  *
+  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
+  * a temporary vector whenever it is nested in a larger expression. This prevents unexpected
+  * behavior with expressions involving random matrices.
+  *
+  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index,Index), MatrixBase::Random()
+  */
+template<typename Derived>
+inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+DenseBase<Derived>::Random(Index size)
+{
+  return NullaryExpr(size, internal::scalar_random_op<Scalar>());
+}
+
+/** \returns a fixed-size random matrix or vector expression
+  *
+  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+  * need to use the variants taking size arguments.
+  *
+  * Example: \include MatrixBase_random.cpp
+  * Output: \verbinclude MatrixBase_random.out
+  *
+  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
+  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
+  * behavior with expressions involving random matrices.
+  *
+  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index,Index), MatrixBase::Random(Index)
+  */
+template<typename Derived>
+inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+DenseBase<Derived>::Random()
+{
+  return NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_random_op<Scalar>());
+}
+
+/** Sets all coefficients in this expression to random values.
+  *
+  * Example: \include MatrixBase_setRandom.cpp
+  * Output: \verbinclude MatrixBase_setRandom.out
+  *
+  * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
+  */
+template<typename Derived>
+inline Derived& DenseBase<Derived>::setRandom()
+{
+  return *this = Random(rows(), cols());
+}
+
+/** Resizes to the given \a newSize, and sets all coefficients in this expression to random values.
+  *
+  * \only_for_vectors
+  *
+  * Example: \include Matrix_setRandom_int.cpp
+  * Output: \verbinclude Matrix_setRandom_int.out
+  *
+  * \sa MatrixBase::setRandom(), setRandom(Index,Index), class CwiseNullaryOp, MatrixBase::Random()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setRandom(Index newSize)
+{
+  resize(newSize);
+  return setRandom();
+}
+
+/** Resizes to the given size, and sets all coefficients in this expression to random values.
+  *
+  * \param nbRows the new number of rows
+  * \param nbCols the new number of columns
+  *
+  * Example: \include Matrix_setRandom_int_int.cpp
+  * Output: \verbinclude Matrix_setRandom_int_int.out
+  *
+  * \sa MatrixBase::setRandom(), setRandom(Index), class CwiseNullaryOp, MatrixBase::Random()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setRandom(Index nbRows, Index nbCols)
+{
+  resize(nbRows, nbCols);
+  return setRandom();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_RANDOM_H
diff --git a/include/eigen3/Eigen/src/Core/Redux.h b/include/eigen3/Eigen/src/Core/Redux.h
new file mode 100644
index 0000000..50548fa
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Redux.h
@@ -0,0 +1,408 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REDUX_H
+#define EIGEN_REDUX_H
+
+namespace Eigen { 
+
+namespace internal {
+
+// TODO
+//  * implement other kind of vectorization
+//  * factorize code
+
+/***************************************************************************
+* Part 1 : the logic deciding a strategy for vectorization and unrolling
+***************************************************************************/
+
+template<typename Func, typename Derived>
+struct redux_traits
+{
+public:
+  enum {
+    PacketSize = packet_traits<typename Derived::Scalar>::size,
+    InnerMaxSize = int(Derived::IsRowMajor)
+                 ? Derived::MaxColsAtCompileTime
+                 : Derived::MaxRowsAtCompileTime
+  };
+
+  enum {
+    MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
+                  && (functor_traits<Func>::PacketAccess),
+    MayLinearVectorize = MightVectorize && (int(Derived::Flags)&LinearAccessBit),
+    MaySliceVectorize  = MightVectorize && int(InnerMaxSize)>=3*PacketSize
+  };
+
+public:
+  enum {
+    Traversal = int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
+              : int(MaySliceVectorize)  ? int(SliceVectorizedTraversal)
+                                        : int(DefaultTraversal)
+  };
+
+public:
+  enum {
+    Cost = (  Derived::SizeAtCompileTime == Dynamic
+           || Derived::CoeffReadCost == Dynamic
+           || (Derived::SizeAtCompileTime!=1 && functor_traits<Func>::Cost == Dynamic)
+           ) ? Dynamic
+           : Derived::SizeAtCompileTime * Derived::CoeffReadCost
+               + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+    UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
+  };
+
+public:
+  enum {
+    Unrolling = Cost != Dynamic && Cost <= UnrollingLimit
+              ? CompleteUnrolling
+              : NoUnrolling
+  };
+};
+
+/***************************************************************************
+* Part 2 : unrollers
+***************************************************************************/
+
+/*** no vectorization ***/
+
+template<typename Func, typename Derived, int Start, int Length>
+struct redux_novec_unroller
+{
+  enum {
+    HalfLength = Length/2
+  };
+
+  typedef typename Derived::Scalar Scalar;
+
+  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  {
+    return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
+                redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
+  }
+};
+
+template<typename Func, typename Derived, int Start>
+struct redux_novec_unroller<Func, Derived, Start, 1>
+{
+  enum {
+    outer = Start / Derived::InnerSizeAtCompileTime,
+    inner = Start % Derived::InnerSizeAtCompileTime
+  };
+
+  typedef typename Derived::Scalar Scalar;
+
+  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
+  {
+    return mat.coeffByOuterInner(outer, inner);
+  }
+};
+
+// This is actually dead code and will never be called. It is required
+// to prevent false warnings regarding failed inlining though
+// for 0 length run() will never be called at all.
+template<typename Func, typename Derived, int Start>
+struct redux_novec_unroller<Func, Derived, Start, 0>
+{
+  typedef typename Derived::Scalar Scalar;
+  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
+};
+
+/*** vectorization ***/
+
+template<typename Func, typename Derived, int Start, int Length>
+struct redux_vec_unroller
+{
+  enum {
+    PacketSize = packet_traits<typename Derived::Scalar>::size,
+    HalfLength = Length/2
+  };
+
+  typedef typename Derived::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+
+  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
+  {
+    return func.packetOp(
+            redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
+            redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
+  }
+};
+
+template<typename Func, typename Derived, int Start>
+struct redux_vec_unroller<Func, Derived, Start, 1>
+{
+  enum {
+    index = Start * packet_traits<typename Derived::Scalar>::size,
+    outer = index / int(Derived::InnerSizeAtCompileTime),
+    inner = index % int(Derived::InnerSizeAtCompileTime),
+    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
+  };
+
+  typedef typename Derived::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+
+  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
+  {
+    return mat.template packetByOuterInner<alignment>(outer, inner);
+  }
+};
+
+/***************************************************************************
+* Part 3 : implementation of all cases
+***************************************************************************/
+
+template<typename Func, typename Derived,
+         int Traversal = redux_traits<Func, Derived>::Traversal,
+         int Unrolling = redux_traits<Func, Derived>::Unrolling
+>
+struct redux_impl;
+
+template<typename Func, typename Derived>
+struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Index Index;
+  static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
+  {
+    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    Scalar res;
+    res = mat.coeffByOuterInner(0, 0);
+    for(Index i = 1; i < mat.innerSize(); ++i)
+      res = func(res, mat.coeffByOuterInner(0, i));
+    for(Index i = 1; i < mat.outerSize(); ++i)
+      for(Index j = 0; j < mat.innerSize(); ++j)
+        res = func(res, mat.coeffByOuterInner(i, j));
+    return res;
+  }
+};
+
+template<typename Func, typename Derived>
+struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
+  : public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
+{};
+
+template<typename Func, typename Derived>
+struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+  typedef typename Derived::Index Index;
+
+  static Scalar run(const Derived& mat, const Func& func)
+  {
+    const Index size = mat.size();
+    eigen_assert(size && "you are using an empty matrix");
+    const Index packetSize = packet_traits<Scalar>::size;
+    const Index alignedStart = internal::first_aligned(mat);
+    enum {
+      alignment = bool(Derived::Flags & DirectAccessBit) || bool(Derived::Flags & AlignedBit)
+                ? Aligned : Unaligned
+    };
+    const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
+    const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
+    const Index alignedEnd2 = alignedStart + alignedSize2;
+    const Index alignedEnd  = alignedStart + alignedSize;
+    Scalar res;
+    if(alignedSize)
+    {
+      PacketScalar packet_res0 = mat.template packet<alignment>(alignedStart);
+      if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
+      {
+        PacketScalar packet_res1 = mat.template packet<alignment>(alignedStart+packetSize);
+        for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
+        {
+          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment>(index));
+          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment>(index+packetSize));
+        }
+
+        packet_res0 = func.packetOp(packet_res0,packet_res1);
+        if(alignedEnd>alignedEnd2)
+          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment>(alignedEnd2));
+      }
+      res = func.predux(packet_res0);
+
+      for(Index index = 0; index < alignedStart; ++index)
+        res = func(res,mat.coeff(index));
+
+      for(Index index = alignedEnd; index < size; ++index)
+        res = func(res,mat.coeff(index));
+    }
+    else // too small to vectorize anything.
+         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
+    {
+      res = mat.coeff(0);
+      for(Index index = 1; index < size; ++index)
+        res = func(res,mat.coeff(index));
+    }
+
+    return res;
+  }
+};
+
+template<typename Func, typename Derived>
+struct redux_impl<Func, Derived, SliceVectorizedTraversal, NoUnrolling>
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+  typedef typename Derived::Index Index;
+
+  static Scalar run(const Derived& mat, const Func& func)
+  {
+    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    const Index innerSize = mat.innerSize();
+    const Index outerSize = mat.outerSize();
+    enum {
+      packetSize = packet_traits<Scalar>::size
+    };
+    const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
+    Scalar res;
+    if(packetedInnerSize)
+    {
+      PacketScalar packet_res = mat.template packet<Unaligned>(0,0);
+      for(Index j=0; j<outerSize; ++j)
+        for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
+          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned>(j,i));
+
+      res = func.predux(packet_res);
+      for(Index j=0; j<outerSize; ++j)
+        for(Index i=packetedInnerSize; i<innerSize; ++i)
+          res = func(res, mat.coeffByOuterInner(j,i));
+    }
+    else // too small to vectorize anything.
+         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
+    {
+      res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
+    }
+
+    return res;
+  }
+};
+
+template<typename Func, typename Derived>
+struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+  enum {
+    PacketSize = packet_traits<Scalar>::size,
+    Size = Derived::SizeAtCompileTime,
+    VectorizedSize = (Size / PacketSize) * PacketSize
+  };
+  static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
+  {
+    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+    if (VectorizedSize != Size)
+      res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+    return res;
+  }
+};
+
+} // end namespace internal
+
+/***************************************************************************
+* Part 4 : public API
+***************************************************************************/
+
+
+/** \returns the result of a full redux operation on the whole matrix or vector using \a func
+  *
+  * The template parameter \a BinaryOp is the type of the functor \a func which must be
+  * an associative operator. Both current STL and TR1 functor styles are handled.
+  *
+  * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
+  */
+template<typename Derived>
+template<typename Func>
+EIGEN_STRONG_INLINE typename internal::result_of<Func(typename internal::traits<Derived>::Scalar)>::type
+DenseBase<Derived>::redux(const Func& func) const
+{
+  typedef typename internal::remove_all<typename Derived::Nested>::type ThisNested;
+  return internal::redux_impl<Func, ThisNested>
+            ::run(derived(), func);
+}
+
+/** \returns the minimum of all coefficients of \c *this.
+  * \warning the result is undefined if \c *this contains NaN.
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::minCoeff() const
+{
+  return this->redux(Eigen::internal::scalar_min_op<Scalar>());
+}
+
+/** \returns the maximum of all coefficients of \c *this.
+  * \warning the result is undefined if \c *this contains NaN.
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::maxCoeff() const
+{
+  return this->redux(Eigen::internal::scalar_max_op<Scalar>());
+}
+
+/** \returns the sum of all coefficients of *this
+  *
+  * \sa trace(), prod(), mean()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::sum() const
+{
+  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
+    return Scalar(0);
+  return this->redux(Eigen::internal::scalar_sum_op<Scalar>());
+}
+
+/** \returns the mean of all coefficients of *this
+*
+* \sa trace(), prod(), sum()
+*/
+template<typename Derived>
+EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::mean() const
+{
+  return Scalar(this->redux(Eigen::internal::scalar_sum_op<Scalar>())) / Scalar(this->size());
+}
+
+/** \returns the product of all coefficients of *this
+  *
+  * Example: \include MatrixBase_prod.cpp
+  * Output: \verbinclude MatrixBase_prod.out
+  *
+  * \sa sum(), mean(), trace()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::prod() const
+{
+  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
+    return Scalar(1);
+  return this->redux(Eigen::internal::scalar_product_op<Scalar>());
+}
+
+/** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.
+  *
+  * \c *this can be any matrix, not necessarily square.
+  *
+  * \sa diagonal(), sum()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+MatrixBase<Derived>::trace() const
+{
+  return derived().diagonal().sum();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_REDUX_H
diff --git a/include/eigen3/Eigen/src/Core/Ref.h b/include/eigen3/Eigen/src/Core/Ref.h
new file mode 100644
index 0000000..df87b1a
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Ref.h
@@ -0,0 +1,263 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REF_H
+#define EIGEN_REF_H
+
+namespace Eigen { 
+
+template<typename Derived> class RefBase;
+template<typename PlainObjectType, int Options = 0,
+         typename StrideType = typename internal::conditional<PlainObjectType::IsVectorAtCompileTime,InnerStride<1>,OuterStride<> >::type > class Ref;
+
+/** \class Ref
+  * \ingroup Core_Module
+  *
+  * \brief A matrix or vector expression mapping an existing expressions
+  *
+  * \tparam PlainObjectType the equivalent matrix type of the mapped data
+  * \tparam Options specifies whether the pointer is \c #Aligned, or \c #Unaligned.
+  *                The default is \c #Unaligned.
+  * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
+  *                   but accept a variable outer stride (leading dimension).
+  *                   This can be overridden by specifying strides.
+  *                   The type passed here must be a specialization of the Stride template, see examples below.
+  *
+  * This class permits to write non template functions taking Eigen's object as parameters while limiting the number of copies.
+  * A Ref<> object can represent either a const expression or a l-value:
+  * \code
+  * // in-out argument:
+  * void foo1(Ref<VectorXf> x);
+  *
+  * // read-only const argument:
+  * void foo2(const Ref<const VectorXf>& x);
+  * \endcode
+  *
+  * In the in-out case, the input argument must satisfies the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
+  * By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout.
+  * Likewise, a Ref<MatrixXf> can reference any column major dense matrix expression of float whose column's elements are contiguously stored with
+  * the possibility to have a constant space inbetween each column, i.e.: the inner stride mmust be equal to 1, but the outer-stride (or leading dimension),
+  * can be greater than the number of rows.
+  *
+  * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function.
+  * Here are some examples:
+  * \code
+  * MatrixXf A;
+  * VectorXf a;
+  * foo1(a.head());             // OK
+  * foo1(A.col());              // OK
+  * foo1(A.row());              // compilation error because here innerstride!=1
+  * foo2(A.row());              // The row is copied into a contiguous temporary
+  * foo2(2*a);                  // The expression is evaluated into a temporary
+  * foo2(A.col().segment(2,4)); // No temporary
+  * \endcode
+  *
+  * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameter.
+  * Here is an example accepting an innerstride!=1:
+  * \code
+  * // in-out argument:
+  * void foo3(Ref<VectorXf,0,InnerStride<> > x);
+  * foo3(A.row());              // OK
+  * \endcode
+  * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involved more
+  * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overloads internally calling a
+  * template function, e.g.:
+  * \code
+  * // in the .h:
+  * void foo(const Ref<MatrixXf>& A);
+  * void foo(const Ref<MatrixXf,0,Stride<> >& A);
+  *
+  * // in the .cpp:
+  * template<typename TypeOfA> void foo_impl(const TypeOfA& A) {
+  *     ... // crazy code goes here
+  * }
+  * void foo(const Ref<MatrixXf>& A) { foo_impl(A); }
+  * void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); }
+  * \endcode
+  *
+  *
+  * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
+  */
+
+namespace internal {
+
+template<typename _PlainObjectType, int _Options, typename _StrideType>
+struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
+  : public traits<Map<_PlainObjectType, _Options, _StrideType> >
+{
+  typedef _PlainObjectType PlainObjectType;
+  typedef _StrideType StrideType;
+  enum {
+    Options = _Options,
+    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit
+  };
+
+  template<typename Derived> struct match {
+    enum {
+      HasDirectAccess = internal::has_direct_access<Derived>::ret,
+      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
+      InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
+                      || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
+                      || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
+      OuterStrideMatch = Derived::IsVectorAtCompileTime
+                      || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
+      AlignmentMatch = (_Options!=Aligned) || ((PlainObjectType::Flags&AlignedBit)==0) || ((traits<Derived>::Flags&AlignedBit)==AlignedBit),
+      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch
+    };
+    typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
+  };
+  
+};
+
+template<typename Derived>
+struct traits<RefBase<Derived> > : public traits<Derived> {};
+
+}
+
+template<typename Derived> class RefBase
+ : public MapBase<Derived>
+{
+  typedef typename internal::traits<Derived>::PlainObjectType PlainObjectType;
+  typedef typename internal::traits<Derived>::StrideType StrideType;
+
+public:
+
+  typedef MapBase<Derived> Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
+
+  inline Index innerStride() const
+  {
+    return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
+  }
+
+  inline Index outerStride() const
+  {
+    return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
+         : IsVectorAtCompileTime ? this->size()
+         : int(Flags)&RowMajorBit ? this->cols()
+         : this->rows();
+  }
+
+  RefBase()
+    : Base(0,RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime),
+      // Stride<> does not allow default ctor for Dynamic strides, so let' initialize it with dummy values:
+      m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime,
+               StrideType::InnerStrideAtCompileTime==Dynamic?0:StrideType::InnerStrideAtCompileTime)
+  {}
+  
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(RefBase)
+
+protected:
+
+  typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;
+
+  template<typename Expression>
+  void construct(Expression& expr)
+  {
+    if(PlainObjectType::RowsAtCompileTime==1)
+    {
+      eigen_assert(expr.rows()==1 || expr.cols()==1);
+      ::new (static_cast<Base*>(this)) Base(expr.data(), 1, expr.size());
+    }
+    else if(PlainObjectType::ColsAtCompileTime==1)
+    {
+      eigen_assert(expr.rows()==1 || expr.cols()==1);
+      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.size(), 1);
+    }
+    else
+      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.rows(), expr.cols());
+    
+    if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit)))
+      ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1);
+    else
+      ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
+                                   StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());    
+  }
+
+  StrideBase m_stride;
+};
+
+
+template<typename PlainObjectType, int Options, typename StrideType> class Ref
+  : public RefBase<Ref<PlainObjectType, Options, StrideType> >
+{
+    typedef internal::traits<Ref> Traits;
+    template<typename Derived>
+    inline Ref(const PlainObjectBase<Derived>& expr);
+  public:
+
+    typedef RefBase<Ref> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
+
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename Derived>
+    inline Ref(PlainObjectBase<Derived>& expr)
+    {
+      EIGEN_STATIC_ASSERT(static_cast<bool>(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      Base::construct(expr.derived());
+    }
+    template<typename Derived>
+    inline Ref(const DenseBase<Derived>& expr)
+    #else
+    template<typename Derived>
+    inline Ref(DenseBase<Derived>& expr)
+    #endif
+    {
+      EIGEN_STATIC_ASSERT(static_cast<bool>(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT(static_cast<bool>(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      enum { THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY = Derived::ThisConstantIsPrivateInPlainObjectBase};
+      Base::construct(expr.const_cast_derived());
+    }
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref)
+
+};
+
+// this is the const ref version
+template<typename TPlainObjectType, int Options, typename StrideType> class Ref<const TPlainObjectType, Options, StrideType>
+  : public RefBase<Ref<const TPlainObjectType, Options, StrideType> >
+{
+    typedef internal::traits<Ref> Traits;
+  public:
+
+    typedef RefBase<Ref> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
+
+    template<typename Derived>
+    inline Ref(const DenseBase<Derived>& expr)
+    {
+//      std::cout << match_helper<Derived>::HasDirectAccess << "," << match_helper<Derived>::OuterStrideMatch << "," << match_helper<Derived>::InnerStrideMatch << "\n";
+//      std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n";
+//      std::cout << int(StrideType::InnerStrideAtCompileTime) << " - " << int(Derived::InnerStrideAtCompileTime) << "\n";
+      construct(expr.derived(), typename Traits::template match<Derived>::type());
+    }
+
+  protected:
+
+    template<typename Expression>
+    void construct(const Expression& expr,internal::true_type)
+    {
+      Base::construct(expr);
+    }
+
+    template<typename Expression>
+    void construct(const Expression& expr, internal::false_type)
+    {
+      m_object.lazyAssign(expr);
+      Base::construct(m_object);
+    }
+
+  protected:
+    TPlainObjectType m_object;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_REF_H
diff --git a/include/eigen3/Eigen/src/Core/Replicate.h b/include/eigen3/Eigen/src/Core/Replicate.h
new file mode 100644
index 0000000..ac4537c
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Replicate.h
@@ -0,0 +1,177 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REPLICATE_H
+#define EIGEN_REPLICATE_H
+
+namespace Eigen { 
+
+/**
+  * \class Replicate
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the multiple replication of a matrix or vector
+  *
+  * \param MatrixType the type of the object we are replicating
+  *
+  * This class represents an expression of the multiple replication of a matrix or vector.
+  * It is the return type of DenseBase::replicate() and most of the time
+  * this is the only way it is used.
+  *
+  * \sa DenseBase::replicate()
+  */
+
+namespace internal {
+template<typename MatrixType,int RowFactor,int ColFactor>
+struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
+ : traits<MatrixType>
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename traits<MatrixType>::StorageKind StorageKind;
+  typedef typename traits<MatrixType>::XprKind XprKind;
+  enum {
+    Factor = (RowFactor==Dynamic || ColFactor==Dynamic) ? Dynamic : RowFactor*ColFactor
+  };
+  typedef typename nested<MatrixType,Factor>::type MatrixTypeNested;
+  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
+  enum {
+    RowsAtCompileTime = RowFactor==Dynamic || int(MatrixType::RowsAtCompileTime)==Dynamic
+                      ? Dynamic
+                      : RowFactor * MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = ColFactor==Dynamic || int(MatrixType::ColsAtCompileTime)==Dynamic
+                      ? Dynamic
+                      : ColFactor * MatrixType::ColsAtCompileTime,
+   //FIXME we don't propagate the max sizes !!!
+    MaxRowsAtCompileTime = RowsAtCompileTime,
+    MaxColsAtCompileTime = ColsAtCompileTime,
+    IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1
+               : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0
+               : (MatrixType::Flags & RowMajorBit) ? 1 : 0,
+    Flags = (_MatrixTypeNested::Flags & HereditaryBits & ~RowMajorBit) | (IsRowMajor ? RowMajorBit : 0),
+    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+  };
+};
+}
+
+template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
+  : public internal::dense_xpr_base< Replicate<MatrixType,RowFactor,ColFactor> >::type
+{
+    typedef typename internal::traits<Replicate>::MatrixTypeNested MatrixTypeNested;
+    typedef typename internal::traits<Replicate>::_MatrixTypeNested _MatrixTypeNested;
+  public:
+
+    typedef typename internal::dense_xpr_base<Replicate>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Replicate)
+
+    template<typename OriginalMatrixType>
+    inline explicit Replicate(const OriginalMatrixType& a_matrix)
+      : m_matrix(a_matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor)
+    {
+      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
+                          THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
+      eigen_assert(RowFactor!=Dynamic && ColFactor!=Dynamic);
+    }
+
+    template<typename OriginalMatrixType>
+    inline Replicate(const OriginalMatrixType& a_matrix, Index rowFactor, Index colFactor)
+      : m_matrix(a_matrix), m_rowFactor(rowFactor), m_colFactor(colFactor)
+    {
+      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
+                          THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
+    }
+
+    inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
+    inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
+
+    inline Scalar coeff(Index rowId, Index colId) const
+    {
+      // try to avoid using modulo; this is a pure optimization strategy
+      const Index actual_row  = internal::traits<MatrixType>::RowsAtCompileTime==1 ? 0
+                            : RowFactor==1 ? rowId
+                            : rowId%m_matrix.rows();
+      const Index actual_col  = internal::traits<MatrixType>::ColsAtCompileTime==1 ? 0
+                            : ColFactor==1 ? colId
+                            : colId%m_matrix.cols();
+
+      return m_matrix.coeff(actual_row, actual_col);
+    }
+    template<int LoadMode>
+    inline PacketScalar packet(Index rowId, Index colId) const
+    {
+      const Index actual_row  = internal::traits<MatrixType>::RowsAtCompileTime==1 ? 0
+                            : RowFactor==1 ? rowId
+                            : rowId%m_matrix.rows();
+      const Index actual_col  = internal::traits<MatrixType>::ColsAtCompileTime==1 ? 0
+                            : ColFactor==1 ? colId
+                            : colId%m_matrix.cols();
+
+      return m_matrix.template packet<LoadMode>(actual_row, actual_col);
+    }
+
+    const _MatrixTypeNested& nestedExpression() const
+    { 
+      return m_matrix; 
+    }
+
+  protected:
+    MatrixTypeNested m_matrix;
+    const internal::variable_if_dynamic<Index, RowFactor> m_rowFactor;
+    const internal::variable_if_dynamic<Index, ColFactor> m_colFactor;
+};
+
+/**
+  * \return an expression of the replication of \c *this
+  *
+  * Example: \include MatrixBase_replicate.cpp
+  * Output: \verbinclude MatrixBase_replicate.out
+  *
+  * \sa VectorwiseOp::replicate(), DenseBase::replicate(Index,Index), class Replicate
+  */
+template<typename Derived>
+template<int RowFactor, int ColFactor>
+const Replicate<Derived,RowFactor,ColFactor>
+DenseBase<Derived>::replicate() const
+{
+  return Replicate<Derived,RowFactor,ColFactor>(derived());
+}
+
+/**
+  * \return an expression of the replication of \c *this
+  *
+  * Example: \include MatrixBase_replicate_int_int.cpp
+  * Output: \verbinclude MatrixBase_replicate_int_int.out
+  *
+  * \sa VectorwiseOp::replicate(), DenseBase::replicate<int,int>(), class Replicate
+  */
+template<typename Derived>
+const typename DenseBase<Derived>::ReplicateReturnType
+DenseBase<Derived>::replicate(Index rowFactor,Index colFactor) const
+{
+  return Replicate<Derived,Dynamic,Dynamic>(derived(),rowFactor,colFactor);
+}
+
+/**
+  * \return an expression of the replication of each column (or row) of \c *this
+  *
+  * Example: \include DirectionWise_replicate_int.cpp
+  * Output: \verbinclude DirectionWise_replicate_int.out
+  *
+  * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
+  */
+template<typename ExpressionType, int Direction>
+const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
+VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
+{
+  return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
+          (_expression(),Direction==Vertical?factor:1,Direction==Horizontal?factor:1);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_REPLICATE_H
diff --git a/include/eigen3/Eigen/src/Core/ReturnByValue.h b/include/eigen3/Eigen/src/Core/ReturnByValue.h
new file mode 100644
index 0000000..d66c24b
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/ReturnByValue.h
@@ -0,0 +1,88 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RETURNBYVALUE_H
+#define EIGEN_RETURNBYVALUE_H
+
+namespace Eigen {
+
+/** \class ReturnByValue
+  * \ingroup Core_Module
+  *
+  */
+
+namespace internal {
+
+template<typename Derived>
+struct traits<ReturnByValue<Derived> >
+  : public traits<typename traits<Derived>::ReturnType>
+{
+  enum {
+    // We're disabling the DirectAccess because e.g. the constructor of
+    // the Block-with-DirectAccess expression requires to have a coeffRef method.
+    // Also, we don't want to have to implement the stride stuff.
+    Flags = (traits<typename traits<Derived>::ReturnType>::Flags
+             | EvalBeforeNestingBit) & ~DirectAccessBit
+  };
+};
+
+/* The ReturnByValue object doesn't even have a coeff() method.
+ * So the only way that nesting it in an expression can work, is by evaluating it into a plain matrix.
+ * So internal::nested always gives the plain return matrix type.
+ *
+ * FIXME: I don't understand why we need this specialization: isn't this taken care of by the EvalBeforeNestingBit ??
+ */
+template<typename Derived,int n,typename PlainObject>
+struct nested<ReturnByValue<Derived>, n, PlainObject>
+{
+  typedef typename traits<Derived>::ReturnType type;
+};
+
+} // end namespace internal
+
+template<typename Derived> class ReturnByValue
+  : internal::no_assignment_operator, public internal::dense_xpr_base< ReturnByValue<Derived> >::type
+{
+  public:
+    typedef typename internal::traits<Derived>::ReturnType ReturnType;
+
+    typedef typename internal::dense_xpr_base<ReturnByValue>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(ReturnByValue)
+
+    template<typename Dest>
+    inline void evalTo(Dest& dst) const
+    { static_cast<const Derived*>(this)->evalTo(dst); }
+    inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
+    inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+#define Unusable YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT
+    class Unusable{
+      Unusable(const Unusable&) {}
+      Unusable& operator=(const Unusable&) {return *this;}
+    };
+    const Unusable& coeff(Index) const { return *reinterpret_cast<const Unusable*>(this); }
+    const Unusable& coeff(Index,Index) const { return *reinterpret_cast<const Unusable*>(this); }
+    Unusable& coeffRef(Index) { return *reinterpret_cast<Unusable*>(this); }
+    Unusable& coeffRef(Index,Index) { return *reinterpret_cast<Unusable*>(this); }
+#endif
+};
+
+template<typename Derived>
+template<typename OtherDerived>
+Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+{
+  other.evalTo(derived());
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_RETURNBYVALUE_H
diff --git a/include/eigen3/Eigen/src/Core/Reverse.h b/include/eigen3/Eigen/src/Core/Reverse.h
new file mode 100644
index 0000000..e30ae3d
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Reverse.h
@@ -0,0 +1,224 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2009 Ricard Marxer <email at ricardmarxer.com>
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REVERSE_H
+#define EIGEN_REVERSE_H
+
+namespace Eigen { 
+
+/** \class Reverse
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the reverse of a vector or matrix
+  *
+  * \param MatrixType the type of the object of which we are taking the reverse
+  *
+  * This class represents an expression of the reverse of a vector.
+  * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::reverse(), VectorwiseOp::reverse()
+  */
+
+namespace internal {
+
+template<typename MatrixType, int Direction>
+struct traits<Reverse<MatrixType, Direction> >
+ : traits<MatrixType>
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename traits<MatrixType>::StorageKind StorageKind;
+  typedef typename traits<MatrixType>::XprKind XprKind;
+  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+
+    // let's enable LinearAccess only with vectorization because of the product overhead
+    LinearAccess = ( (Direction==BothDirections) && (int(_MatrixTypeNested::Flags)&PacketAccessBit) )
+                 ? LinearAccessBit : 0,
+
+    Flags = int(_MatrixTypeNested::Flags) & (HereditaryBits | LvalueBit | PacketAccessBit | LinearAccess),
+
+    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+  };
+};
+
+template<typename PacketScalar, bool ReversePacket> struct reverse_packet_cond
+{
+  static inline PacketScalar run(const PacketScalar& x) { return preverse(x); }
+};
+
+template<typename PacketScalar> struct reverse_packet_cond<PacketScalar,false>
+{
+  static inline PacketScalar run(const PacketScalar& x) { return x; }
+};
+
+} // end namespace internal 
+
+template<typename MatrixType, int Direction> class Reverse
+  : public internal::dense_xpr_base< Reverse<MatrixType, Direction> >::type
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<Reverse>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Reverse)
+    using Base::IsRowMajor;
+
+    // next line is necessary because otherwise const version of operator()
+    // is hidden by non-const version defined in this file
+    using Base::operator(); 
+
+  protected:
+    enum {
+      PacketSize = internal::packet_traits<Scalar>::size,
+      IsColMajor = !IsRowMajor,
+      ReverseRow = (Direction == Vertical)   || (Direction == BothDirections),
+      ReverseCol = (Direction == Horizontal) || (Direction == BothDirections),
+      OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
+      OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1,
+      ReversePacket = (Direction == BothDirections)
+                    || ((Direction == Vertical)   && IsColMajor)
+                    || ((Direction == Horizontal) && IsRowMajor)
+    };
+    typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;
+  public:
+
+    inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
+
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }
+
+    inline Index innerStride() const
+    {
+      return -m_matrix.innerStride();
+    }
+
+    inline Scalar& operator()(Index row, Index col)
+    {
+      eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
+      return coeffRef(row, col);
+    }
+
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      return m_matrix.const_cast_derived().coeffRef(ReverseRow ? m_matrix.rows() - row - 1 : row,
+                                                    ReverseCol ? m_matrix.cols() - col - 1 : col);
+    }
+
+    inline CoeffReturnType coeff(Index row, Index col) const
+    {
+      return m_matrix.coeff(ReverseRow ? m_matrix.rows() - row - 1 : row,
+                            ReverseCol ? m_matrix.cols() - col - 1 : col);
+    }
+
+    inline CoeffReturnType coeff(Index index) const
+    {
+      return m_matrix.coeff(m_matrix.size() - index - 1);
+    }
+
+    inline Scalar& coeffRef(Index index)
+    {
+      return m_matrix.const_cast_derived().coeffRef(m_matrix.size() - index - 1);
+    }
+
+    inline Scalar& operator()(Index index)
+    {
+      eigen_assert(index >= 0 && index < m_matrix.size());
+      return coeffRef(index);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index row, Index col) const
+    {
+      return reverse_packet::run(m_matrix.template packet<LoadMode>(
+                                    ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
+                                    ReverseCol ? m_matrix.cols() - col - OffsetCol : col));
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index row, Index col, const PacketScalar& x)
+    {
+      m_matrix.const_cast_derived().template writePacket<LoadMode>(
+                                      ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
+                                      ReverseCol ? m_matrix.cols() - col - OffsetCol : col,
+                                      reverse_packet::run(x));
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index index) const
+    {
+      return internal::preverse(m_matrix.template packet<LoadMode>( m_matrix.size() - index - PacketSize ));
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index index, const PacketScalar& x)
+    {
+      m_matrix.const_cast_derived().template writePacket<LoadMode>(m_matrix.size() - index - PacketSize, internal::preverse(x));
+    }
+
+    const typename internal::remove_all<typename MatrixType::Nested>::type& 
+    nestedExpression() const 
+    {
+      return m_matrix;
+    }
+
+  protected:
+    typename MatrixType::Nested m_matrix;
+};
+
+/** \returns an expression of the reverse of *this.
+  *
+  * Example: \include MatrixBase_reverse.cpp
+  * Output: \verbinclude MatrixBase_reverse.out
+  *
+  */
+template<typename Derived>
+inline typename DenseBase<Derived>::ReverseReturnType
+DenseBase<Derived>::reverse()
+{
+  return derived();
+}
+
+/** This is the const version of reverse(). */
+template<typename Derived>
+inline const typename DenseBase<Derived>::ConstReverseReturnType
+DenseBase<Derived>::reverse() const
+{
+  return derived();
+}
+
+/** This is the "in place" version of reverse: it reverses \c *this.
+  *
+  * In most cases it is probably better to simply use the reversed expression
+  * of a matrix. However, when reversing the matrix data itself is really needed,
+  * then this "in-place" version is probably the right choice because it provides
+  * the following additional features:
+  *  - less error prone: doing the same operation with .reverse() requires special care:
+  *    \code m = m.reverse().eval(); \endcode
+  *  - this API allows to avoid creating a temporary (the current implementation creates a temporary, but that could be avoided using swap)
+  *  - it allows future optimizations (cache friendliness, etc.)
+  *
+  * \sa reverse() */
+template<typename Derived>
+inline void DenseBase<Derived>::reverseInPlace()
+{
+  derived() = derived().reverse().eval();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_REVERSE_H
diff --git a/include/eigen3/Eigen/src/Core/Select.h b/include/eigen3/Eigen/src/Core/Select.h
new file mode 100644
index 0000000..87993bb
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Select.h
@@ -0,0 +1,162 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELECT_H
+#define EIGEN_SELECT_H
+
+namespace Eigen { 
+
+/** \class Select
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a coefficient wise version of the C++ ternary operator ?:
+  *
+  * \param ConditionMatrixType the type of the \em condition expression which must be a boolean matrix
+  * \param ThenMatrixType the type of the \em then expression
+  * \param ElseMatrixType the type of the \em else expression
+  *
+  * This class represents an expression of a coefficient wise version of the C++ ternary operator ?:.
+  * It is the return type of DenseBase::select() and most of the time this is the only way it is used.
+  *
+  * \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const
+  */
+
+namespace internal {
+template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
+struct traits<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
+ : traits<ThenMatrixType>
+{
+  typedef typename traits<ThenMatrixType>::Scalar Scalar;
+  typedef Dense StorageKind;
+  typedef typename traits<ThenMatrixType>::XprKind XprKind;
+  typedef typename ConditionMatrixType::Nested ConditionMatrixNested;
+  typedef typename ThenMatrixType::Nested ThenMatrixNested;
+  typedef typename ElseMatrixType::Nested ElseMatrixNested;
+  enum {
+    RowsAtCompileTime = ConditionMatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = ConditionMatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = ConditionMatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = ConditionMatrixType::MaxColsAtCompileTime,
+    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & HereditaryBits,
+    CoeffReadCost = traits<typename remove_all<ConditionMatrixNested>::type>::CoeffReadCost
+                  + EIGEN_SIZE_MAX(traits<typename remove_all<ThenMatrixNested>::type>::CoeffReadCost,
+                                   traits<typename remove_all<ElseMatrixNested>::type>::CoeffReadCost)
+  };
+};
+}
+
+template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
+class Select : internal::no_assignment_operator,
+  public internal::dense_xpr_base< Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<Select>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Select)
+
+    Select(const ConditionMatrixType& a_conditionMatrix,
+           const ThenMatrixType& a_thenMatrix,
+           const ElseMatrixType& a_elseMatrix)
+      : m_condition(a_conditionMatrix), m_then(a_thenMatrix), m_else(a_elseMatrix)
+    {
+      eigen_assert(m_condition.rows() == m_then.rows() && m_condition.rows() == m_else.rows());
+      eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
+    }
+
+    Index rows() const { return m_condition.rows(); }
+    Index cols() const { return m_condition.cols(); }
+
+    const Scalar coeff(Index i, Index j) const
+    {
+      if (m_condition.coeff(i,j))
+        return m_then.coeff(i,j);
+      else
+        return m_else.coeff(i,j);
+    }
+
+    const Scalar coeff(Index i) const
+    {
+      if (m_condition.coeff(i))
+        return m_then.coeff(i);
+      else
+        return m_else.coeff(i);
+    }
+
+    const ConditionMatrixType& conditionMatrix() const
+    {
+      return m_condition;
+    }
+
+    const ThenMatrixType& thenMatrix() const
+    {
+      return m_then;
+    }
+
+    const ElseMatrixType& elseMatrix() const
+    {
+      return m_else;
+    }
+
+  protected:
+    typename ConditionMatrixType::Nested m_condition;
+    typename ThenMatrixType::Nested m_then;
+    typename ElseMatrixType::Nested m_else;
+};
+
+
+/** \returns a matrix where each coefficient (i,j) is equal to \a thenMatrix(i,j)
+  * if \c *this(i,j), and \a elseMatrix(i,j) otherwise.
+  *
+  * Example: \include MatrixBase_select.cpp
+  * Output: \verbinclude MatrixBase_select.out
+  *
+  * \sa class Select
+  */
+template<typename Derived>
+template<typename ThenDerived,typename ElseDerived>
+inline const Select<Derived,ThenDerived,ElseDerived>
+DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
+                            const DenseBase<ElseDerived>& elseMatrix) const
+{
+  return Select<Derived,ThenDerived,ElseDerived>(derived(), thenMatrix.derived(), elseMatrix.derived());
+}
+
+/** Version of DenseBase::select(const DenseBase&, const DenseBase&) with
+  * the \em else expression being a scalar value.
+  *
+  * \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const, class Select
+  */
+template<typename Derived>
+template<typename ThenDerived>
+inline const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
+DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
+                           const typename ThenDerived::Scalar& elseScalar) const
+{
+  return Select<Derived,ThenDerived,typename ThenDerived::ConstantReturnType>(
+    derived(), thenMatrix.derived(), ThenDerived::Constant(rows(),cols(),elseScalar));
+}
+
+/** Version of DenseBase::select(const DenseBase&, const DenseBase&) with
+  * the \em then expression being a scalar value.
+  *
+  * \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const, class Select
+  */
+template<typename Derived>
+template<typename ElseDerived>
+inline const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
+DenseBase<Derived>::select(const typename ElseDerived::Scalar& thenScalar,
+                           const DenseBase<ElseDerived>& elseMatrix) const
+{
+  return Select<Derived,typename ElseDerived::ConstantReturnType,ElseDerived>(
+    derived(), ElseDerived::Constant(rows(),cols(),thenScalar), elseMatrix.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELECT_H
diff --git a/include/eigen3/Eigen/src/Core/SelfAdjointView.h b/include/eigen3/Eigen/src/Core/SelfAdjointView.h
new file mode 100644
index 0000000..6fa7cd1
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/SelfAdjointView.h
@@ -0,0 +1,314 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINTMATRIX_H
+#define EIGEN_SELFADJOINTMATRIX_H
+
+namespace Eigen { 
+
+/** \class SelfAdjointView
+  * \ingroup Core_Module
+  *
+  *
+  * \brief Expression of a selfadjoint matrix from a triangular part of a dense matrix
+  *
+  * \param MatrixType the type of the dense matrix storing the coefficients
+  * \param TriangularPart can be either \c #Lower or \c #Upper
+  *
+  * This class is an expression of a sefladjoint matrix from a triangular part of a matrix
+  * with given dense storage of the coefficients. It is the return type of MatrixBase::selfadjointView()
+  * and most of the time this is the only way that it is used.
+  *
+  * \sa class TriangularBase, MatrixBase::selfadjointView()
+  */
+
+namespace internal {
+template<typename MatrixType, unsigned int UpLo>
+struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
+{
+  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
+  typedef MatrixType ExpressionType;
+  typedef typename MatrixType::PlainObject DenseMatrixType;
+  enum {
+    Mode = UpLo | SelfAdjoint,
+    Flags =  MatrixTypeNestedCleaned::Flags & (HereditaryBits)
+           & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)), // FIXME these flags should be preserved
+    CoeffReadCost = MatrixTypeNestedCleaned::CoeffReadCost
+  };
+};
+}
+
+template <typename Lhs, int LhsMode, bool LhsIsVector,
+          typename Rhs, int RhsMode, bool RhsIsVector>
+struct SelfadjointProductMatrix;
+
+// FIXME could also be called SelfAdjointWrapper to be consistent with DiagonalWrapper ??
+template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
+  : public TriangularBase<SelfAdjointView<MatrixType, UpLo> >
+{
+  public:
+
+    typedef TriangularBase<SelfAdjointView> Base;
+    typedef typename internal::traits<SelfAdjointView>::MatrixTypeNested MatrixTypeNested;
+    typedef typename internal::traits<SelfAdjointView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
+
+    /** \brief The type of coefficients in this matrix */
+    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
+
+    typedef typename MatrixType::Index Index;
+
+    enum {
+      Mode = internal::traits<SelfAdjointView>::Mode
+    };
+    typedef typename MatrixType::PlainObject PlainObject;
+
+    inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
+    {}
+
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }
+    inline Index outerStride() const { return m_matrix.outerStride(); }
+    inline Index innerStride() const { return m_matrix.innerStride(); }
+
+    /** \sa MatrixBase::coeff()
+      * \warning the coordinates must fit into the referenced triangular part
+      */
+    inline Scalar coeff(Index row, Index col) const
+    {
+      Base::check_coordinates_internal(row, col);
+      return m_matrix.coeff(row, col);
+    }
+
+    /** \sa MatrixBase::coeffRef()
+      * \warning the coordinates must fit into the referenced triangular part
+      */
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      Base::check_coordinates_internal(row, col);
+      return m_matrix.const_cast_derived().coeffRef(row, col);
+    }
+
+    /** \internal */
+    const MatrixTypeNestedCleaned& _expression() const { return m_matrix; }
+
+    const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
+    MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
+
+    /** Efficient self-adjoint matrix times vector/matrix product */
+    template<typename OtherDerived>
+    SelfadjointProductMatrix<MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
+    operator*(const MatrixBase<OtherDerived>& rhs) const
+    {
+      return SelfadjointProductMatrix
+              <MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
+              (m_matrix, rhs.derived());
+    }
+
+    /** Efficient vector/matrix times self-adjoint matrix product */
+    template<typename OtherDerived> friend
+    SelfadjointProductMatrix<OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
+    operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView& rhs)
+    {
+      return SelfadjointProductMatrix
+              <OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
+              (lhs.derived(),rhs.m_matrix);
+    }
+
+    /** Perform a symmetric rank 2 update of the selfadjoint matrix \c *this:
+      * \f$ this = this + \alpha u v^* + conj(\alpha) v u^* \f$
+      * \returns a reference to \c *this
+      *
+      * The vectors \a u and \c v \b must be column vectors, however they can be
+      * a adjoint expression without any overhead. Only the meaningful triangular
+      * part of the matrix is updated, the rest is left unchanged.
+      *
+      * \sa rankUpdate(const MatrixBase<DerivedU>&, Scalar)
+      */
+    template<typename DerivedU, typename DerivedV>
+    SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha = Scalar(1));
+
+    /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
+      * \f$ this = this + \alpha ( u u^* ) \f$ where \a u is a vector or matrix.
+      *
+      * \returns a reference to \c *this
+      *
+      * Note that to perform \f$ this = this + \alpha ( u^* u ) \f$ you can simply
+      * call this function with u.adjoint().
+      *
+      * \sa rankUpdate(const MatrixBase<DerivedU>&, const MatrixBase<DerivedV>&, Scalar)
+      */
+    template<typename DerivedU>
+    SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
+
+/////////// Cholesky module ///////////
+
+    const LLT<PlainObject, UpLo> llt() const;
+    const LDLT<PlainObject, UpLo> ldlt() const;
+
+/////////// Eigenvalue module ///////////
+
+    /** Real part of #Scalar */
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    /** Return type of eigenvalues() */
+    typedef Matrix<RealScalar, internal::traits<MatrixType>::ColsAtCompileTime, 1> EigenvaluesReturnType;
+
+    EigenvaluesReturnType eigenvalues() const;
+    RealScalar operatorNorm() const;
+    
+    #ifdef EIGEN2_SUPPORT
+    template<typename OtherDerived>
+    SelfAdjointView& operator=(const MatrixBase<OtherDerived>& other)
+    {
+      enum {
+        OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
+      };
+      m_matrix.const_cast_derived().template triangularView<UpLo>() = other;
+      m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.adjoint();
+      return *this;
+    }
+    template<typename OtherMatrixType, unsigned int OtherMode>
+    SelfAdjointView& operator=(const TriangularView<OtherMatrixType, OtherMode>& other)
+    {
+      enum {
+        OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
+      };
+      m_matrix.const_cast_derived().template triangularView<UpLo>() = other.toDenseMatrix();
+      m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.toDenseMatrix().adjoint();
+      return *this;
+    }
+    #endif
+
+  protected:
+    MatrixTypeNested m_matrix;
+};
+
+
+// template<typename OtherDerived, typename MatrixType, unsigned int UpLo>
+// internal::selfadjoint_matrix_product_returntype<OtherDerived,SelfAdjointView<MatrixType,UpLo> >
+// operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView<MatrixType,UpLo>& rhs)
+// {
+//   return internal::matrix_selfadjoint_product_returntype<OtherDerived,SelfAdjointView<MatrixType,UpLo> >(lhs.derived(),rhs);
+// }
+
+// selfadjoint to dense matrix
+
+namespace internal {
+
+template<typename Derived1, typename Derived2, int UnrollCount, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount, ClearOpposite>
+{
+  enum {
+    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
+    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
+  };
+
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount-1, ClearOpposite>::run(dst, src);
+
+    if(row == col)
+      dst.coeffRef(row, col) = numext::real(src.coeff(row, col));
+    else if(row < col)
+      dst.coeffRef(col, row) = numext::conj(dst.coeffRef(row, col) = src.coeff(row, col));
+  }
+};
+
+template<typename Derived1, typename Derived2, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, 0, ClearOpposite>
+{
+  static inline void run(Derived1 &, const Derived2 &) {}
+};
+
+template<typename Derived1, typename Derived2, int UnrollCount, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount, ClearOpposite>
+{
+  enum {
+    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
+    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
+  };
+
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount-1, ClearOpposite>::run(dst, src);
+
+    if(row == col)
+      dst.coeffRef(row, col) = numext::real(src.coeff(row, col));
+    else if(row > col)
+      dst.coeffRef(col, row) = numext::conj(dst.coeffRef(row, col) = src.coeff(row, col));
+  }
+};
+
+template<typename Derived1, typename Derived2, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, 0, ClearOpposite>
+{
+  static inline void run(Derived1 &, const Derived2 &) {}
+};
+
+template<typename Derived1, typename Derived2, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, Dynamic, ClearOpposite>
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    for(Index j = 0; j < dst.cols(); ++j)
+    {
+      for(Index i = 0; i < j; ++i)
+      {
+        dst.copyCoeff(i, j, src);
+        dst.coeffRef(j,i) = numext::conj(dst.coeff(i,j));
+      }
+      dst.copyCoeff(j, j, src);
+    }
+  }
+};
+
+template<typename Derived1, typename Derived2, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, Dynamic, ClearOpposite>
+{
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+  typedef typename Derived1::Index Index;
+    for(Index i = 0; i < dst.rows(); ++i)
+    {
+      for(Index j = 0; j < i; ++j)
+      {
+        dst.copyCoeff(i, j, src);
+        dst.coeffRef(j,i) = numext::conj(dst.coeff(i,j));
+      }
+      dst.copyCoeff(i, i, src);
+    }
+  }
+};
+
+} // end namespace internal
+
+/***************************************************************************
+* Implementation of MatrixBase methods
+***************************************************************************/
+
+template<typename Derived>
+template<unsigned int UpLo>
+typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
+MatrixBase<Derived>::selfadjointView() const
+{
+  return derived();
+}
+
+template<typename Derived>
+template<unsigned int UpLo>
+typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
+MatrixBase<Derived>::selfadjointView()
+{
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINTMATRIX_H
diff --git a/include/eigen3/Eigen/src/Core/SelfCwiseBinaryOp.h b/include/eigen3/Eigen/src/Core/SelfCwiseBinaryOp.h
new file mode 100644
index 0000000..22f3047
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -0,0 +1,197 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFCWISEBINARYOP_H
+#define EIGEN_SELFCWISEBINARYOP_H
+
+namespace Eigen { 
+
+/** \class SelfCwiseBinaryOp
+  * \ingroup Core_Module
+  *
+  * \internal
+  *
+  * \brief Internal helper class for optimizing operators like +=, -=
+  *
+  * This is a pseudo expression class re-implementing the copyCoeff/copyPacket
+  * method to directly performs a +=/-= operations in an optimal way. In particular,
+  * this allows to make sure that the input/output data are loaded only once using
+  * aligned packet loads.
+  *
+  * \sa class SwapWrapper for a similar trick.
+  */
+
+namespace internal {
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct traits<SelfCwiseBinaryOp<BinaryOp,Lhs,Rhs> >
+  : traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >
+{
+  enum {
+    // Note that it is still a good idea to preserve the DirectAccessBit
+    // so that assign can correctly align the data.
+    Flags = traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >::Flags | (Lhs::Flags&DirectAccessBit) | (Lhs::Flags&LvalueBit),
+    OuterStrideAtCompileTime = Lhs::OuterStrideAtCompileTime,
+    InnerStrideAtCompileTime = Lhs::InnerStrideAtCompileTime
+  };
+};
+}
+
+template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
+  : public internal::dense_xpr_base< SelfCwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<SelfCwiseBinaryOp>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(SelfCwiseBinaryOp)
+
+    typedef typename internal::packet_traits<Scalar>::type Packet;
+
+    inline SelfCwiseBinaryOp(Lhs& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {}
+
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }
+    inline Index outerStride() const { return m_matrix.outerStride(); }
+    inline Index innerStride() const { return m_matrix.innerStride(); }
+    inline const Scalar* data() const { return m_matrix.data(); }
+
+    // note that this function is needed by assign to correctly align loads/stores
+    // TODO make Assign use .data()
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(Lhs)
+      return m_matrix.const_cast_derived().coeffRef(row, col);
+    }
+    inline const Scalar& coeffRef(Index row, Index col) const
+    {
+      return m_matrix.coeffRef(row, col);
+    }
+
+    // note that this function is needed by assign to correctly align loads/stores
+    // TODO make Assign use .data()
+    inline Scalar& coeffRef(Index index)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(Lhs)
+      return m_matrix.const_cast_derived().coeffRef(index);
+    }
+    inline const Scalar& coeffRef(Index index) const
+    {
+      return m_matrix.const_cast_derived().coeffRef(index);
+    }
+
+    template<typename OtherDerived>
+    void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(row >= 0 && row < rows()
+                         && col >= 0 && col < cols());
+      Scalar& tmp = m_matrix.coeffRef(row,col);
+      tmp = m_functor(tmp, _other.coeff(row,col));
+    }
+
+    template<typename OtherDerived>
+    void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(index >= 0 && index < m_matrix.size());
+      Scalar& tmp = m_matrix.coeffRef(index);
+      tmp = m_functor(tmp, _other.coeff(index));
+    }
+
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    void copyPacket(Index row, Index col, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(row >= 0 && row < rows()
+                        && col >= 0 && col < cols());
+      m_matrix.template writePacket<StoreMode>(row, col,
+        m_functor.packetOp(m_matrix.template packet<StoreMode>(row, col),_other.template packet<LoadMode>(row, col)) );
+    }
+
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    void copyPacket(Index index, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(index >= 0 && index < m_matrix.size());
+      m_matrix.template writePacket<StoreMode>(index,
+        m_functor.packetOp(m_matrix.template packet<StoreMode>(index),_other.template packet<LoadMode>(index)) );
+    }
+
+    // reimplement lazyAssign to handle complex *= real
+    // see CwiseBinaryOp ctor for details
+    template<typename RhsDerived>
+    EIGEN_STRONG_INLINE SelfCwiseBinaryOp& lazyAssign(const DenseBase<RhsDerived>& rhs)
+    {
+      EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs,RhsDerived)
+      EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename RhsDerived::Scalar);
+      
+    #ifdef EIGEN_DEBUG_ASSIGN
+      internal::assign_traits<SelfCwiseBinaryOp, RhsDerived>::debug();
+    #endif
+      eigen_assert(rows() == rhs.rows() && cols() == rhs.cols());
+      internal::assign_impl<SelfCwiseBinaryOp, RhsDerived>::run(*this,rhs.derived());
+    #ifndef EIGEN_NO_DEBUG
+      this->checkTransposeAliasing(rhs.derived());
+    #endif
+      return *this;
+    }
+    
+    // overloaded to honor evaluation of special matrices
+    // maybe another solution would be to not use SelfCwiseBinaryOp
+    // at first...
+    SelfCwiseBinaryOp& operator=(const Rhs& _rhs)
+    {
+      typename internal::nested<Rhs>::type rhs(_rhs);
+      return Base::operator=(rhs);
+    }
+
+    Lhs& expression() const 
+    { 
+      return m_matrix;
+    }
+
+    const BinaryOp& functor() const 
+    { 
+      return m_functor;
+    }
+
+  protected:
+    Lhs& m_matrix;
+    const BinaryOp& m_functor;
+
+  private:
+    SelfCwiseBinaryOp& operator=(const SelfCwiseBinaryOp&);
+};
+
+template<typename Derived>
+inline Derived& DenseBase<Derived>::operator*=(const Scalar& other)
+{
+  typedef typename Derived::PlainObject PlainObject;
+  SelfCwiseBinaryOp<internal::scalar_product_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
+  tmp = PlainObject::Constant(rows(),cols(),other);
+  return derived();
+}
+
+template<typename Derived>
+inline Derived& DenseBase<Derived>::operator/=(const Scalar& other)
+{
+  typedef typename internal::conditional<NumTraits<Scalar>::IsInteger,
+                                        internal::scalar_quotient_op<Scalar>,
+                                        internal::scalar_product_op<Scalar> >::type BinOp;
+  typedef typename Derived::PlainObject PlainObject;
+  SelfCwiseBinaryOp<BinOp, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
+  Scalar actual_other;
+  if(NumTraits<Scalar>::IsInteger)  actual_other = other;
+  else                              actual_other = Scalar(1)/other;
+  tmp = PlainObject::Constant(rows(),cols(), actual_other);
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFCWISEBINARYOP_H
diff --git a/include/eigen3/Eigen/src/Core/SolveTriangular.h b/include/eigen3/Eigen/src/Core/SolveTriangular.h
new file mode 100644
index 0000000..ef17f28
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/SolveTriangular.h
@@ -0,0 +1,260 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVETRIANGULAR_H
+#define EIGEN_SOLVETRIANGULAR_H
+
+namespace Eigen { 
+
+namespace internal {
+
+// Forward declarations:
+// The following two routines are implemented in the products/TriangularSolver*.h files
+template<typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder>
+struct triangular_solve_vector;
+
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder>
+struct triangular_solve_matrix;
+
+// small helper struct extracting some traits on the underlying solver operation
+template<typename Lhs, typename Rhs, int Side>
+class trsolve_traits
+{
+  private:
+    enum {
+      RhsIsVectorAtCompileTime = (Side==OnTheLeft ? Rhs::ColsAtCompileTime : Rhs::RowsAtCompileTime)==1
+    };
+  public:
+    enum {
+      Unrolling   = (RhsIsVectorAtCompileTime && Rhs::SizeAtCompileTime != Dynamic && Rhs::SizeAtCompileTime <= 8)
+                  ? CompleteUnrolling : NoUnrolling,
+      RhsVectors  = RhsIsVectorAtCompileTime ? 1 : Dynamic
+    };
+};
+
+template<typename Lhs, typename Rhs,
+  int Side, // can be OnTheLeft/OnTheRight
+  int Mode, // can be Upper/Lower | UnitDiag
+  int Unrolling = trsolve_traits<Lhs,Rhs,Side>::Unrolling,
+  int RhsVectors = trsolve_traits<Lhs,Rhs,Side>::RhsVectors
+  >
+struct triangular_solver_selector;
+
+template<typename Lhs, typename Rhs, int Side, int Mode>
+struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,1>
+{
+  typedef typename Lhs::Scalar LhsScalar;
+  typedef typename Rhs::Scalar RhsScalar;
+  typedef blas_traits<Lhs> LhsProductTraits;
+  typedef typename LhsProductTraits::ExtractType ActualLhsType;
+  typedef Map<Matrix<RhsScalar,Dynamic,1>, Aligned> MappedRhs;
+  static void run(const Lhs& lhs, Rhs& rhs)
+  {
+    ActualLhsType actualLhs = LhsProductTraits::extract(lhs);
+
+    // FIXME find a way to allow an inner stride if packet_traits<Scalar>::size==1
+
+    bool useRhsDirectly = Rhs::InnerStrideAtCompileTime==1 || rhs.innerStride()==1;
+
+    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhs,rhs.size(),
+                                                  (useRhsDirectly ? rhs.data() : 0));
+                                                  
+    if(!useRhsDirectly)
+      MappedRhs(actualRhs,rhs.size()) = rhs;
+
+    triangular_solve_vector<LhsScalar, RhsScalar, typename Lhs::Index, Side, Mode, LhsProductTraits::NeedToConjugate,
+                            (int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor>
+      ::run(actualLhs.cols(), actualLhs.data(), actualLhs.outerStride(), actualRhs);
+
+    if(!useRhsDirectly)
+      rhs = MappedRhs(actualRhs, rhs.size());
+  }
+};
+
+// the rhs is a matrix
+template<typename Lhs, typename Rhs, int Side, int Mode>
+struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
+{
+  typedef typename Rhs::Scalar Scalar;
+  typedef typename Rhs::Index Index;
+  typedef blas_traits<Lhs> LhsProductTraits;
+  typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType;
+
+  static void run(const Lhs& lhs, Rhs& rhs)
+  {
+    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsProductTraits::extract(lhs);
+
+    const Index size = lhs.rows();
+    const Index othersize = Side==OnTheLeft? rhs.cols() : rhs.rows();
+
+    typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
+              Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType;
+
+    BlockingType blocking(rhs.rows(), rhs.cols(), size);
+
+    triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
+                               (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
+      ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride(), blocking);
+  }
+};
+
+/***************************************************************************
+* meta-unrolling implementation
+***************************************************************************/
+
+template<typename Lhs, typename Rhs, int Mode, int Index, int Size,
+         bool Stop = Index==Size>
+struct triangular_solver_unroller;
+
+template<typename Lhs, typename Rhs, int Mode, int Index, int Size>
+struct triangular_solver_unroller<Lhs,Rhs,Mode,Index,Size,false> {
+  enum {
+    IsLower = ((Mode&Lower)==Lower),
+    I = IsLower ? Index : Size - Index - 1,
+    S = IsLower ? 0     : I+1
+  };
+  static void run(const Lhs& lhs, Rhs& rhs)
+  {
+    if (Index>0)
+      rhs.coeffRef(I) -= lhs.row(I).template segment<Index>(S).transpose()
+                         .cwiseProduct(rhs.template segment<Index>(S)).sum();
+
+    if(!(Mode & UnitDiag))
+      rhs.coeffRef(I) /= lhs.coeff(I,I);
+
+    triangular_solver_unroller<Lhs,Rhs,Mode,Index+1,Size>::run(lhs,rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int Mode, int Index, int Size>
+struct triangular_solver_unroller<Lhs,Rhs,Mode,Index,Size,true> {
+  static void run(const Lhs&, Rhs&) {}
+};
+
+template<typename Lhs, typename Rhs, int Mode>
+struct triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,CompleteUnrolling,1> {
+  static void run(const Lhs& lhs, Rhs& rhs)
+  { triangular_solver_unroller<Lhs,Rhs,Mode,0,Rhs::SizeAtCompileTime>::run(lhs,rhs); }
+};
+
+template<typename Lhs, typename Rhs, int Mode>
+struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
+  static void run(const Lhs& lhs, Rhs& rhs)
+  {
+    Transpose<const Lhs> trLhs(lhs);
+    Transpose<Rhs> trRhs(rhs);
+    
+    triangular_solver_unroller<Transpose<const Lhs>,Transpose<Rhs>,
+                              ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
+                              0,Rhs::SizeAtCompileTime>::run(trLhs,trRhs);
+  }
+};
+
+} // end namespace internal
+
+/***************************************************************************
+* TriangularView methods
+***************************************************************************/
+
+/** "in-place" version of TriangularView::solve() where the result is written in \a other
+  *
+  * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+  * This function will const_cast it, so constness isn't honored here.
+  *
+  * See TriangularView:solve() for the details.
+  */
+template<typename MatrixType, unsigned int Mode>
+template<int Side, typename OtherDerived>
+void TriangularView<MatrixType,Mode>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
+{
+  OtherDerived& other = _other.const_cast_derived();
+  eigen_assert( cols() == rows() && ((Side==OnTheLeft && cols() == other.rows()) || (Side==OnTheRight && cols() == other.cols())) );
+  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
+
+  enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit  && OtherDerived::IsVectorAtCompileTime };
+  typedef typename internal::conditional<copy,
+    typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
+  OtherCopy otherCopy(other);
+
+  internal::triangular_solver_selector<MatrixType, typename internal::remove_reference<OtherCopy>::type,
+    Side, Mode>::run(nestedExpression(), otherCopy);
+
+  if (copy)
+    other = otherCopy;
+}
+
+/** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
+  *
+  * This function computes the inverse-matrix matrix product inverse(\c *this) * \a other if
+  * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
+  * \a Side==OnTheRight.
+  *
+  * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
+  * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
+  * is an upper (resp. lower) triangular matrix.
+  *
+  * Example: \include MatrixBase_marked.cpp
+  * Output: \verbinclude MatrixBase_marked.out
+  *
+  * This function returns an expression of the inverse-multiply and can works in-place if it is assigned
+  * to the same matrix or vector \a other.
+  *
+  * For users coming from BLAS, this function (and more specifically solveInPlace()) offer
+  * all the operations supported by the \c *TRSV and \c *TRSM BLAS routines.
+  *
+  * \sa TriangularView::solveInPlace()
+  */
+template<typename Derived, unsigned int Mode>
+template<int Side, typename Other>
+const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
+TriangularView<Derived,Mode>::solve(const MatrixBase<Other>& other) const
+{
+  return internal::triangular_solve_retval<Side,TriangularView,Other>(*this, other.derived());
+}
+
+namespace internal {
+
+
+template<int Side, typename TriangularType, typename Rhs>
+struct traits<triangular_solve_retval<Side, TriangularType, Rhs> >
+{
+  typedef typename internal::plain_matrix_type_column_major<Rhs>::type ReturnType;
+};
+
+template<int Side, typename TriangularType, typename Rhs> struct triangular_solve_retval
+ : public ReturnByValue<triangular_solve_retval<Side, TriangularType, Rhs> >
+{
+  typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
+  typedef ReturnByValue<triangular_solve_retval> Base;
+  typedef typename Base::Index Index;
+
+  triangular_solve_retval(const TriangularType& tri, const Rhs& rhs)
+    : m_triangularMatrix(tri), m_rhs(rhs)
+  {}
+
+  inline Index rows() const { return m_rhs.rows(); }
+  inline Index cols() const { return m_rhs.cols(); }
+
+  template<typename Dest> inline void evalTo(Dest& dst) const
+  {
+    if(!(is_same<RhsNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_rhs)))
+      dst = m_rhs;
+    m_triangularMatrix.template solveInPlace<Side>(dst);
+  }
+
+  protected:
+    const TriangularType& m_triangularMatrix;
+    typename Rhs::Nested m_rhs;
+};
+
+} // namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SOLVETRIANGULAR_H
diff --git a/include/eigen3/Eigen/src/Core/StableNorm.h b/include/eigen3/Eigen/src/Core/StableNorm.h
new file mode 100644
index 0000000..389d942
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/StableNorm.h
@@ -0,0 +1,203 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STABLENORM_H
+#define EIGEN_STABLENORM_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename ExpressionType, typename Scalar>
+inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& scale, Scalar& invScale)
+{
+  using std::max;
+  Scalar maxCoeff = bl.cwiseAbs().maxCoeff();
+  
+  if (maxCoeff>scale)
+  {
+    ssq = ssq * numext::abs2(scale/maxCoeff);
+    Scalar tmp = Scalar(1)/maxCoeff;
+    if(tmp > NumTraits<Scalar>::highest())
+    {
+      invScale = NumTraits<Scalar>::highest();
+      scale = Scalar(1)/invScale;
+    }
+    else
+    {
+      scale = maxCoeff;
+      invScale = tmp;
+    }
+  }
+  
+  // TODO if the maxCoeff is much much smaller than the current scale,
+  // then we can neglect this sub vector
+  if(scale>Scalar(0)) // if scale==0, then bl is 0 
+    ssq += (bl*invScale).squaredNorm();
+}
+
+template<typename Derived>
+inline typename NumTraits<typename traits<Derived>::Scalar>::Real
+blueNorm_impl(const EigenBase<Derived>& _vec)
+{
+  typedef typename Derived::RealScalar RealScalar;  
+  typedef typename Derived::Index Index;
+  using std::pow;
+  using std::min;
+  using std::max;
+  using std::sqrt;
+  using std::abs;
+  const Derived& vec(_vec.derived());
+  static bool initialized = false;
+  static RealScalar b1, b2, s1m, s2m, overfl, rbig, relerr;
+  if(!initialized)
+  {
+    int ibeta, it, iemin, iemax, iexp;
+    RealScalar eps;
+    // This program calculates the machine-dependent constants
+    // bl, b2, slm, s2m, relerr overfl
+    // from the "basic" machine-dependent numbers
+    // nbig, ibeta, it, iemin, iemax, rbig.
+    // The following define the basic machine-dependent constants.
+    // For portability, the PORT subprograms "ilmaeh" and "rlmach"
+    // are used. For any specific computer, each of the assignment
+    // statements can be replaced
+    ibeta = std::numeric_limits<RealScalar>::radix;                 // base for floating-point numbers
+    it    = std::numeric_limits<RealScalar>::digits;                // number of base-beta digits in mantissa
+    iemin = std::numeric_limits<RealScalar>::min_exponent;          // minimum exponent
+    iemax = std::numeric_limits<RealScalar>::max_exponent;          // maximum exponent
+    rbig  = (std::numeric_limits<RealScalar>::max)();               // largest floating-point number
+
+    iexp  = -((1-iemin)/2);
+    b1    = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // lower boundary of midrange
+    iexp  = (iemax + 1 - it)/2;
+    b2    = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // upper boundary of midrange
+
+    iexp  = (2-iemin)/2;
+    s1m   = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // scaling factor for lower range
+    iexp  = - ((iemax+it)/2);
+    s2m   = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // scaling factor for upper range
+
+    overfl  = rbig*s2m;                                             // overflow boundary for abig
+    eps     = RealScalar(pow(double(ibeta), 1-it));
+    relerr  = sqrt(eps);                                            // tolerance for neglecting asml
+    initialized = true;
+  }
+  Index n = vec.size();
+  RealScalar ab2 = b2 / RealScalar(n);
+  RealScalar asml = RealScalar(0);
+  RealScalar amed = RealScalar(0);
+  RealScalar abig = RealScalar(0);
+  for(typename Derived::InnerIterator it(vec, 0); it; ++it)
+  {
+    RealScalar ax = abs(it.value());
+    if(ax > ab2)     abig += numext::abs2(ax*s2m);
+    else if(ax < b1) asml += numext::abs2(ax*s1m);
+    else             amed += numext::abs2(ax);
+  }
+  if(abig > RealScalar(0))
+  {
+    abig = sqrt(abig);
+    if(abig > overfl)
+    {
+      return rbig;
+    }
+    if(amed > RealScalar(0))
+    {
+      abig = abig/s2m;
+      amed = sqrt(amed);
+    }
+    else
+      return abig/s2m;
+  }
+  else if(asml > RealScalar(0))
+  {
+    if (amed > RealScalar(0))
+    {
+      abig = sqrt(amed);
+      amed = sqrt(asml) / s1m;
+    }
+    else
+      return sqrt(asml)/s1m;
+  }
+  else
+    return sqrt(amed);
+  asml = (min)(abig, amed);
+  abig = (max)(abig, amed);
+  if(asml <= abig*relerr)
+    return abig;
+  else
+    return abig * sqrt(RealScalar(1) + numext::abs2(asml/abig));
+}
+
+} // end namespace internal
+
+/** \returns the \em l2 norm of \c *this avoiding underflow and overflow.
+  * This version use a blockwise two passes algorithm:
+  *  1 - find the absolute largest coefficient \c s
+  *  2 - compute \f$ s \Vert \frac{*this}{s} \Vert \f$ in a standard way
+  *
+  * For architecture/scalar types supporting vectorization, this version
+  * is faster than blueNorm(). Otherwise the blueNorm() is much faster.
+  *
+  * \sa norm(), blueNorm(), hypotNorm()
+  */
+template<typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+MatrixBase<Derived>::stableNorm() const
+{
+  using std::min;
+  using std::sqrt;
+  const Index blockSize = 4096;
+  RealScalar scale(0);
+  RealScalar invScale(1);
+  RealScalar ssq(0); // sum of square
+  enum {
+    Alignment = (int(Flags)&DirectAccessBit) || (int(Flags)&AlignedBit) ? 1 : 0
+  };
+  Index n = size();
+  Index bi = internal::first_aligned(derived());
+  if (bi>0)
+    internal::stable_norm_kernel(this->head(bi), ssq, scale, invScale);
+  for (; bi<n; bi+=blockSize)
+    internal::stable_norm_kernel(this->segment(bi,(min)(blockSize, n - bi)).template forceAlignedAccessIf<Alignment>(), ssq, scale, invScale);
+  return scale * sqrt(ssq);
+}
+
+/** \returns the \em l2 norm of \c *this using the Blue's algorithm.
+  * A Portable Fortran Program to Find the Euclidean Norm of a Vector,
+  * ACM TOMS, Vol 4, Issue 1, 1978.
+  *
+  * For architecture/scalar types without vectorization, this version
+  * is much faster than stableNorm(). Otherwise the stableNorm() is faster.
+  *
+  * \sa norm(), stableNorm(), hypotNorm()
+  */
+template<typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+MatrixBase<Derived>::blueNorm() const
+{
+  return internal::blueNorm_impl(*this);
+}
+
+/** \returns the \em l2 norm of \c *this avoiding undeflow and overflow.
+  * This version use a concatenation of hypot() calls, and it is very slow.
+  *
+  * \sa norm(), stableNorm()
+  */
+template<typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+MatrixBase<Derived>::hypotNorm() const
+{
+  return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_STABLENORM_H
diff --git a/include/eigen3/Eigen/src/Core/Stride.h b/include/eigen3/Eigen/src/Core/Stride.h
new file mode 100644
index 0000000..1e3f5fe
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Stride.h
@@ -0,0 +1,108 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STRIDE_H
+#define EIGEN_STRIDE_H
+
+namespace Eigen { 
+
+/** \class Stride
+  * \ingroup Core_Module
+  *
+  * \brief Holds strides information for Map
+  *
+  * This class holds the strides information for mapping arrays with strides with class Map.
+  *
+  * It holds two values: the inner stride and the outer stride.
+  *
+  * The inner stride is the pointer increment between two consecutive entries within a given row of a
+  * row-major matrix or within a given column of a column-major matrix.
+  *
+  * The outer stride is the pointer increment between two consecutive rows of a row-major matrix or
+  * between two consecutive columns of a column-major matrix.
+  *
+  * These two values can be passed either at compile-time as template parameters, or at runtime as
+  * arguments to the constructor.
+  *
+  * Indeed, this class takes two template parameters:
+  *  \param _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime.
+  *  \param _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime.
+  *
+  * Here is an example:
+  * \include Map_general_stride.cpp
+  * Output: \verbinclude Map_general_stride.out
+  *
+  * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders
+  */
+template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
+class Stride
+{
+  public:
+    typedef DenseIndex Index;
+    enum {
+      InnerStrideAtCompileTime = _InnerStrideAtCompileTime,
+      OuterStrideAtCompileTime = _OuterStrideAtCompileTime
+    };
+
+    /** Default constructor, for use when strides are fixed at compile time */
+    Stride()
+      : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime)
+    {
+      eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic);
+    }
+
+    /** Constructor allowing to pass the strides at runtime */
+    Stride(Index outerStride, Index innerStride)
+      : m_outer(outerStride), m_inner(innerStride)
+    {
+      eigen_assert(innerStride>=0 && outerStride>=0);
+    }
+
+    /** Copy constructor */
+    Stride(const Stride& other)
+      : m_outer(other.outer()), m_inner(other.inner())
+    {}
+
+    /** \returns the outer stride */
+    inline Index outer() const { return m_outer.value(); }
+    /** \returns the inner stride */
+    inline Index inner() const { return m_inner.value(); }
+
+  protected:
+    internal::variable_if_dynamic<Index, OuterStrideAtCompileTime> m_outer;
+    internal::variable_if_dynamic<Index, InnerStrideAtCompileTime> m_inner;
+};
+
+/** \brief Convenience specialization of Stride to specify only an inner stride
+  * See class Map for some examples */
+template<int Value = Dynamic>
+class InnerStride : public Stride<0, Value>
+{
+    typedef Stride<0, Value> Base;
+  public:
+    typedef DenseIndex Index;
+    InnerStride() : Base() {}
+    InnerStride(Index v) : Base(0, v) {}
+};
+
+/** \brief Convenience specialization of Stride to specify only an outer stride
+  * See class Map for some examples */
+template<int Value = Dynamic>
+class OuterStride : public Stride<Value, 0>
+{
+    typedef Stride<Value, 0> Base;
+  public:
+    typedef DenseIndex Index;
+    OuterStride() : Base() {}
+    OuterStride(Index v) : Base(v,0) {}
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_STRIDE_H
diff --git a/include/eigen3/Eigen/src/Core/Swap.h b/include/eigen3/Eigen/src/Core/Swap.h
new file mode 100644
index 0000000..bf58bd5
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Swap.h
@@ -0,0 +1,126 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SWAP_H
+#define EIGEN_SWAP_H
+
+namespace Eigen { 
+
+/** \class SwapWrapper
+  * \ingroup Core_Module
+  *
+  * \internal
+  *
+  * \brief Internal helper class for swapping two expressions
+  */
+namespace internal {
+template<typename ExpressionType>
+struct traits<SwapWrapper<ExpressionType> > : traits<ExpressionType> {};
+}
+
+template<typename ExpressionType> class SwapWrapper
+  : public internal::dense_xpr_base<SwapWrapper<ExpressionType> >::type
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<SwapWrapper>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(SwapWrapper)
+    typedef typename internal::packet_traits<Scalar>::type Packet;
+
+    inline SwapWrapper(ExpressionType& xpr) : m_expression(xpr) {}
+
+    inline Index rows() const { return m_expression.rows(); }
+    inline Index cols() const { return m_expression.cols(); }
+    inline Index outerStride() const { return m_expression.outerStride(); }
+    inline Index innerStride() const { return m_expression.innerStride(); }
+    
+    typedef typename internal::conditional<
+                       internal::is_lvalue<ExpressionType>::value,
+                       Scalar,
+                       const Scalar
+                     >::type ScalarWithConstIfNotLvalue;
+                     
+    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
+    inline const Scalar* data() const { return m_expression.data(); }
+
+    inline Scalar& coeffRef(Index rowId, Index colId)
+    {
+      return m_expression.const_cast_derived().coeffRef(rowId, colId);
+    }
+
+    inline Scalar& coeffRef(Index index)
+    {
+      return m_expression.const_cast_derived().coeffRef(index);
+    }
+
+    inline Scalar& coeffRef(Index rowId, Index colId) const
+    {
+      return m_expression.coeffRef(rowId, colId);
+    }
+
+    inline Scalar& coeffRef(Index index) const
+    {
+      return m_expression.coeffRef(index);
+    }
+
+    template<typename OtherDerived>
+    void copyCoeff(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(rowId >= 0 && rowId < rows()
+                         && colId >= 0 && colId < cols());
+      Scalar tmp = m_expression.coeff(rowId, colId);
+      m_expression.coeffRef(rowId, colId) = _other.coeff(rowId, colId);
+      _other.coeffRef(rowId, colId) = tmp;
+    }
+
+    template<typename OtherDerived>
+    void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(index >= 0 && index < m_expression.size());
+      Scalar tmp = m_expression.coeff(index);
+      m_expression.coeffRef(index) = _other.coeff(index);
+      _other.coeffRef(index) = tmp;
+    }
+
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    void copyPacket(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(rowId >= 0 && rowId < rows()
+                        && colId >= 0 && colId < cols());
+      Packet tmp = m_expression.template packet<StoreMode>(rowId, colId);
+      m_expression.template writePacket<StoreMode>(rowId, colId,
+        _other.template packet<LoadMode>(rowId, colId)
+      );
+      _other.template writePacket<LoadMode>(rowId, colId, tmp);
+    }
+
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    void copyPacket(Index index, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(index >= 0 && index < m_expression.size());
+      Packet tmp = m_expression.template packet<StoreMode>(index);
+      m_expression.template writePacket<StoreMode>(index,
+        _other.template packet<LoadMode>(index)
+      );
+      _other.template writePacket<LoadMode>(index, tmp);
+    }
+
+    ExpressionType& expression() const { return m_expression; }
+
+  protected:
+    ExpressionType& m_expression;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SWAP_H
diff --git a/include/eigen3/Eigen/src/Core/Transpose.h b/include/eigen3/Eigen/src/Core/Transpose.h
new file mode 100644
index 0000000..22096ea
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Transpose.h
@@ -0,0 +1,419 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRANSPOSE_H
+#define EIGEN_TRANSPOSE_H
+
+namespace Eigen { 
+
+/** \class Transpose
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the transpose of a matrix
+  *
+  * \param MatrixType the type of the object of which we are taking the transpose
+  *
+  * This class represents an expression of the transpose of a matrix.
+  * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::transpose(), MatrixBase::adjoint()
+  */
+
+namespace internal {
+template<typename MatrixType>
+struct traits<Transpose<MatrixType> > : traits<MatrixType>
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedPlain;
+  typedef typename traits<MatrixType>::StorageKind StorageKind;
+  typedef typename traits<MatrixType>::XprKind XprKind;
+  enum {
+    RowsAtCompileTime = MatrixType::ColsAtCompileTime,
+    ColsAtCompileTime = MatrixType::RowsAtCompileTime,
+    MaxRowsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags0 = MatrixTypeNestedPlain::Flags & ~(LvalueBit | NestByRefBit),
+    Flags1 = Flags0 | FlagsLvalueBit,
+    Flags = Flags1 ^ RowMajorBit,
+    CoeffReadCost = MatrixTypeNestedPlain::CoeffReadCost,
+    InnerStrideAtCompileTime = inner_stride_at_compile_time<MatrixType>::ret,
+    OuterStrideAtCompileTime = outer_stride_at_compile_time<MatrixType>::ret
+  };
+};
+}
+
+template<typename MatrixType, typename StorageKind> class TransposeImpl;
+
+template<typename MatrixType> class Transpose
+  : public TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>
+{
+  public:
+
+    typedef typename TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)
+
+    inline Transpose(MatrixType& a_matrix) : m_matrix(a_matrix) {}
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
+
+    inline Index rows() const { return m_matrix.cols(); }
+    inline Index cols() const { return m_matrix.rows(); }
+
+    /** \returns the nested expression */
+    const typename internal::remove_all<typename MatrixType::Nested>::type&
+    nestedExpression() const { return m_matrix; }
+
+    /** \returns the nested expression */
+    typename internal::remove_all<typename MatrixType::Nested>::type&
+    nestedExpression() { return m_matrix.const_cast_derived(); }
+
+  protected:
+    typename MatrixType::Nested m_matrix;
+};
+
+namespace internal {
+
+template<typename MatrixType, bool HasDirectAccess = has_direct_access<MatrixType>::ret>
+struct TransposeImpl_base
+{
+  typedef typename dense_xpr_base<Transpose<MatrixType> >::type type;
+};
+
+template<typename MatrixType>
+struct TransposeImpl_base<MatrixType, false>
+{
+  typedef typename dense_xpr_base<Transpose<MatrixType> >::type type;
+};
+
+} // end namespace internal
+
+template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
+  : public internal::TransposeImpl_base<MatrixType>::type
+{
+  public:
+
+    typedef typename internal::TransposeImpl_base<MatrixType>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
+
+    inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
+    inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
+
+    typedef typename internal::conditional<
+                       internal::is_lvalue<MatrixType>::value,
+                       Scalar,
+                       const Scalar
+                     >::type ScalarWithConstIfNotLvalue;
+
+    inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
+    inline const Scalar* data() const { return derived().nestedExpression().data(); }
+
+    inline ScalarWithConstIfNotLvalue& coeffRef(Index rowId, Index colId)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      return derived().nestedExpression().const_cast_derived().coeffRef(colId, rowId);
+    }
+
+    inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      return derived().nestedExpression().const_cast_derived().coeffRef(index);
+    }
+
+    inline const Scalar& coeffRef(Index rowId, Index colId) const
+    {
+      return derived().nestedExpression().coeffRef(colId, rowId);
+    }
+
+    inline const Scalar& coeffRef(Index index) const
+    {
+      return derived().nestedExpression().coeffRef(index);
+    }
+
+    inline CoeffReturnType coeff(Index rowId, Index colId) const
+    {
+      return derived().nestedExpression().coeff(colId, rowId);
+    }
+
+    inline CoeffReturnType coeff(Index index) const
+    {
+      return derived().nestedExpression().coeff(index);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index rowId, Index colId) const
+    {
+      return derived().nestedExpression().template packet<LoadMode>(colId, rowId);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index rowId, Index colId, const PacketScalar& x)
+    {
+      derived().nestedExpression().const_cast_derived().template writePacket<LoadMode>(colId, rowId, x);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index index) const
+    {
+      return derived().nestedExpression().template packet<LoadMode>(index);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index index, const PacketScalar& x)
+    {
+      derived().nestedExpression().const_cast_derived().template writePacket<LoadMode>(index, x);
+    }
+};
+
+/** \returns an expression of the transpose of *this.
+  *
+  * Example: \include MatrixBase_transpose.cpp
+  * Output: \verbinclude MatrixBase_transpose.out
+  *
+  * \warning If you want to replace a matrix by its own transpose, do \b NOT do this:
+  * \code
+  * m = m.transpose(); // bug!!! caused by aliasing effect
+  * \endcode
+  * Instead, use the transposeInPlace() method:
+  * \code
+  * m.transposeInPlace();
+  * \endcode
+  * which gives Eigen good opportunities for optimization, or alternatively you can also do:
+  * \code
+  * m = m.transpose().eval();
+  * \endcode
+  *
+  * \sa transposeInPlace(), adjoint() */
+template<typename Derived>
+inline Transpose<Derived>
+DenseBase<Derived>::transpose()
+{
+  return derived();
+}
+
+/** This is the const version of transpose().
+  *
+  * Make sure you read the warning for transpose() !
+  *
+  * \sa transposeInPlace(), adjoint() */
+template<typename Derived>
+inline typename DenseBase<Derived>::ConstTransposeReturnType
+DenseBase<Derived>::transpose() const
+{
+  return ConstTransposeReturnType(derived());
+}
+
+/** \returns an expression of the adjoint (i.e. conjugate transpose) of *this.
+  *
+  * Example: \include MatrixBase_adjoint.cpp
+  * Output: \verbinclude MatrixBase_adjoint.out
+  *
+  * \warning If you want to replace a matrix by its own adjoint, do \b NOT do this:
+  * \code
+  * m = m.adjoint(); // bug!!! caused by aliasing effect
+  * \endcode
+  * Instead, use the adjointInPlace() method:
+  * \code
+  * m.adjointInPlace();
+  * \endcode
+  * which gives Eigen good opportunities for optimization, or alternatively you can also do:
+  * \code
+  * m = m.adjoint().eval();
+  * \endcode
+  *
+  * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
+template<typename Derived>
+inline const typename MatrixBase<Derived>::AdjointReturnType
+MatrixBase<Derived>::adjoint() const
+{
+  return this->transpose(); // in the complex case, the .conjugate() is be implicit here
+                            // due to implicit conversion to return type
+}
+
+/***************************************************************************
+* "in place" transpose implementation
+***************************************************************************/
+
+namespace internal {
+
+template<typename MatrixType,
+  bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic>
+struct inplace_transpose_selector;
+
+template<typename MatrixType>
+struct inplace_transpose_selector<MatrixType,true> { // square matrix
+  static void run(MatrixType& m) {
+    m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
+  }
+};
+
+template<typename MatrixType>
+struct inplace_transpose_selector<MatrixType,false> { // non square matrix
+  static void run(MatrixType& m) {
+    if (m.rows()==m.cols())
+      m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
+    else
+      m = m.transpose().eval();
+  }
+};
+
+} // end namespace internal
+
+/** This is the "in place" version of transpose(): it replaces \c *this by its own transpose.
+  * Thus, doing
+  * \code
+  * m.transposeInPlace();
+  * \endcode
+  * has the same effect on m as doing
+  * \code
+  * m = m.transpose().eval();
+  * \endcode
+  * and is faster and also safer because in the latter line of code, forgetting the eval() results
+  * in a bug caused by \ref TopicAliasing "aliasing".
+  *
+  * Notice however that this method is only useful if you want to replace a matrix by its own transpose.
+  * If you just need the transpose of a matrix, use transpose().
+  *
+  * \note if the matrix is not square, then \c *this must be a resizable matrix. 
+  * This excludes (non-square) fixed-size matrices, block-expressions and maps.
+  *
+  * \sa transpose(), adjoint(), adjointInPlace() */
+template<typename Derived>
+inline void DenseBase<Derived>::transposeInPlace()
+{
+  eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
+               && "transposeInPlace() called on a non-square non-resizable matrix");
+  internal::inplace_transpose_selector<Derived>::run(derived());
+}
+
+/***************************************************************************
+* "in place" adjoint implementation
+***************************************************************************/
+
+/** This is the "in place" version of adjoint(): it replaces \c *this by its own transpose.
+  * Thus, doing
+  * \code
+  * m.adjointInPlace();
+  * \endcode
+  * has the same effect on m as doing
+  * \code
+  * m = m.adjoint().eval();
+  * \endcode
+  * and is faster and also safer because in the latter line of code, forgetting the eval() results
+  * in a bug caused by aliasing.
+  *
+  * Notice however that this method is only useful if you want to replace a matrix by its own adjoint.
+  * If you just need the adjoint of a matrix, use adjoint().
+  *
+  * \note if the matrix is not square, then \c *this must be a resizable matrix.
+  * This excludes (non-square) fixed-size matrices, block-expressions and maps.
+  *
+  * \sa transpose(), adjoint(), transposeInPlace() */
+template<typename Derived>
+inline void MatrixBase<Derived>::adjointInPlace()
+{
+  derived() = adjoint().eval();
+}
+
+#ifndef EIGEN_NO_DEBUG
+
+// The following is to detect aliasing problems in most common cases.
+
+namespace internal {
+
+template<typename BinOp,typename NestedXpr,typename Rhs>
+struct blas_traits<SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> >
+ : blas_traits<NestedXpr>
+{
+  typedef SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> XprType;
+  static inline const XprType extract(const XprType& x) { return x; }
+};
+
+template<bool DestIsTransposed, typename OtherDerived>
+struct check_transpose_aliasing_compile_time_selector
+{
+  enum { ret = bool(blas_traits<OtherDerived>::IsTransposed) != DestIsTransposed };
+};
+
+template<bool DestIsTransposed, typename BinOp, typename DerivedA, typename DerivedB>
+struct check_transpose_aliasing_compile_time_selector<DestIsTransposed,CwiseBinaryOp<BinOp,DerivedA,DerivedB> >
+{
+  enum { ret =    bool(blas_traits<DerivedA>::IsTransposed) != DestIsTransposed
+               || bool(blas_traits<DerivedB>::IsTransposed) != DestIsTransposed
+  };
+};
+
+template<typename Scalar, bool DestIsTransposed, typename OtherDerived>
+struct check_transpose_aliasing_run_time_selector
+{
+  static bool run(const Scalar* dest, const OtherDerived& src)
+  {
+    return (bool(blas_traits<OtherDerived>::IsTransposed) != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src));
+  }
+};
+
+template<typename Scalar, bool DestIsTransposed, typename BinOp, typename DerivedA, typename DerivedB>
+struct check_transpose_aliasing_run_time_selector<Scalar,DestIsTransposed,CwiseBinaryOp<BinOp,DerivedA,DerivedB> >
+{
+  static bool run(const Scalar* dest, const CwiseBinaryOp<BinOp,DerivedA,DerivedB>& src)
+  {
+    return ((blas_traits<DerivedA>::IsTransposed != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src.lhs())))
+        || ((blas_traits<DerivedB>::IsTransposed != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src.rhs())));
+  }
+};
+
+// the following selector, checkTransposeAliasing_impl, based on MightHaveTransposeAliasing,
+// is because when the condition controlling the assert is known at compile time, ICC emits a warning.
+// This is actually a good warning: in expressions that don't have any transposing, the condition is
+// known at compile time to be false, and using that, we can avoid generating the code of the assert again
+// and again for all these expressions that don't need it.
+
+template<typename Derived, typename OtherDerived,
+         bool MightHaveTransposeAliasing
+                 = check_transpose_aliasing_compile_time_selector
+                     <blas_traits<Derived>::IsTransposed,OtherDerived>::ret
+        >
+struct checkTransposeAliasing_impl
+{
+    static void run(const Derived& dst, const OtherDerived& other)
+    {
+        eigen_assert((!check_transpose_aliasing_run_time_selector
+                      <typename Derived::Scalar,blas_traits<Derived>::IsTransposed,OtherDerived>
+                      ::run(extract_data(dst), other))
+          && "aliasing detected during transposition, use transposeInPlace() "
+             "or evaluate the rhs into a temporary using .eval()");
+
+    }
+};
+
+template<typename Derived, typename OtherDerived>
+struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
+{
+    static void run(const Derived&, const OtherDerived&)
+    {
+    }
+};
+
+} // end namespace internal
+
+template<typename Derived>
+template<typename OtherDerived>
+void DenseBase<Derived>::checkTransposeAliasing(const OtherDerived& other) const
+{
+    internal::checkTransposeAliasing_impl<Derived, OtherDerived>::run(derived(), other);
+}
+#endif
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRANSPOSE_H
diff --git a/include/eigen3/Eigen/src/Core/Transpositions.h b/include/eigen3/Eigen/src/Core/Transpositions.h
new file mode 100644
index 0000000..e4ba075
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Transpositions.h
@@ -0,0 +1,436 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010-2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRANSPOSITIONS_H
+#define EIGEN_TRANSPOSITIONS_H
+
+namespace Eigen { 
+
+/** \class Transpositions
+  * \ingroup Core_Module
+  *
+  * \brief Represents a sequence of transpositions (row/column interchange)
+  *
+  * \param SizeAtCompileTime the number of transpositions, or Dynamic
+  * \param MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
+  *
+  * This class represents a permutation transformation as a sequence of \em n transpositions
+  * \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices.
+  * Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges
+  * the rows \c i and \c indices[i] of the matrix \c M.
+  * A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange.
+  *
+  * Compared to the class PermutationMatrix, such a sequence of transpositions is what is
+  * computed during a decomposition with pivoting, and it is faster when applying the permutation in-place.
+  * 
+  * To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example:
+  * \code
+  * Transpositions tr;
+  * MatrixXf mat;
+  * mat = tr * mat;
+  * \endcode
+  * In this example, we detect that the matrix appears on both side, and so the transpositions
+  * are applied in-place without any temporary or extra copy.
+  *
+  * \sa class PermutationMatrix
+  */
+
+namespace internal {
+template<typename TranspositionType, typename MatrixType, int Side, bool Transposed=false> struct transposition_matrix_product_retval;
+}
+
+template<typename Derived>
+class TranspositionsBase
+{
+    typedef internal::traits<Derived> Traits;
+    
+  public:
+
+    typedef typename Traits::IndicesType IndicesType;
+    typedef typename IndicesType::Scalar Index;
+
+    Derived& derived() { return *static_cast<Derived*>(this); }
+    const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+    /** Copies the \a other transpositions into \c *this */
+    template<typename OtherDerived>
+    Derived& operator=(const TranspositionsBase<OtherDerived>& other)
+    {
+      indices() = other.indices();
+      return derived();
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    Derived& operator=(const TranspositionsBase& other)
+    {
+      indices() = other.indices();
+      return derived();
+    }
+    #endif
+
+    /** \returns the number of transpositions */
+    inline Index size() const { return indices().size(); }
+
+    /** Direct access to the underlying index vector */
+    inline const Index& coeff(Index i) const { return indices().coeff(i); }
+    /** Direct access to the underlying index vector */
+    inline Index& coeffRef(Index i) { return indices().coeffRef(i); }
+    /** Direct access to the underlying index vector */
+    inline const Index& operator()(Index i) const { return indices()(i); }
+    /** Direct access to the underlying index vector */
+    inline Index& operator()(Index i) { return indices()(i); }
+    /** Direct access to the underlying index vector */
+    inline const Index& operator[](Index i) const { return indices()(i); }
+    /** Direct access to the underlying index vector */
+    inline Index& operator[](Index i) { return indices()(i); }
+
+    /** const version of indices(). */
+    const IndicesType& indices() const { return derived().indices(); }
+    /** \returns a reference to the stored array representing the transpositions. */
+    IndicesType& indices() { return derived().indices(); }
+
+    /** Resizes to given size. */
+    inline void resize(int newSize)
+    {
+      indices().resize(newSize);
+    }
+
+    /** Sets \c *this to represents an identity transformation */
+    void setIdentity()
+    {
+      for(int i = 0; i < indices().size(); ++i)
+        coeffRef(i) = i;
+    }
+
+    // FIXME: do we want such methods ?
+    // might be usefull when the target matrix expression is complex, e.g.:
+    // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
+    /*
+    template<typename MatrixType>
+    void applyForwardToRows(MatrixType& mat) const
+    {
+      for(Index k=0 ; k<size() ; ++k)
+        if(m_indices(k)!=k)
+          mat.row(k).swap(mat.row(m_indices(k)));
+    }
+
+    template<typename MatrixType>
+    void applyBackwardToRows(MatrixType& mat) const
+    {
+      for(Index k=size()-1 ; k>=0 ; --k)
+        if(m_indices(k)!=k)
+          mat.row(k).swap(mat.row(m_indices(k)));
+    }
+    */
+
+    /** \returns the inverse transformation */
+    inline Transpose<TranspositionsBase> inverse() const
+    { return Transpose<TranspositionsBase>(derived()); }
+
+    /** \returns the tranpose transformation */
+    inline Transpose<TranspositionsBase> transpose() const
+    { return Transpose<TranspositionsBase>(derived()); }
+
+  protected:
+};
+
+namespace internal {
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
+struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
+{
+  typedef IndexType Index;
+  typedef Matrix<Index, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+};
+}
+
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
+class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
+{
+    typedef internal::traits<Transpositions> Traits;
+  public:
+
+    typedef TranspositionsBase<Transpositions> Base;
+    typedef typename Traits::IndicesType IndicesType;
+    typedef typename IndicesType::Scalar Index;
+
+    inline Transpositions() {}
+
+    /** Copy constructor. */
+    template<typename OtherDerived>
+    inline Transpositions(const TranspositionsBase<OtherDerived>& other)
+      : m_indices(other.indices()) {}
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** Standard copy constructor. Defined only to prevent a default copy constructor
+      * from hiding the other templated constructor */
+    inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {}
+    #endif
+
+    /** Generic constructor from expression of the transposition indices. */
+    template<typename Other>
+    explicit inline Transpositions(const MatrixBase<Other>& a_indices) : m_indices(a_indices)
+    {}
+
+    /** Copies the \a other transpositions into \c *this */
+    template<typename OtherDerived>
+    Transpositions& operator=(const TranspositionsBase<OtherDerived>& other)
+    {
+      return Base::operator=(other);
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    Transpositions& operator=(const Transpositions& other)
+    {
+      m_indices = other.m_indices;
+      return *this;
+    }
+    #endif
+
+    /** Constructs an uninitialized permutation matrix of given size.
+      */
+    inline Transpositions(Index size) : m_indices(size)
+    {}
+
+    /** const version of indices(). */
+    const IndicesType& indices() const { return m_indices; }
+    /** \returns a reference to the stored array representing the transpositions. */
+    IndicesType& indices() { return m_indices; }
+
+  protected:
+
+    IndicesType m_indices;
+};
+
+
+namespace internal {
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
+struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,_PacketAccess> >
+{
+  typedef IndexType Index;
+  typedef Map<const Matrix<Index,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
+};
+}
+
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int PacketAccess>
+class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess>
+ : public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess> >
+{
+    typedef internal::traits<Map> Traits;
+  public:
+
+    typedef TranspositionsBase<Map> Base;
+    typedef typename Traits::IndicesType IndicesType;
+    typedef typename IndicesType::Scalar Index;
+
+    inline Map(const Index* indicesPtr)
+      : m_indices(indicesPtr)
+    {}
+
+    inline Map(const Index* indicesPtr, Index size)
+      : m_indices(indicesPtr,size)
+    {}
+
+    /** Copies the \a other transpositions into \c *this */
+    template<typename OtherDerived>
+    Map& operator=(const TranspositionsBase<OtherDerived>& other)
+    {
+      return Base::operator=(other);
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    Map& operator=(const Map& other)
+    {
+      m_indices = other.m_indices;
+      return *this;
+    }
+    #endif
+
+    /** const version of indices(). */
+    const IndicesType& indices() const { return m_indices; }
+    
+    /** \returns a reference to the stored array representing the transpositions. */
+    IndicesType& indices() { return m_indices; }
+
+  protected:
+
+    IndicesType m_indices;
+};
+
+namespace internal {
+template<typename _IndicesType>
+struct traits<TranspositionsWrapper<_IndicesType> >
+{
+  typedef typename _IndicesType::Scalar Index;
+  typedef _IndicesType IndicesType;
+};
+}
+
+template<typename _IndicesType>
+class TranspositionsWrapper
+ : public TranspositionsBase<TranspositionsWrapper<_IndicesType> >
+{
+    typedef internal::traits<TranspositionsWrapper> Traits;
+  public:
+
+    typedef TranspositionsBase<TranspositionsWrapper> Base;
+    typedef typename Traits::IndicesType IndicesType;
+    typedef typename IndicesType::Scalar Index;
+
+    inline TranspositionsWrapper(IndicesType& a_indices)
+      : m_indices(a_indices)
+    {}
+
+    /** Copies the \a other transpositions into \c *this */
+    template<typename OtherDerived>
+    TranspositionsWrapper& operator=(const TranspositionsBase<OtherDerived>& other)
+    {
+      return Base::operator=(other);
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    TranspositionsWrapper& operator=(const TranspositionsWrapper& other)
+    {
+      m_indices = other.m_indices;
+      return *this;
+    }
+    #endif
+
+    /** const version of indices(). */
+    const IndicesType& indices() const { return m_indices; }
+
+    /** \returns a reference to the stored array representing the transpositions. */
+    IndicesType& indices() { return m_indices; }
+
+  protected:
+
+    const typename IndicesType::Nested m_indices;
+};
+
+/** \returns the \a matrix with the \a transpositions applied to the columns.
+  */
+template<typename Derived, typename TranspositionsDerived>
+inline const internal::transposition_matrix_product_retval<TranspositionsDerived, Derived, OnTheRight>
+operator*(const MatrixBase<Derived>& matrix,
+          const TranspositionsBase<TranspositionsDerived> &transpositions)
+{
+  return internal::transposition_matrix_product_retval
+           <TranspositionsDerived, Derived, OnTheRight>
+           (transpositions.derived(), matrix.derived());
+}
+
+/** \returns the \a matrix with the \a transpositions applied to the rows.
+  */
+template<typename Derived, typename TranspositionDerived>
+inline const internal::transposition_matrix_product_retval
+               <TranspositionDerived, Derived, OnTheLeft>
+operator*(const TranspositionsBase<TranspositionDerived> &transpositions,
+          const MatrixBase<Derived>& matrix)
+{
+  return internal::transposition_matrix_product_retval
+           <TranspositionDerived, Derived, OnTheLeft>
+           (transpositions.derived(), matrix.derived());
+}
+
+namespace internal {
+
+template<typename TranspositionType, typename MatrixType, int Side, bool Transposed>
+struct traits<transposition_matrix_product_retval<TranspositionType, MatrixType, Side, Transposed> >
+{
+  typedef typename MatrixType::PlainObject ReturnType;
+};
+
+template<typename TranspositionType, typename MatrixType, int Side, bool Transposed>
+struct transposition_matrix_product_retval
+ : public ReturnByValue<transposition_matrix_product_retval<TranspositionType, MatrixType, Side, Transposed> >
+{
+    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
+    typedef typename TranspositionType::Index Index;
+
+    transposition_matrix_product_retval(const TranspositionType& tr, const MatrixType& matrix)
+      : m_transpositions(tr), m_matrix(matrix)
+    {}
+
+    inline int rows() const { return m_matrix.rows(); }
+    inline int cols() const { return m_matrix.cols(); }
+
+    template<typename Dest> inline void evalTo(Dest& dst) const
+    {
+      const int size = m_transpositions.size();
+      Index j = 0;
+
+      if(!(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix)))
+        dst = m_matrix;
+
+      for(int k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
+        if((j=m_transpositions.coeff(k))!=k)
+        {
+          if(Side==OnTheLeft)
+            dst.row(k).swap(dst.row(j));
+          else if(Side==OnTheRight)
+            dst.col(k).swap(dst.col(j));
+        }
+    }
+
+  protected:
+    const TranspositionType& m_transpositions;
+    typename MatrixType::Nested m_matrix;
+};
+
+} // end namespace internal
+
+/* Template partial specialization for transposed/inverse transpositions */
+
+template<typename TranspositionsDerived>
+class Transpose<TranspositionsBase<TranspositionsDerived> >
+{
+    typedef TranspositionsDerived TranspositionType;
+    typedef typename TranspositionType::IndicesType IndicesType;
+  public:
+
+    Transpose(const TranspositionType& t) : m_transpositions(t) {}
+
+    inline int size() const { return m_transpositions.size(); }
+
+    /** \returns the \a matrix with the inverse transpositions applied to the columns.
+      */
+    template<typename Derived> friend
+    inline const internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheRight, true>
+    operator*(const MatrixBase<Derived>& matrix, const Transpose& trt)
+    {
+      return internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheRight, true>(trt.m_transpositions, matrix.derived());
+    }
+
+    /** \returns the \a matrix with the inverse transpositions applied to the rows.
+      */
+    template<typename Derived>
+    inline const internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheLeft, true>
+    operator*(const MatrixBase<Derived>& matrix) const
+    {
+      return internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheLeft, true>(m_transpositions, matrix.derived());
+    }
+
+  protected:
+    const TranspositionType& m_transpositions;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRANSPOSITIONS_H
diff --git a/include/eigen3/Eigen/src/Core/TriangularMatrix.h b/include/eigen3/Eigen/src/Core/TriangularMatrix.h
new file mode 100644
index 0000000..4d65392
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/TriangularMatrix.h
@@ -0,0 +1,839 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULARMATRIX_H
+#define EIGEN_TRIANGULARMATRIX_H
+
+namespace Eigen { 
+
+namespace internal {
+  
+template<int Side, typename TriangularType, typename Rhs> struct triangular_solve_retval;
+  
+}
+
+/** \internal
+  *
+  * \class TriangularBase
+  * \ingroup Core_Module
+  *
+  * \brief Base class for triangular part in a matrix
+  */
+template<typename Derived> class TriangularBase : public EigenBase<Derived>
+{
+  public:
+
+    enum {
+      Mode = internal::traits<Derived>::Mode,
+      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
+      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime
+    };
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::DenseMatrixType DenseMatrixType;
+    typedef DenseMatrixType DenseType;
+
+    inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); }
+
+    inline Index rows() const { return derived().rows(); }
+    inline Index cols() const { return derived().cols(); }
+    inline Index outerStride() const { return derived().outerStride(); }
+    inline Index innerStride() const { return derived().innerStride(); }
+
+    inline Scalar coeff(Index row, Index col) const  { return derived().coeff(row,col); }
+    inline Scalar& coeffRef(Index row, Index col) { return derived().coeffRef(row,col); }
+
+    /** \see MatrixBase::copyCoeff(row,col)
+      */
+    template<typename Other>
+    EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, Other& other)
+    {
+      derived().coeffRef(row, col) = other.coeff(row, col);
+    }
+
+    inline Scalar operator()(Index row, Index col) const
+    {
+      check_coordinates(row, col);
+      return coeff(row,col);
+    }
+    inline Scalar& operator()(Index row, Index col)
+    {
+      check_coordinates(row, col);
+      return coeffRef(row,col);
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    inline Derived& derived() { return *static_cast<Derived*>(this); }
+    #endif // not EIGEN_PARSED_BY_DOXYGEN
+
+    template<typename DenseDerived>
+    void evalTo(MatrixBase<DenseDerived> &other) const;
+    template<typename DenseDerived>
+    void evalToLazy(MatrixBase<DenseDerived> &other) const;
+
+    DenseMatrixType toDenseMatrix() const
+    {
+      DenseMatrixType res(rows(), cols());
+      evalToLazy(res);
+      return res;
+    }
+
+  protected:
+
+    void check_coordinates(Index row, Index col) const
+    {
+      EIGEN_ONLY_USED_FOR_DEBUG(row);
+      EIGEN_ONLY_USED_FOR_DEBUG(col);
+      eigen_assert(col>=0 && col<cols() && row>=0 && row<rows());
+      const int mode = int(Mode) & ~SelfAdjoint;
+      EIGEN_ONLY_USED_FOR_DEBUG(mode);
+      eigen_assert((mode==Upper && col>=row)
+                || (mode==Lower && col<=row)
+                || ((mode==StrictlyUpper || mode==UnitUpper) && col>row)
+                || ((mode==StrictlyLower || mode==UnitLower) && col<row));
+    }
+
+    #ifdef EIGEN_INTERNAL_DEBUGGING
+    void check_coordinates_internal(Index row, Index col) const
+    {
+      check_coordinates(row, col);
+    }
+    #else
+    void check_coordinates_internal(Index , Index ) const {}
+    #endif
+
+};
+
+/** \class TriangularView
+  * \ingroup Core_Module
+  *
+  * \brief Base class for triangular part in a matrix
+  *
+  * \param MatrixType the type of the object in which we are taking the triangular part
+  * \param Mode the kind of triangular matrix expression to construct. Can be #Upper,
+  *             #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
+  *             This is in fact a bit field; it must have either #Upper or #Lower, 
+  *             and additionnaly it may have #UnitDiag or #ZeroDiag or neither.
+  *
+  * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
+  * matrices one should speak of "trapezoid" parts. This class is the return type
+  * of MatrixBase::triangularView() and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::triangularView()
+  */
+namespace internal {
+template<typename MatrixType, unsigned int _Mode>
+struct traits<TriangularView<MatrixType, _Mode> > : traits<MatrixType>
+{
+  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
+  typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
+  typedef MatrixType ExpressionType;
+  typedef typename MatrixType::PlainObject DenseMatrixType;
+  enum {
+    Mode = _Mode,
+    Flags = (MatrixTypeNestedCleaned::Flags & (HereditaryBits) & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit))) | Mode,
+    CoeffReadCost = MatrixTypeNestedCleaned::CoeffReadCost
+  };
+};
+}
+
+template<int Mode, bool LhsIsTriangular,
+         typename Lhs, bool LhsIsVector,
+         typename Rhs, bool RhsIsVector>
+struct TriangularProduct;
+
+template<typename _MatrixType, unsigned int _Mode> class TriangularView
+  : public TriangularBase<TriangularView<_MatrixType, _Mode> >
+{
+  public:
+
+    typedef TriangularBase<TriangularView> Base;
+    typedef typename internal::traits<TriangularView>::Scalar Scalar;
+
+    typedef _MatrixType MatrixType;
+    typedef typename internal::traits<TriangularView>::DenseMatrixType DenseMatrixType;
+    typedef DenseMatrixType PlainObject;
+
+  protected:
+    typedef typename internal::traits<TriangularView>::MatrixTypeNested MatrixTypeNested;
+    typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
+    typedef typename internal::traits<TriangularView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
+
+    typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
+    
+  public:
+    using Base::evalToLazy;
+  
+
+    typedef typename internal::traits<TriangularView>::StorageKind StorageKind;
+    typedef typename internal::traits<TriangularView>::Index Index;
+
+    enum {
+      Mode = _Mode,
+      TransposeMode = (Mode & Upper ? Lower : 0)
+                    | (Mode & Lower ? Upper : 0)
+                    | (Mode & (UnitDiag))
+                    | (Mode & (ZeroDiag))
+    };
+
+    inline TriangularView(const MatrixType& matrix) : m_matrix(matrix)
+    {}
+
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }
+    inline Index outerStride() const { return m_matrix.outerStride(); }
+    inline Index innerStride() const { return m_matrix.innerStride(); }
+
+    /** \sa MatrixBase::operator+=() */
+    template<typename Other> TriangularView&  operator+=(const DenseBase<Other>& other) { return *this = m_matrix + other.derived(); }
+    /** \sa MatrixBase::operator-=() */
+    template<typename Other> TriangularView&  operator-=(const DenseBase<Other>& other) { return *this = m_matrix - other.derived(); }
+    /** \sa MatrixBase::operator*=() */
+    TriangularView&  operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = m_matrix * other; }
+    /** \sa MatrixBase::operator/=() */
+    TriangularView&  operator/=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = m_matrix / other; }
+
+    /** \sa MatrixBase::fill() */
+    void fill(const Scalar& value) { setConstant(value); }
+    /** \sa MatrixBase::setConstant() */
+    TriangularView& setConstant(const Scalar& value)
+    { return *this = MatrixType::Constant(rows(), cols(), value); }
+    /** \sa MatrixBase::setZero() */
+    TriangularView& setZero() { return setConstant(Scalar(0)); }
+    /** \sa MatrixBase::setOnes() */
+    TriangularView& setOnes() { return setConstant(Scalar(1)); }
+
+    /** \sa MatrixBase::coeff()
+      * \warning the coordinates must fit into the referenced triangular part
+      */
+    inline Scalar coeff(Index row, Index col) const
+    {
+      Base::check_coordinates_internal(row, col);
+      return m_matrix.coeff(row, col);
+    }
+
+    /** \sa MatrixBase::coeffRef()
+      * \warning the coordinates must fit into the referenced triangular part
+      */
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      Base::check_coordinates_internal(row, col);
+      return m_matrix.const_cast_derived().coeffRef(row, col);
+    }
+
+    const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
+    MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
+
+    /** Assigns a triangular matrix to a triangular part of a dense matrix */
+    template<typename OtherDerived>
+    TriangularView& operator=(const TriangularBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    TriangularView& operator=(const MatrixBase<OtherDerived>& other);
+
+    TriangularView& operator=(const TriangularView& other)
+    { return *this = other.nestedExpression(); }
+
+    template<typename OtherDerived>
+    void lazyAssign(const TriangularBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    void lazyAssign(const MatrixBase<OtherDerived>& other);
+
+    /** \sa MatrixBase::conjugate() */
+    inline TriangularView<MatrixConjugateReturnType,Mode> conjugate()
+    { return m_matrix.conjugate(); }
+    /** \sa MatrixBase::conjugate() const */
+    inline const TriangularView<MatrixConjugateReturnType,Mode> conjugate() const
+    { return m_matrix.conjugate(); }
+
+    /** \sa MatrixBase::adjoint() const */
+    inline const TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> adjoint() const
+    { return m_matrix.adjoint(); }
+
+    /** \sa MatrixBase::transpose() */
+    inline TriangularView<Transpose<MatrixType>,TransposeMode> transpose()
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      return m_matrix.const_cast_derived().transpose();
+    }
+    /** \sa MatrixBase::transpose() const */
+    inline const TriangularView<Transpose<MatrixType>,TransposeMode> transpose() const
+    {
+      return m_matrix.transpose();
+    }
+
+    /** Efficient triangular matrix times vector/matrix product */
+    template<typename OtherDerived>
+    TriangularProduct<Mode, true, MatrixType, false, OtherDerived, OtherDerived::ColsAtCompileTime==1>
+    operator*(const MatrixBase<OtherDerived>& rhs) const
+    {
+      return TriangularProduct
+              <Mode, true, MatrixType, false, OtherDerived, OtherDerived::ColsAtCompileTime==1>
+              (m_matrix, rhs.derived());
+    }
+
+    /** Efficient vector/matrix times triangular matrix product */
+    template<typename OtherDerived> friend
+    TriangularProduct<Mode, false, OtherDerived, OtherDerived::RowsAtCompileTime==1, MatrixType, false>
+    operator*(const MatrixBase<OtherDerived>& lhs, const TriangularView& rhs)
+    {
+      return TriangularProduct
+              <Mode, false, OtherDerived, OtherDerived::RowsAtCompileTime==1, MatrixType, false>
+              (lhs.derived(),rhs.m_matrix);
+    }
+
+    #ifdef EIGEN2_SUPPORT
+    template<typename OtherDerived>
+    struct eigen2_product_return_type
+    {
+      typedef typename TriangularView<MatrixType,Mode>::DenseMatrixType DenseMatrixType;
+      typedef typename OtherDerived::PlainObject::DenseType OtherPlainObject;
+      typedef typename ProductReturnType<DenseMatrixType, OtherPlainObject>::Type ProdRetType;
+      typedef typename ProdRetType::PlainObject type;
+    };
+    template<typename OtherDerived>
+    const typename eigen2_product_return_type<OtherDerived>::type
+    operator*(const EigenBase<OtherDerived>& rhs) const
+    {
+      typename OtherDerived::PlainObject::DenseType rhsPlainObject;
+      rhs.evalTo(rhsPlainObject);
+      return this->toDenseMatrix() * rhsPlainObject;
+    }
+    template<typename OtherMatrixType>
+    bool isApprox(const TriangularView<OtherMatrixType, Mode>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
+    {
+      return this->toDenseMatrix().isApprox(other.toDenseMatrix(), precision);
+    }
+    template<typename OtherDerived>
+    bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
+    {
+      return this->toDenseMatrix().isApprox(other, precision);
+    }
+    #endif // EIGEN2_SUPPORT
+
+    template<int Side, typename Other>
+    inline const internal::triangular_solve_retval<Side,TriangularView, Other>
+    solve(const MatrixBase<Other>& other) const;
+
+    template<int Side, typename OtherDerived>
+    void solveInPlace(const MatrixBase<OtherDerived>& other) const;
+
+    template<typename Other>
+    inline const internal::triangular_solve_retval<OnTheLeft,TriangularView, Other> 
+    solve(const MatrixBase<Other>& other) const
+    { return solve<OnTheLeft>(other); }
+
+    template<typename OtherDerived>
+    void solveInPlace(const MatrixBase<OtherDerived>& other) const
+    { return solveInPlace<OnTheLeft>(other); }
+
+    const SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView() const
+    {
+      EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
+      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
+    }
+    SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView()
+    {
+      EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
+      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
+    }
+
+    template<typename OtherDerived>
+    void swap(TriangularBase<OtherDerived> const & other)
+    {
+      TriangularView<SwapWrapper<MatrixType>,Mode>(const_cast<MatrixType&>(m_matrix)).lazyAssign(other.derived());
+    }
+
+    template<typename OtherDerived>
+    void swap(MatrixBase<OtherDerived> const & other)
+    {
+      SwapWrapper<MatrixType> swaper(const_cast<MatrixType&>(m_matrix));
+      TriangularView<SwapWrapper<MatrixType>,Mode>(swaper).lazyAssign(other.derived());
+    }
+
+    Scalar determinant() const
+    {
+      if (Mode & UnitDiag)
+        return 1;
+      else if (Mode & ZeroDiag)
+        return 0;
+      else
+        return m_matrix.diagonal().prod();
+    }
+    
+    // TODO simplify the following:
+    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_STRONG_INLINE TriangularView& operator=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
+    {
+      setZero();
+      return assignProduct(other.derived(),1);
+    }
+    
+    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_STRONG_INLINE TriangularView& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
+    {
+      return assignProduct(other.derived(),1);
+    }
+    
+    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_STRONG_INLINE TriangularView& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
+    {
+      return assignProduct(other.derived(),-1);
+    }
+    
+    
+    template<typename ProductDerived>
+    EIGEN_STRONG_INLINE TriangularView& operator=(const ScaledProduct<ProductDerived>& other)
+    {
+      setZero();
+      return assignProduct(other.derived(),other.alpha());
+    }
+    
+    template<typename ProductDerived>
+    EIGEN_STRONG_INLINE TriangularView& operator+=(const ScaledProduct<ProductDerived>& other)
+    {
+      return assignProduct(other.derived(),other.alpha());
+    }
+    
+    template<typename ProductDerived>
+    EIGEN_STRONG_INLINE TriangularView& operator-=(const ScaledProduct<ProductDerived>& other)
+    {
+      return assignProduct(other.derived(),-other.alpha());
+    }
+    
+  protected:
+    
+    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_STRONG_INLINE TriangularView& assignProduct(const ProductBase<ProductDerived, Lhs,Rhs>& prod, const Scalar& alpha);
+    
+    template<int Mode, bool LhsIsTriangular,
+         typename Lhs, bool LhsIsVector,
+         typename Rhs, bool RhsIsVector>
+    EIGEN_STRONG_INLINE TriangularView& assignProduct(const TriangularProduct<Mode, LhsIsTriangular, Lhs, LhsIsVector, Rhs, RhsIsVector>& prod, const Scalar& alpha)
+    {
+      lazyAssign(alpha*prod.eval());
+      return *this;
+    }
+
+    MatrixTypeNested m_matrix;
+};
+
+/***************************************************************************
+* Implementation of triangular evaluation/assignment
+***************************************************************************/
+
+namespace internal {
+
+template<typename Derived1, typename Derived2, unsigned int Mode, int UnrollCount, bool ClearOpposite>
+struct triangular_assignment_selector
+{
+  enum {
+    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
+    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
+  };
+  
+  typedef typename Derived1::Scalar Scalar;
+
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    triangular_assignment_selector<Derived1, Derived2, Mode, UnrollCount-1, ClearOpposite>::run(dst, src);
+
+    eigen_assert( Mode == Upper || Mode == Lower
+            || Mode == StrictlyUpper || Mode == StrictlyLower
+            || Mode == UnitUpper || Mode == UnitLower);
+    if((Mode == Upper && row <= col)
+    || (Mode == Lower && row >= col)
+    || (Mode == StrictlyUpper && row < col)
+    || (Mode == StrictlyLower && row > col)
+    || (Mode == UnitUpper && row < col)
+    || (Mode == UnitLower && row > col))
+      dst.copyCoeff(row, col, src);
+    else if(ClearOpposite)
+    {
+      if (Mode&UnitDiag && row==col)
+        dst.coeffRef(row, col) = Scalar(1);
+      else
+        dst.coeffRef(row, col) = Scalar(0);
+    }
+  }
+};
+
+// prevent buggy user code from causing an infinite recursion
+template<typename Derived1, typename Derived2, unsigned int Mode, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, Mode, 0, ClearOpposite>
+{
+  static inline void run(Derived1 &, const Derived2 &) {}
+};
+
+template<typename Derived1, typename Derived2, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, Upper, Dynamic, ClearOpposite>
+{
+  typedef typename Derived1::Index Index;
+  typedef typename Derived1::Scalar Scalar;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    for(Index j = 0; j < dst.cols(); ++j)
+    {
+      Index maxi = (std::min)(j, dst.rows()-1);
+      for(Index i = 0; i <= maxi; ++i)
+        dst.copyCoeff(i, j, src);
+      if (ClearOpposite)
+        for(Index i = maxi+1; i < dst.rows(); ++i)
+          dst.coeffRef(i, j) = Scalar(0);
+    }
+  }
+};
+
+template<typename Derived1, typename Derived2, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, Lower, Dynamic, ClearOpposite>
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    for(Index j = 0; j < dst.cols(); ++j)
+    {
+      for(Index i = j; i < dst.rows(); ++i)
+        dst.copyCoeff(i, j, src);
+      Index maxi = (std::min)(j, dst.rows());
+      if (ClearOpposite)
+        for(Index i = 0; i < maxi; ++i)
+          dst.coeffRef(i, j) = static_cast<typename Derived1::Scalar>(0);
+    }
+  }
+};
+
+template<typename Derived1, typename Derived2, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, StrictlyUpper, Dynamic, ClearOpposite>
+{
+  typedef typename Derived1::Index Index;
+  typedef typename Derived1::Scalar Scalar;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    for(Index j = 0; j < dst.cols(); ++j)
+    {
+      Index maxi = (std::min)(j, dst.rows());
+      for(Index i = 0; i < maxi; ++i)
+        dst.copyCoeff(i, j, src);
+      if (ClearOpposite)
+        for(Index i = maxi; i < dst.rows(); ++i)
+          dst.coeffRef(i, j) = Scalar(0);
+    }
+  }
+};
+
+template<typename Derived1, typename Derived2, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, StrictlyLower, Dynamic, ClearOpposite>
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    for(Index j = 0; j < dst.cols(); ++j)
+    {
+      for(Index i = j+1; i < dst.rows(); ++i)
+        dst.copyCoeff(i, j, src);
+      Index maxi = (std::min)(j, dst.rows()-1);
+      if (ClearOpposite)
+        for(Index i = 0; i <= maxi; ++i)
+          dst.coeffRef(i, j) = static_cast<typename Derived1::Scalar>(0);
+    }
+  }
+};
+
+template<typename Derived1, typename Derived2, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, UnitUpper, Dynamic, ClearOpposite>
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    for(Index j = 0; j < dst.cols(); ++j)
+    {
+      Index maxi = (std::min)(j, dst.rows());
+      for(Index i = 0; i < maxi; ++i)
+        dst.copyCoeff(i, j, src);
+      if (ClearOpposite)
+      {
+        for(Index i = maxi+1; i < dst.rows(); ++i)
+          dst.coeffRef(i, j) = 0;
+      }
+    }
+    dst.diagonal().setOnes();
+  }
+};
+template<typename Derived1, typename Derived2, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, UnitLower, Dynamic, ClearOpposite>
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    for(Index j = 0; j < dst.cols(); ++j)
+    {
+      Index maxi = (std::min)(j, dst.rows());
+      for(Index i = maxi+1; i < dst.rows(); ++i)
+        dst.copyCoeff(i, j, src);
+      if (ClearOpposite)
+      {
+        for(Index i = 0; i < maxi; ++i)
+          dst.coeffRef(i, j) = 0;
+      }
+    }
+    dst.diagonal().setOnes();
+  }
+};
+
+} // end namespace internal
+
+// FIXME should we keep that possibility
+template<typename MatrixType, unsigned int Mode>
+template<typename OtherDerived>
+inline TriangularView<MatrixType, Mode>&
+TriangularView<MatrixType, Mode>::operator=(const MatrixBase<OtherDerived>& other)
+{
+  if(OtherDerived::Flags & EvalBeforeAssigningBit)
+  {
+    typename internal::plain_matrix_type<OtherDerived>::type other_evaluated(other.rows(), other.cols());
+    other_evaluated.template triangularView<Mode>().lazyAssign(other.derived());
+    lazyAssign(other_evaluated);
+  }
+  else
+    lazyAssign(other.derived());
+  return *this;
+}
+
+// FIXME should we keep that possibility
+template<typename MatrixType, unsigned int Mode>
+template<typename OtherDerived>
+void TriangularView<MatrixType, Mode>::lazyAssign(const MatrixBase<OtherDerived>& other)
+{
+  enum {
+    unroll = MatrixType::SizeAtCompileTime != Dynamic
+          && internal::traits<OtherDerived>::CoeffReadCost != Dynamic
+          && MatrixType::SizeAtCompileTime*internal::traits<OtherDerived>::CoeffReadCost/2 <= EIGEN_UNROLLING_LIMIT
+  };
+  eigen_assert(m_matrix.rows() == other.rows() && m_matrix.cols() == other.cols());
+
+  internal::triangular_assignment_selector
+    <MatrixType, OtherDerived, int(Mode),
+    unroll ? int(MatrixType::SizeAtCompileTime) : Dynamic,
+    false // do not change the opposite triangular part
+    >::run(m_matrix.const_cast_derived(), other.derived());
+}
+
+
+
+template<typename MatrixType, unsigned int Mode>
+template<typename OtherDerived>
+inline TriangularView<MatrixType, Mode>&
+TriangularView<MatrixType, Mode>::operator=(const TriangularBase<OtherDerived>& other)
+{
+  eigen_assert(Mode == int(OtherDerived::Mode));
+  if(internal::traits<OtherDerived>::Flags & EvalBeforeAssigningBit)
+  {
+    typename OtherDerived::DenseMatrixType other_evaluated(other.rows(), other.cols());
+    other_evaluated.template triangularView<Mode>().lazyAssign(other.derived().nestedExpression());
+    lazyAssign(other_evaluated);
+  }
+  else
+    lazyAssign(other.derived().nestedExpression());
+  return *this;
+}
+
+template<typename MatrixType, unsigned int Mode>
+template<typename OtherDerived>
+void TriangularView<MatrixType, Mode>::lazyAssign(const TriangularBase<OtherDerived>& other)
+{
+  enum {
+    unroll = MatrixType::SizeAtCompileTime != Dynamic
+                   && internal::traits<OtherDerived>::CoeffReadCost != Dynamic
+                   && MatrixType::SizeAtCompileTime * internal::traits<OtherDerived>::CoeffReadCost / 2
+                        <= EIGEN_UNROLLING_LIMIT
+  };
+  eigen_assert(m_matrix.rows() == other.rows() && m_matrix.cols() == other.cols());
+
+  internal::triangular_assignment_selector
+    <MatrixType, OtherDerived, int(Mode),
+    unroll ? int(MatrixType::SizeAtCompileTime) : Dynamic,
+    false // preserve the opposite triangular part
+    >::run(m_matrix.const_cast_derived(), other.derived().nestedExpression());
+}
+
+/***************************************************************************
+* Implementation of TriangularBase methods
+***************************************************************************/
+
+/** Assigns a triangular or selfadjoint matrix to a dense matrix.
+  * If the matrix is triangular, the opposite part is set to zero. */
+template<typename Derived>
+template<typename DenseDerived>
+void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
+{
+  if(internal::traits<Derived>::Flags & EvalBeforeAssigningBit)
+  {
+    typename internal::plain_matrix_type<Derived>::type other_evaluated(rows(), cols());
+    evalToLazy(other_evaluated);
+    other.derived().swap(other_evaluated);
+  }
+  else
+    evalToLazy(other.derived());
+}
+
+/** Assigns a triangular or selfadjoint matrix to a dense matrix.
+  * If the matrix is triangular, the opposite part is set to zero. */
+template<typename Derived>
+template<typename DenseDerived>
+void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
+{
+  enum {
+    unroll = DenseDerived::SizeAtCompileTime != Dynamic
+                   && internal::traits<Derived>::CoeffReadCost != Dynamic
+                   && DenseDerived::SizeAtCompileTime * internal::traits<Derived>::CoeffReadCost / 2
+                        <= EIGEN_UNROLLING_LIMIT
+  };
+  other.derived().resize(this->rows(), this->cols());
+
+  internal::triangular_assignment_selector
+    <DenseDerived, typename internal::traits<Derived>::MatrixTypeNestedCleaned, Derived::Mode,
+    unroll ? int(DenseDerived::SizeAtCompileTime) : Dynamic,
+    true // clear the opposite triangular part
+    >::run(other.derived(), derived().nestedExpression());
+}
+
+/***************************************************************************
+* Implementation of TriangularView methods
+***************************************************************************/
+
+/***************************************************************************
+* Implementation of MatrixBase methods
+***************************************************************************/
+
+#ifdef EIGEN2_SUPPORT
+
+// implementation of part<>(), including the SelfAdjoint case.
+
+namespace internal {
+template<typename MatrixType, unsigned int Mode>
+struct eigen2_part_return_type
+{
+  typedef TriangularView<MatrixType, Mode> type;
+};
+
+template<typename MatrixType>
+struct eigen2_part_return_type<MatrixType, SelfAdjoint>
+{
+  typedef SelfAdjointView<MatrixType, Upper> type;
+};
+}
+
+/** \deprecated use MatrixBase::triangularView() */
+template<typename Derived>
+template<unsigned int Mode>
+const typename internal::eigen2_part_return_type<Derived, Mode>::type MatrixBase<Derived>::part() const
+{
+  return derived();
+}
+
+/** \deprecated use MatrixBase::triangularView() */
+template<typename Derived>
+template<unsigned int Mode>
+typename internal::eigen2_part_return_type<Derived, Mode>::type MatrixBase<Derived>::part()
+{
+  return derived();
+}
+#endif
+
+/**
+  * \returns an expression of a triangular view extracted from the current matrix
+  *
+  * The parameter \a Mode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
+  * \c #Lower, \c #StrictlyLower, \c #UnitLower.
+  *
+  * Example: \include MatrixBase_extract.cpp
+  * Output: \verbinclude MatrixBase_extract.out
+  *
+  * \sa class TriangularView
+  */
+template<typename Derived>
+template<unsigned int Mode>
+typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
+MatrixBase<Derived>::triangularView()
+{
+  return derived();
+}
+
+/** This is the const version of MatrixBase::triangularView() */
+template<typename Derived>
+template<unsigned int Mode>
+typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
+MatrixBase<Derived>::triangularView() const
+{
+  return derived();
+}
+
+/** \returns true if *this is approximately equal to an upper triangular matrix,
+  *          within the precision given by \a prec.
+  *
+  * \sa isLowerTriangular()
+  */
+template<typename Derived>
+bool MatrixBase<Derived>::isUpperTriangular(const RealScalar& prec) const
+{
+  using std::abs;
+  RealScalar maxAbsOnUpperPart = static_cast<RealScalar>(-1);
+  for(Index j = 0; j < cols(); ++j)
+  {
+    Index maxi = (std::min)(j, rows()-1);
+    for(Index i = 0; i <= maxi; ++i)
+    {
+      RealScalar absValue = abs(coeff(i,j));
+      if(absValue > maxAbsOnUpperPart) maxAbsOnUpperPart = absValue;
+    }
+  }
+  RealScalar threshold = maxAbsOnUpperPart * prec;
+  for(Index j = 0; j < cols(); ++j)
+    for(Index i = j+1; i < rows(); ++i)
+      if(abs(coeff(i, j)) > threshold) return false;
+  return true;
+}
+
+/** \returns true if *this is approximately equal to a lower triangular matrix,
+  *          within the precision given by \a prec.
+  *
+  * \sa isUpperTriangular()
+  */
+template<typename Derived>
+bool MatrixBase<Derived>::isLowerTriangular(const RealScalar& prec) const
+{
+  using std::abs;
+  RealScalar maxAbsOnLowerPart = static_cast<RealScalar>(-1);
+  for(Index j = 0; j < cols(); ++j)
+    for(Index i = j; i < rows(); ++i)
+    {
+      RealScalar absValue = abs(coeff(i,j));
+      if(absValue > maxAbsOnLowerPart) maxAbsOnLowerPart = absValue;
+    }
+  RealScalar threshold = maxAbsOnLowerPart * prec;
+  for(Index j = 1; j < cols(); ++j)
+  {
+    Index maxi = (std::min)(j, rows()-1);
+    for(Index i = 0; i < maxi; ++i)
+      if(abs(coeff(i, j)) > threshold) return false;
+  }
+  return true;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULARMATRIX_H
diff --git a/include/eigen3/Eigen/src/Core/VectorBlock.h b/include/eigen3/Eigen/src/Core/VectorBlock.h
new file mode 100644
index 0000000..1a7330f
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/VectorBlock.h
@@ -0,0 +1,95 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_VECTORBLOCK_H
+#define EIGEN_VECTORBLOCK_H
+
+namespace Eigen { 
+
+/** \class VectorBlock
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a fixed-size or dynamic-size sub-vector
+  *
+  * \param VectorType the type of the object in which we are taking a sub-vector
+  * \param Size size of the sub-vector we are taking at compile time (optional)
+  *
+  * This class represents an expression of either a fixed-size or dynamic-size sub-vector.
+  * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
+  * most of the time this is the only way it is used.
+  *
+  * However, if you want to directly maniputate sub-vector expressions,
+  * for instance if you want to write a function returning such an expression, you
+  * will need to use this class.
+  *
+  * Here is an example illustrating the dynamic case:
+  * \include class_VectorBlock.cpp
+  * Output: \verbinclude class_VectorBlock.out
+  *
+  * \note Even though this expression has dynamic size, in the case where \a VectorType
+  * has fixed size, this expression inherits a fixed maximal size which means that evaluating
+  * it does not cause a dynamic memory allocation.
+  *
+  * Here is an example illustrating the fixed-size case:
+  * \include class_FixedVectorBlock.cpp
+  * Output: \verbinclude class_FixedVectorBlock.out
+  *
+  * \sa class Block, DenseBase::segment(Index,Index,Index,Index), DenseBase::segment(Index,Index)
+  */
+
+namespace internal {
+template<typename VectorType, int Size>
+struct traits<VectorBlock<VectorType, Size> >
+  : public traits<Block<VectorType,
+                     traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
+                     traits<VectorType>::Flags & RowMajorBit ? Size : 1> >
+{
+};
+}
+
+template<typename VectorType, int Size> class VectorBlock
+  : public Block<VectorType,
+                     internal::traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
+                     internal::traits<VectorType>::Flags & RowMajorBit ? Size : 1>
+{
+    typedef Block<VectorType,
+                     internal::traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
+                     internal::traits<VectorType>::Flags & RowMajorBit ? Size : 1> Base;
+    enum {
+      IsColVector = !(internal::traits<VectorType>::Flags & RowMajorBit)
+    };
+  public:
+    EIGEN_DENSE_PUBLIC_INTERFACE(VectorBlock)
+
+    using Base::operator=;
+
+    /** Dynamic-size constructor
+      */
+    inline VectorBlock(VectorType& vector, Index start, Index size)
+      : Base(vector,
+             IsColVector ? start : 0, IsColVector ? 0 : start,
+             IsColVector ? size  : 1, IsColVector ? 1 : size)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock);
+    }
+
+    /** Fixed-size constructor
+      */
+    inline VectorBlock(VectorType& vector, Index start)
+      : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock);
+    }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_VECTORBLOCK_H
diff --git a/include/eigen3/Eigen/src/Core/VectorwiseOp.h b/include/eigen3/Eigen/src/Core/VectorwiseOp.h
new file mode 100644
index 0000000..d5ab036
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/VectorwiseOp.h
@@ -0,0 +1,642 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARTIAL_REDUX_H
+#define EIGEN_PARTIAL_REDUX_H
+
+namespace Eigen { 
+
+/** \class PartialReduxExpr
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression of a partially reduxed matrix
+  *
+  * \tparam MatrixType the type of the matrix we are applying the redux operation
+  * \tparam MemberOp type of the member functor
+  * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal)
+  *
+  * This class represents an expression of a partial redux operator of a matrix.
+  * It is the return type of some VectorwiseOp functions,
+  * and most of the time this is the only way it is used.
+  *
+  * \sa class VectorwiseOp
+  */
+
+template< typename MatrixType, typename MemberOp, int Direction>
+class PartialReduxExpr;
+
+namespace internal {
+template<typename MatrixType, typename MemberOp, int Direction>
+struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> >
+ : traits<MatrixType>
+{
+  typedef typename MemberOp::result_type Scalar;
+  typedef typename traits<MatrixType>::StorageKind StorageKind;
+  typedef typename traits<MatrixType>::XprKind XprKind;
+  typedef typename MatrixType::Scalar InputScalar;
+  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+  enum {
+    RowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::MaxColsAtCompileTime,
+    Flags0 = (unsigned int)_MatrixTypeNested::Flags & HereditaryBits,
+    Flags = (Flags0 & ~RowMajorBit) | (RowsAtCompileTime == 1 ? RowMajorBit : 0),
+    TraversalSize = Direction==Vertical ? MatrixType::RowsAtCompileTime :  MatrixType::ColsAtCompileTime
+  };
+  #if EIGEN_GNUC_AT_LEAST(3,4)
+  typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
+  #else
+  typedef typename MemberOp::template Cost<InputScalar,TraversalSize> CostOpType;
+  #endif
+  enum {
+    CoeffReadCost = TraversalSize==Dynamic ? Dynamic
+                  : TraversalSize * traits<_MatrixTypeNested>::CoeffReadCost + int(CostOpType::value)
+  };
+};
+}
+
+template< typename MatrixType, typename MemberOp, int Direction>
+class PartialReduxExpr : internal::no_assignment_operator,
+  public internal::dense_xpr_base< PartialReduxExpr<MatrixType, MemberOp, Direction> >::type
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<PartialReduxExpr>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(PartialReduxExpr)
+    typedef typename internal::traits<PartialReduxExpr>::MatrixTypeNested MatrixTypeNested;
+    typedef typename internal::traits<PartialReduxExpr>::_MatrixTypeNested _MatrixTypeNested;
+
+    PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
+      : m_matrix(mat), m_functor(func) {}
+
+    Index rows() const { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
+    Index cols() const { return (Direction==Horizontal ? 1 : m_matrix.cols()); }
+
+    EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const
+    {
+      if (Direction==Vertical)
+        return m_functor(m_matrix.col(j));
+      else
+        return m_functor(m_matrix.row(i));
+    }
+
+    const Scalar coeff(Index index) const
+    {
+      if (Direction==Vertical)
+        return m_functor(m_matrix.col(index));
+      else
+        return m_functor(m_matrix.row(index));
+    }
+
+  protected:
+    MatrixTypeNested m_matrix;
+    const MemberOp m_functor;
+};
+
+#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST)                               \
+  template <typename ResultType>                                        \
+  struct member_##MEMBER {                                              \
+    EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER)                            \
+    typedef ResultType result_type;                                     \
+    template<typename Scalar, int Size> struct Cost                     \
+    { enum { value = COST }; };                                         \
+    template<typename XprType>                                          \
+    EIGEN_STRONG_INLINE ResultType operator()(const XprType& mat) const \
+    { return mat.MEMBER(); } \
+  }
+
+namespace internal {
+
+EIGEN_MEMBER_FUNCTOR(squaredNorm, Size * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits<scalar_hypot_op<Scalar> >::Cost );
+EIGEN_MEMBER_FUNCTOR(sum, (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(mean, (Size-1)*NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost);
+EIGEN_MEMBER_FUNCTOR(minCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost);
+
+
+template <typename BinaryOp, typename Scalar>
+struct member_redux {
+  typedef typename result_of<
+                     BinaryOp(Scalar)
+                   >::type  result_type;
+  template<typename _Scalar, int Size> struct Cost
+  { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
+  member_redux(const BinaryOp func) : m_functor(func) {}
+  template<typename Derived>
+  inline result_type operator()(const DenseBase<Derived>& mat) const
+  { return mat.redux(m_functor); }
+  const BinaryOp m_functor;
+};
+}
+
+/** \class VectorwiseOp
+  * \ingroup Core_Module
+  *
+  * \brief Pseudo expression providing partial reduction operations
+  *
+  * \param ExpressionType the type of the object on which to do partial reductions
+  * \param Direction indicates the direction of the redux (#Vertical or #Horizontal)
+  *
+  * This class represents a pseudo expression with partial reduction features.
+  * It is the return type of DenseBase::colwise() and DenseBase::rowwise()
+  * and most of the time this is the only way it is used.
+  *
+  * Example: \include MatrixBase_colwise.cpp
+  * Output: \verbinclude MatrixBase_colwise.out
+  *
+  * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr
+  */
+template<typename ExpressionType, int Direction> class VectorwiseOp
+{
+  public:
+
+    typedef typename ExpressionType::Scalar Scalar;
+    typedef typename ExpressionType::RealScalar RealScalar;
+    typedef typename ExpressionType::Index Index;
+    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
+        ExpressionType, ExpressionType&>::type ExpressionTypeNested;
+    typedef typename internal::remove_all<ExpressionTypeNested>::type ExpressionTypeNestedCleaned;
+
+    template<template<typename _Scalar> class Functor,
+                      typename Scalar=typename internal::traits<ExpressionType>::Scalar> struct ReturnType
+    {
+      typedef PartialReduxExpr<ExpressionType,
+                               Functor<Scalar>,
+                               Direction
+                              > Type;
+    };
+
+    template<typename BinaryOp> struct ReduxReturnType
+    {
+      typedef PartialReduxExpr<ExpressionType,
+                               internal::member_redux<BinaryOp,typename internal::traits<ExpressionType>::Scalar>,
+                               Direction
+                              > Type;
+    };
+
+    enum {
+      IsVertical   = (Direction==Vertical) ? 1 : 0,
+      IsHorizontal = (Direction==Horizontal) ? 1 : 0
+    };
+
+  protected:
+
+    /** \internal
+      * \returns the i-th subvector according to the \c Direction */
+    typedef typename internal::conditional<Direction==Vertical,
+                               typename ExpressionType::ColXpr,
+                               typename ExpressionType::RowXpr>::type SubVector;
+    SubVector subVector(Index i)
+    {
+      return SubVector(m_matrix.derived(),i);
+    }
+
+    /** \internal
+      * \returns the number of subvectors in the direction \c Direction */
+    Index subVectors() const
+    { return Direction==Vertical?m_matrix.cols():m_matrix.rows(); }
+
+    template<typename OtherDerived> struct ExtendedType {
+      typedef Replicate<OtherDerived,
+                        Direction==Vertical   ? 1 : ExpressionType::RowsAtCompileTime,
+                        Direction==Horizontal ? 1 : ExpressionType::ColsAtCompileTime> Type;
+    };
+
+    /** \internal
+      * Replicates a vector to match the size of \c *this */
+    template<typename OtherDerived>
+    typename ExtendedType<OtherDerived>::Type
+    extendedTo(const DenseBase<OtherDerived>& other) const
+    {
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Vertical, OtherDerived::MaxColsAtCompileTime==1),
+                          YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Horizontal, OtherDerived::MaxRowsAtCompileTime==1),
+                          YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
+      return typename ExtendedType<OtherDerived>::Type
+                      (other.derived(),
+                       Direction==Vertical   ? 1 : m_matrix.rows(),
+                       Direction==Horizontal ? 1 : m_matrix.cols());
+    }
+    
+    template<typename OtherDerived> struct OppositeExtendedType {
+      typedef Replicate<OtherDerived,
+                        Direction==Horizontal ? 1 : ExpressionType::RowsAtCompileTime,
+                        Direction==Vertical   ? 1 : ExpressionType::ColsAtCompileTime> Type;
+    };
+
+    /** \internal
+      * Replicates a vector in the opposite direction to match the size of \c *this */
+    template<typename OtherDerived>
+    typename OppositeExtendedType<OtherDerived>::Type
+    extendedToOpposite(const DenseBase<OtherDerived>& other) const
+    {
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Horizontal, OtherDerived::MaxColsAtCompileTime==1),
+                          YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Vertical, OtherDerived::MaxRowsAtCompileTime==1),
+                          YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
+      return typename OppositeExtendedType<OtherDerived>::Type
+                      (other.derived(),
+                       Direction==Horizontal  ? 1 : m_matrix.rows(),
+                       Direction==Vertical    ? 1 : m_matrix.cols());
+    }
+
+  public:
+
+    inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}
+
+    /** \internal */
+    inline const ExpressionType& _expression() const { return m_matrix; }
+
+    /** \returns a row or column vector expression of \c *this reduxed by \a func
+      *
+      * The template parameter \a BinaryOp is the type of the functor
+      * of the custom redux operator. Note that func must be an associative operator.
+      *
+      * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
+      */
+    template<typename BinaryOp>
+    const typename ReduxReturnType<BinaryOp>::Type
+    redux(const BinaryOp& func = BinaryOp()) const
+    { return typename ReduxReturnType<BinaryOp>::Type(_expression(), func); }
+
+    /** \returns a row (or column) vector expression of the smallest coefficient
+      * of each column (or row) of the referenced expression.
+      * 
+      * \warning the result is undefined if \c *this contains NaN.
+      *
+      * Example: \include PartialRedux_minCoeff.cpp
+      * Output: \verbinclude PartialRedux_minCoeff.out
+      *
+      * \sa DenseBase::minCoeff() */
+    const typename ReturnType<internal::member_minCoeff>::Type minCoeff() const
+    { return _expression(); }
+
+    /** \returns a row (or column) vector expression of the largest coefficient
+      * of each column (or row) of the referenced expression.
+      * 
+      * \warning the result is undefined if \c *this contains NaN.
+      *
+      * Example: \include PartialRedux_maxCoeff.cpp
+      * Output: \verbinclude PartialRedux_maxCoeff.out
+      *
+      * \sa DenseBase::maxCoeff() */
+    const typename ReturnType<internal::member_maxCoeff>::Type maxCoeff() const
+    { return _expression(); }
+
+    /** \returns a row (or column) vector expression of the squared norm
+      * of each column (or row) of the referenced expression.
+      *
+      * Example: \include PartialRedux_squaredNorm.cpp
+      * Output: \verbinclude PartialRedux_squaredNorm.out
+      *
+      * \sa DenseBase::squaredNorm() */
+    const typename ReturnType<internal::member_squaredNorm,RealScalar>::Type squaredNorm() const
+    { return _expression(); }
+
+    /** \returns a row (or column) vector expression of the norm
+      * of each column (or row) of the referenced expression.
+      *
+      * Example: \include PartialRedux_norm.cpp
+      * Output: \verbinclude PartialRedux_norm.out
+      *
+      * \sa DenseBase::norm() */
+    const typename ReturnType<internal::member_norm,RealScalar>::Type norm() const
+    { return _expression(); }
+
+
+    /** \returns a row (or column) vector expression of the norm
+      * of each column (or row) of the referenced expression, using
+      * blue's algorithm.
+      *
+      * \sa DenseBase::blueNorm() */
+    const typename ReturnType<internal::member_blueNorm,RealScalar>::Type blueNorm() const
+    { return _expression(); }
+
+
+    /** \returns a row (or column) vector expression of the norm
+      * of each column (or row) of the referenced expression, avoiding
+      * underflow and overflow.
+      *
+      * \sa DenseBase::stableNorm() */
+    const typename ReturnType<internal::member_stableNorm,RealScalar>::Type stableNorm() const
+    { return _expression(); }
+
+
+    /** \returns a row (or column) vector expression of the norm
+      * of each column (or row) of the referenced expression, avoiding
+      * underflow and overflow using a concatenation of hypot() calls.
+      *
+      * \sa DenseBase::hypotNorm() */
+    const typename ReturnType<internal::member_hypotNorm,RealScalar>::Type hypotNorm() const
+    { return _expression(); }
+
+    /** \returns a row (or column) vector expression of the sum
+      * of each column (or row) of the referenced expression.
+      *
+      * Example: \include PartialRedux_sum.cpp
+      * Output: \verbinclude PartialRedux_sum.out
+      *
+      * \sa DenseBase::sum() */
+    const typename ReturnType<internal::member_sum>::Type sum() const
+    { return _expression(); }
+
+    /** \returns a row (or column) vector expression of the mean
+    * of each column (or row) of the referenced expression.
+    *
+    * \sa DenseBase::mean() */
+    const typename ReturnType<internal::member_mean>::Type mean() const
+    { return _expression(); }
+
+    /** \returns a row (or column) vector expression representing
+      * whether \b all coefficients of each respective column (or row) are \c true.
+      *
+      * \sa DenseBase::all() */
+    const typename ReturnType<internal::member_all>::Type all() const
+    { return _expression(); }
+
+    /** \returns a row (or column) vector expression representing
+      * whether \b at \b least one coefficient of each respective column (or row) is \c true.
+      *
+      * \sa DenseBase::any() */
+    const typename ReturnType<internal::member_any>::Type any() const
+    { return _expression(); }
+
+    /** \returns a row (or column) vector expression representing
+      * the number of \c true coefficients of each respective column (or row).
+      *
+      * Example: \include PartialRedux_count.cpp
+      * Output: \verbinclude PartialRedux_count.out
+      *
+      * \sa DenseBase::count() */
+    const PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> count() const
+    { return _expression(); }
+
+    /** \returns a row (or column) vector expression of the product
+      * of each column (or row) of the referenced expression.
+      *
+      * Example: \include PartialRedux_prod.cpp
+      * Output: \verbinclude PartialRedux_prod.out
+      *
+      * \sa DenseBase::prod() */
+    const typename ReturnType<internal::member_prod>::Type prod() const
+    { return _expression(); }
+
+
+    /** \returns a matrix expression
+      * where each column (or row) are reversed.
+      *
+      * Example: \include Vectorwise_reverse.cpp
+      * Output: \verbinclude Vectorwise_reverse.out
+      *
+      * \sa DenseBase::reverse() */
+    const Reverse<ExpressionType, Direction> reverse() const
+    { return Reverse<ExpressionType, Direction>( _expression() ); }
+
+    typedef Replicate<ExpressionType,Direction==Vertical?Dynamic:1,Direction==Horizontal?Dynamic:1> ReplicateReturnType;
+    const ReplicateReturnType replicate(Index factor) const;
+
+    /**
+      * \return an expression of the replication of each column (or row) of \c *this
+      *
+      * Example: \include DirectionWise_replicate.cpp
+      * Output: \verbinclude DirectionWise_replicate.out
+      *
+      * \sa VectorwiseOp::replicate(Index), DenseBase::replicate(), class Replicate
+      */
+    // NOTE implemented here because of sunstudio's compilation errors
+    template<int Factor> const Replicate<ExpressionType,(IsVertical?Factor:1),(IsHorizontal?Factor:1)>
+    replicate(Index factor = Factor) const
+    {
+      return Replicate<ExpressionType,Direction==Vertical?Factor:1,Direction==Horizontal?Factor:1>
+          (_expression(),Direction==Vertical?factor:1,Direction==Horizontal?factor:1);
+    }
+
+/////////// Artithmetic operators ///////////
+
+    /** Copies the vector \a other to each subvector of \c *this */
+    template<typename OtherDerived>
+    ExpressionType& operator=(const DenseBase<OtherDerived>& other)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      //eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME
+      return const_cast<ExpressionType&>(m_matrix = extendedTo(other.derived()));
+    }
+
+    /** Adds the vector \a other to each subvector of \c *this */
+    template<typename OtherDerived>
+    ExpressionType& operator+=(const DenseBase<OtherDerived>& other)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      return const_cast<ExpressionType&>(m_matrix += extendedTo(other.derived()));
+    }
+
+    /** Substracts the vector \a other to each subvector of \c *this */
+    template<typename OtherDerived>
+    ExpressionType& operator-=(const DenseBase<OtherDerived>& other)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      return const_cast<ExpressionType&>(m_matrix -= extendedTo(other.derived()));
+    }
+
+    /** Multiples each subvector of \c *this by the vector \a other */
+    template<typename OtherDerived>
+    ExpressionType& operator*=(const DenseBase<OtherDerived>& other)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      m_matrix *= extendedTo(other.derived());
+      return const_cast<ExpressionType&>(m_matrix);
+    }
+
+    /** Divides each subvector of \c *this by the vector \a other */
+    template<typename OtherDerived>
+    ExpressionType& operator/=(const DenseBase<OtherDerived>& other)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      m_matrix /= extendedTo(other.derived());
+      return const_cast<ExpressionType&>(m_matrix);
+    }
+
+    /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
+    template<typename OtherDerived> EIGEN_STRONG_INLINE
+    CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+                  const ExpressionTypeNestedCleaned,
+                  const typename ExtendedType<OtherDerived>::Type>
+    operator+(const DenseBase<OtherDerived>& other) const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      return m_matrix + extendedTo(other.derived());
+    }
+
+    /** Returns the expression of the difference between each subvector of \c *this and the vector \a other */
+    template<typename OtherDerived>
+    CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
+                  const ExpressionTypeNestedCleaned,
+                  const typename ExtendedType<OtherDerived>::Type>
+    operator-(const DenseBase<OtherDerived>& other) const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      return m_matrix - extendedTo(other.derived());
+    }
+
+    /** Returns the expression where each subvector is the product of the vector \a other
+      * by the corresponding subvector of \c *this */
+    template<typename OtherDerived> EIGEN_STRONG_INLINE
+    CwiseBinaryOp<internal::scalar_product_op<Scalar>,
+                  const ExpressionTypeNestedCleaned,
+                  const typename ExtendedType<OtherDerived>::Type>
+    operator*(const DenseBase<OtherDerived>& other) const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      return m_matrix * extendedTo(other.derived());
+    }
+
+    /** Returns the expression where each subvector is the quotient of the corresponding
+      * subvector of \c *this by the vector \a other */
+    template<typename OtherDerived>
+    CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
+                  const ExpressionTypeNestedCleaned,
+                  const typename ExtendedType<OtherDerived>::Type>
+    operator/(const DenseBase<OtherDerived>& other) const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      return m_matrix / extendedTo(other.derived());
+    }
+    
+    /** \returns an expression where each column of row of the referenced matrix are normalized.
+      * The referenced matrix is \b not modified.
+      * \sa MatrixBase::normalized(), normalize()
+      */
+    CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
+                  const ExpressionTypeNestedCleaned,
+                  const typename OppositeExtendedType<typename ReturnType<internal::member_norm,RealScalar>::Type>::Type>
+    normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); }
+    
+    
+    /** Normalize in-place each row or columns of the referenced matrix.
+      * \sa MatrixBase::normalize(), normalized()
+      */
+    void normalize() {
+      m_matrix = this->normalized();
+    }
+
+/////////// Geometry module ///////////
+
+    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
+    Homogeneous<ExpressionType,Direction> homogeneous() const;
+    #endif
+
+    typedef typename ExpressionType::PlainObject CrossReturnType;
+    template<typename OtherDerived>
+    const CrossReturnType cross(const MatrixBase<OtherDerived>& other) const;
+
+    enum {
+      HNormalized_Size = Direction==Vertical ? internal::traits<ExpressionType>::RowsAtCompileTime
+                                             : internal::traits<ExpressionType>::ColsAtCompileTime,
+      HNormalized_SizeMinusOne = HNormalized_Size==Dynamic ? Dynamic : HNormalized_Size-1
+    };
+    typedef Block<const ExpressionType,
+                  Direction==Vertical   ? int(HNormalized_SizeMinusOne)
+                                        : int(internal::traits<ExpressionType>::RowsAtCompileTime),
+                  Direction==Horizontal ? int(HNormalized_SizeMinusOne)
+                                        : int(internal::traits<ExpressionType>::ColsAtCompileTime)>
+            HNormalized_Block;
+    typedef Block<const ExpressionType,
+                  Direction==Vertical   ? 1 : int(internal::traits<ExpressionType>::RowsAtCompileTime),
+                  Direction==Horizontal ? 1 : int(internal::traits<ExpressionType>::ColsAtCompileTime)>
+            HNormalized_Factors;
+    typedef CwiseBinaryOp<internal::scalar_quotient_op<typename internal::traits<ExpressionType>::Scalar>,
+                const HNormalized_Block,
+                const Replicate<HNormalized_Factors,
+                  Direction==Vertical   ? HNormalized_SizeMinusOne : 1,
+                  Direction==Horizontal ? HNormalized_SizeMinusOne : 1> >
+            HNormalizedReturnType;
+
+    const HNormalizedReturnType hnormalized() const;
+
+  protected:
+    ExpressionTypeNested m_matrix;
+};
+
+/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
+  *
+  * Example: \include MatrixBase_colwise.cpp
+  * Output: \verbinclude MatrixBase_colwise.out
+  *
+  * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+  */
+template<typename Derived>
+inline const typename DenseBase<Derived>::ConstColwiseReturnType
+DenseBase<Derived>::colwise() const
+{
+  return derived();
+}
+
+/** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
+  *
+  * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+  */
+template<typename Derived>
+inline typename DenseBase<Derived>::ColwiseReturnType
+DenseBase<Derived>::colwise()
+{
+  return derived();
+}
+
+/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
+  *
+  * Example: \include MatrixBase_rowwise.cpp
+  * Output: \verbinclude MatrixBase_rowwise.out
+  *
+  * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+  */
+template<typename Derived>
+inline const typename DenseBase<Derived>::ConstRowwiseReturnType
+DenseBase<Derived>::rowwise() const
+{
+  return derived();
+}
+
+/** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
+  *
+  * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+  */
+template<typename Derived>
+inline typename DenseBase<Derived>::RowwiseReturnType
+DenseBase<Derived>::rowwise()
+{
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARTIAL_REDUX_H
diff --git a/include/eigen3/Eigen/src/Core/Visitor.h b/include/eigen3/Eigen/src/Core/Visitor.h
new file mode 100644
index 0000000..64867b7
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/Visitor.h
@@ -0,0 +1,237 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_VISITOR_H
+#define EIGEN_VISITOR_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Visitor, typename Derived, int UnrollCount>
+struct visitor_impl
+{
+  enum {
+    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
+    row = (UnrollCount-1) % Derived::RowsAtCompileTime
+  };
+
+  static inline void run(const Derived &mat, Visitor& visitor)
+  {
+    visitor_impl<Visitor, Derived, UnrollCount-1>::run(mat, visitor);
+    visitor(mat.coeff(row, col), row, col);
+  }
+};
+
+template<typename Visitor, typename Derived>
+struct visitor_impl<Visitor, Derived, 1>
+{
+  static inline void run(const Derived &mat, Visitor& visitor)
+  {
+    return visitor.init(mat.coeff(0, 0), 0, 0);
+  }
+};
+
+template<typename Visitor, typename Derived>
+struct visitor_impl<Visitor, Derived, Dynamic>
+{
+  typedef typename Derived::Index Index;
+  static inline void run(const Derived& mat, Visitor& visitor)
+  {
+    visitor.init(mat.coeff(0,0), 0, 0);
+    for(Index i = 1; i < mat.rows(); ++i)
+      visitor(mat.coeff(i, 0), i, 0);
+    for(Index j = 1; j < mat.cols(); ++j)
+      for(Index i = 0; i < mat.rows(); ++i)
+        visitor(mat.coeff(i, j), i, j);
+  }
+};
+
+} // end namespace internal
+
+/** Applies the visitor \a visitor to the whole coefficients of the matrix or vector.
+  *
+  * The template parameter \a Visitor is the type of the visitor and provides the following interface:
+  * \code
+  * struct MyVisitor {
+  *   // called for the first coefficient
+  *   void init(const Scalar& value, Index i, Index j);
+  *   // called for all other coefficients
+  *   void operator() (const Scalar& value, Index i, Index j);
+  * };
+  * \endcode
+  *
+  * \note compared to one or two \em for \em loops, visitors offer automatic
+  * unrolling for small fixed size matrix.
+  *
+  * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux()
+  */
+template<typename Derived>
+template<typename Visitor>
+void DenseBase<Derived>::visit(Visitor& visitor) const
+{
+  enum { unroll = SizeAtCompileTime != Dynamic
+                   && CoeffReadCost != Dynamic
+                   && (SizeAtCompileTime == 1 || internal::functor_traits<Visitor>::Cost != Dynamic)
+                   && SizeAtCompileTime * CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost
+                      <= EIGEN_UNROLLING_LIMIT };
+  return internal::visitor_impl<Visitor, Derived,
+      unroll ? int(SizeAtCompileTime) : Dynamic
+    >::run(derived(), visitor);
+}
+
+namespace internal {
+
+/** \internal
+  * \brief Base class to implement min and max visitors
+  */
+template <typename Derived>
+struct coeff_visitor
+{
+  typedef typename Derived::Index Index;
+  typedef typename Derived::Scalar Scalar;
+  Index row, col;
+  Scalar res;
+  inline void init(const Scalar& value, Index i, Index j)
+  {
+    res = value;
+    row = i;
+    col = j;
+  }
+};
+
+/** \internal
+  * \brief Visitor computing the min coefficient with its value and coordinates
+  *
+  * \sa DenseBase::minCoeff(Index*, Index*)
+  */
+template <typename Derived>
+struct min_coeff_visitor : coeff_visitor<Derived>
+{
+  typedef typename Derived::Index Index;
+  typedef typename Derived::Scalar Scalar;
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if(value < this->res)
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<min_coeff_visitor<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost
+  };
+};
+
+/** \internal
+  * \brief Visitor computing the max coefficient with its value and coordinates
+  *
+  * \sa DenseBase::maxCoeff(Index*, Index*)
+  */
+template <typename Derived>
+struct max_coeff_visitor : coeff_visitor<Derived>
+{
+  typedef typename Derived::Index Index;
+  typedef typename Derived::Scalar Scalar;
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if(value > this->res)
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<max_coeff_visitor<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost
+  };
+};
+
+} // end namespace internal
+
+/** \returns the minimum of all coefficients of *this and puts in *row and *col its location.
+  * \warning the result is undefined if \c *this contains NaN.
+  *
+  * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visitor(), DenseBase::minCoeff()
+  */
+template<typename Derived>
+template<typename IndexType>
+typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
+{
+  internal::min_coeff_visitor<Derived> minVisitor;
+  this->visit(minVisitor);
+  *rowId = minVisitor.row;
+  if (colId) *colId = minVisitor.col;
+  return minVisitor.res;
+}
+
+/** \returns the minimum of all coefficients of *this and puts in *index its location.
+  * \warning the result is undefined if \c *this contains NaN. 
+  *
+  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::minCoeff()
+  */
+template<typename Derived>
+template<typename IndexType>
+typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::minCoeff(IndexType* index) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  internal::min_coeff_visitor<Derived> minVisitor;
+  this->visit(minVisitor);
+  *index = (RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row;
+  return minVisitor.res;
+}
+
+/** \returns the maximum of all coefficients of *this and puts in *row and *col its location.
+  * \warning the result is undefined if \c *this contains NaN. 
+  *
+  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
+  */
+template<typename Derived>
+template<typename IndexType>
+typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
+{
+  internal::max_coeff_visitor<Derived> maxVisitor;
+  this->visit(maxVisitor);
+  *rowPtr = maxVisitor.row;
+  if (colPtr) *colPtr = maxVisitor.col;
+  return maxVisitor.res;
+}
+
+/** \returns the maximum of all coefficients of *this and puts in *index its location.
+  * \warning the result is undefined if \c *this contains NaN.
+  *
+  * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
+  */
+template<typename Derived>
+template<typename IndexType>
+typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::maxCoeff(IndexType* index) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  internal::max_coeff_visitor<Derived> maxVisitor;
+  this->visit(maxVisitor);
+  *index = (RowsAtCompileTime==1) ? maxVisitor.col : maxVisitor.row;
+  return maxVisitor.res;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_VISITOR_H
diff --git a/include/eigen3/Eigen/src/Core/arch/AltiVec/Complex.h b/include/eigen3/Eigen/src/Core/arch/AltiVec/Complex.h
new file mode 100644
index 0000000..68d9a2b
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -0,0 +1,217 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_ALTIVEC_H
+#define EIGEN_COMPLEX_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+static Packet16uc p16uc_COMPLEX_RE   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_COMPLEX_IM   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc p16uc_COMPLEX_REV  = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8);//{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+
+//---------- float ----------
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+  Packet4f  v;
+};
+
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet2cf type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+{
+  Packet2cf res;
+  /* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */
+  if((ptrdiff_t(&from) % 16) == 0)
+    res.v = pload<Packet4f>((const float *)&from);
+  else
+    res.v = ploadu<Packet4f>((const float *)&from);
+  res.v = vec_perm(res.v, res.v, p16uc_PSET_HI);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf((Packet4f)vec_xor((Packet4ui)a.v, p4ui_CONJ_XOR)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet4f v1, v2;
+
+  // Permute and multiply the real parts of a and b
+  v1 = vec_perm(a.v, a.v, p16uc_COMPLEX_RE);
+  // Get the imaginary parts of a
+  v2 = vec_perm(a.v, a.v, p16uc_COMPLEX_IM);
+  // multiply a_re * b 
+  v1 = vec_madd(v1, b.v, p4f_ZERO);
+  // multiply a_im * b and get the conjugate result
+  v2 = vec_madd(v2, b.v, p4f_ZERO);
+  v2 = (Packet4f) vec_xor((Packet4ui)v2, p4ui_CONJ_XOR);
+  // permute back to a proper order
+  v2 = vec_perm(v2, v2, p16uc_COMPLEX_REV);
+  
+  return Packet2cf(vec_add(v1, v2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v, vec_nor(b.v,b.v))); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from)
+{
+  return pset1<Packet2cf>(*from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { vec_dstt((float *)addr, DST_CTRL(2,2,32), DST_CHAN); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> EIGEN_ALIGN16 res[2];
+  pstore((float *)&res, a.v);
+
+  return res[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+  Packet4f rev_a;
+  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX_REV2);
+  return Packet2cf(rev_a);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  Packet4f b;
+  b = (Packet4f) vec_sld(a.v, a.v, 8);
+  b = padd(a.v, b);
+  return pfirst(Packet2cf(b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+  Packet4f b1, b2;
+  
+  b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
+  b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
+  b2 = (Packet4f) vec_sld(b2, b2, 8);
+  b2 = padd(b1, b2);
+
+  return Packet2cf(b2);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  Packet4f b;
+  Packet2cf prod;
+  b = (Packet4f) vec_sld(a.v, a.v, 8);
+  prod = pmul(a, Packet2cf(b));
+
+  return pfirst(prod);
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet2cf>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset==1)
+    {
+      first.v = vec_sld(first.v, second.v, 8);
+    }
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
+  Packet4f s = vec_madd(b.v, b.v, p4f_ZERO);
+  return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX_REV))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
+{
+  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV));
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_ALTIVEC_H
diff --git a/include/eigen3/Eigen/src/Core/arch/AltiVec/PacketMath.h b/include/eigen3/Eigen/src/Core/arch/AltiVec/PacketMath.h
new file mode 100644
index 0000000..e408996
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -0,0 +1,501 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Konstantinos Margaritis <markos at codex.gr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_ALTIVEC_H
+#define EIGEN_PACKET_MATH_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
+#endif
+
+#ifndef EIGEN_HAS_FUSE_CJMADD
+#define EIGEN_HAS_FUSE_CJMADD 1
+#endif
+
+// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#endif
+
+typedef __vector float          Packet4f;
+typedef __vector int            Packet4i;
+typedef __vector unsigned int   Packet4ui;
+typedef __vector __bool int     Packet4bi;
+typedef __vector short int      Packet8i;
+typedef __vector unsigned char  Packet16uc;
+
+// We don't want to write the same code all the time, but we need to reuse the constants
+// and it doesn't really work to declare them global, so we define macros instead
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
+  Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X)
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
+  Packet4i p4i_##NAME = vec_splat_s32(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+  Packet4f p4f_##NAME = pset1<Packet4f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+  Packet4i p4i_##NAME = pset1<Packet4i>(X)
+
+#define DST_CHAN 1
+#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
+
+// Define global static constants:
+static Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
+static Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 };
+static Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
+static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
+static Packet16uc p16uc_DUPLICATE = {0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7};
+
+static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
+static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
+static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
+
+template<> struct packet_traits<float>  : default_packet_traits
+{
+  typedef Packet4f type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+
+    // FIXME check the Has*
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 0,
+    HasSqrt = 0
+  };
+};
+template<> struct packet_traits<int>    : default_packet_traits
+{
+  typedef Packet4i type;
+  enum {
+    // FIXME check the Has*
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4
+  };
+};
+
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+/*
+inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
+{
+  union {
+    Packet4f   v;
+    float n[4];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
+{
+  union {
+    Packet4i   v;
+    int n[4];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
+{
+  union {
+    Packet4ui   v;
+    unsigned int n[4];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
+{
+  union {
+    Packet4bi v;
+    unsigned int n[4];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
+  return s;
+}
+*/
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
+  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
+  float EIGEN_ALIGN16 af[4];
+  af[0] = from;
+  Packet4f vc = vec_ld(0, af);
+  vc = vec_splat(vc, 0);
+  return vc;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
+  int EIGEN_ALIGN16 ai[4];
+  ai[0] = from;
+  Packet4i vc = vec_ld(0, ai);
+  vc = vec_splat(vc, 0);
+  return vc;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)     { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); }
+/* Commented out: it's actually slower than processing it scalar
+ *
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
+{
+  // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
+  //Set up constants, variables
+  Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
+
+  // Get the absolute values
+  a1  = vec_abs(a);
+  b1  = vec_abs(b);
+
+  // Get the signs using xor
+  Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO);
+
+  // Do the multiplication for the asbolute values.
+  bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 );
+  low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1);
+  high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO);
+  high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16);
+  prod = vec_add( low_prod, high_prod );
+
+  // NOR the product and select only the negative elements according to the sign mask
+  prod_ = vec_nor(prod, prod);
+  prod_ = vec_sel(p4i_ZERO, prod_, sgn);
+
+  // Add 1 to the result to get the negative numbers
+  v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn);
+  prod_ = vec_add(prod_, v1sel);
+
+  // Merge the results back to the final vector.
+  prod = vec_sel(prod, prod_, sgn);
+
+  return prod;
+}
+*/
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f t, y_0, y_1, res;
+
+  // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
+  y_0 = vec_re(b);
+
+  // Do one Newton-Raphson iteration to get the needed accuracy
+  t   = vec_nmsub(y_0, b, p4f_ONE);
+  y_1 = vec_madd(y_0, t, y_0);
+
+  res = vec_madd(a, y_1, p4f_ZERO);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
+{ eigen_assert(false && "packet integer division are not supported by AltiVec");
+  return pset1<Packet4i>(0);
+}
+
+// for some weird raisons, it has to be overloaded for packet of integers
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
+
+// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
+  Packet16uc MSQ, LSQ;
+  Packet16uc mask;
+  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
+  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
+  mask = vec_lvsl(0, from);                        // create the permute mask
+  return (Packet4f) vec_perm(MSQ, LSQ, mask);           // align the data
+
+}
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
+  Packet16uc MSQ, LSQ;
+  Packet16uc mask;
+  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
+  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
+  mask = vec_lvsl(0, from);                        // create the permute mask
+  return (Packet4i) vec_perm(MSQ, LSQ, mask);    // align the data
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
+{
+  Packet4f p;
+  if((ptrdiff_t(&from) % 16) == 0)  p = pload<Packet4f>(from);
+  else                              p = ploadu<Packet4f>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE);
+}
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
+{
+  Packet4i p;
+  if((ptrdiff_t(&from) % 16) == 0)  p = pload<Packet4i>(from);
+  else                              p = ploadu<Packet4i>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE
+  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
+  // Warning: not thread safe!
+  Packet16uc MSQ, LSQ, edges;
+  Packet16uc edgeAlign, align;
+
+  MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
+  LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
+  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
+  edges=vec_perm(LSQ,MSQ,edgeAlign);                        // extract the edges
+  align = vec_lvsr( 0, to );                                // permute map to misalign data
+  MSQ = vec_perm(edges,(Packet16uc)from,align);             // misalign the data (MSQ)
+  LSQ = vec_perm((Packet16uc)from,edges,align);             // misalign the data (LSQ)
+  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
+  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE
+  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
+  // Warning: not thread safe!
+  Packet16uc MSQ, LSQ, edges;
+  Packet16uc edgeAlign, align;
+
+  MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
+  LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
+  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
+  edges=vec_perm(LSQ, MSQ, edgeAlign);                      // extract the edges
+  align = vec_lvsr( 0, to );                                // permute map to misalign data
+  MSQ = vec_perm(edges, (Packet16uc) from, align);          // misalign the data (MSQ)
+  LSQ = vec_perm((Packet16uc) from, edges, align);          // misalign the data (LSQ)
+  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
+  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
+
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet4f b, sum;
+  b   = (Packet4f) vec_sld(a, a, 8);
+  sum = vec_add(a, b);
+  b   = (Packet4f) vec_sld(sum, sum, 4);
+  sum = vec_add(sum, b);
+  return pfirst(sum);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  Packet4f v[4], sum[4];
+
+  // It's easier and faster to transpose then add as columns
+  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
+  // Do the transpose, first set of moves
+  v[0] = vec_mergeh(vecs[0], vecs[2]);
+  v[1] = vec_mergel(vecs[0], vecs[2]);
+  v[2] = vec_mergeh(vecs[1], vecs[3]);
+  v[3] = vec_mergel(vecs[1], vecs[3]);
+  // Get the resulting vectors
+  sum[0] = vec_mergeh(v[0], v[2]);
+  sum[1] = vec_mergel(v[0], v[2]);
+  sum[2] = vec_mergeh(v[1], v[3]);
+  sum[3] = vec_mergel(v[1], v[3]);
+
+  // Now do the summation:
+  // Lines 0+1
+  sum[0] = vec_add(sum[0], sum[1]);
+  // Lines 2+3
+  sum[1] = vec_add(sum[2], sum[3]);
+  // Add the results
+  sum[0] = vec_add(sum[0], sum[1]);
+
+  return sum[0];
+}
+
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i sum;
+  sum = vec_sums(a, p4i_ZERO);
+  sum = vec_sld(sum, p4i_ZERO, 12);
+  return pfirst(sum);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  Packet4i v[4], sum[4];
+
+  // It's easier and faster to transpose then add as columns
+  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
+  // Do the transpose, first set of moves
+  v[0] = vec_mergeh(vecs[0], vecs[2]);
+  v[1] = vec_mergel(vecs[0], vecs[2]);
+  v[2] = vec_mergeh(vecs[1], vecs[3]);
+  v[3] = vec_mergel(vecs[1], vecs[3]);
+  // Get the resulting vectors
+  sum[0] = vec_mergeh(v[0], v[2]);
+  sum[1] = vec_mergel(v[0], v[2]);
+  sum[2] = vec_mergeh(v[1], v[3]);
+  sum[3] = vec_mergel(v[1], v[3]);
+
+  // Now do the summation:
+  // Lines 0+1
+  sum[0] = vec_add(sum[0], sum[1]);
+  // Lines 2+3
+  sum[1] = vec_add(sum[2], sum[3]);
+  // Add the results
+  sum[0] = vec_add(sum[0], sum[1]);
+
+  return sum[0];
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  Packet4f prod;
+  prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
+  return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
+}
+
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+{
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return aux[0] * aux[1] * aux[2] * aux[3];
+}
+
+// min
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet4f b, res;
+  b = vec_min(a, vec_sld(a, a, 8));
+  res = vec_min(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, res;
+  b = vec_min(a, vec_sld(a, a, 8));
+  res = vec_min(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+// max
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet4f b, res;
+  b = vec_max(a, vec_sld(a, a, 8));
+  res = vec_max(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, res;
+  b = vec_max(a, vec_sld(a, a, 8));
+  res = vec_max(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+  {
+    if (Offset!=0)
+      first = vec_sld(first, second, Offset*4);
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet4i>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
+  {
+    if (Offset!=0)
+      first = vec_sld(first, second, Offset*4);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_ALTIVEC_H
diff --git a/include/eigen3/Eigen/src/Core/arch/Default/Settings.h b/include/eigen3/Eigen/src/Core/arch/Default/Settings.h
new file mode 100644
index 0000000..097373c
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/arch/Default/Settings.h
@@ -0,0 +1,49 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+/* All the parameters defined in this file can be specialized in the
+ * architecture specific files, and/or by the user.
+ * More to come... */
+
+#ifndef EIGEN_DEFAULT_SETTINGS_H
+#define EIGEN_DEFAULT_SETTINGS_H
+
+/** Defines the maximal loop size to enable meta unrolling of loops.
+  * Note that the value here is expressed in Eigen's own notion of "number of FLOPS",
+  * it does not correspond to the number of iterations or the number of instructions
+  */
+#ifndef EIGEN_UNROLLING_LIMIT
+#define EIGEN_UNROLLING_LIMIT 100
+#endif
+
+/** Defines the threshold between a "small" and a "large" matrix.
+  * This threshold is mainly used to select the proper product implementation.
+  */
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+/** Defines the maximal width of the blocks used in the triangular product and solver
+  * for vectors (level 2 blas xTRMV and xTRSV). The default is 8.
+  */
+#ifndef EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH
+#define EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH 8
+#endif
+
+
+/** Defines the default number of registers available for that architecture.
+  * Currently it must be 8 or 16. Other values will fail.
+  */
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
+#endif
+
+#endif // EIGEN_DEFAULT_SETTINGS_H
diff --git a/include/eigen3/Eigen/src/Core/arch/NEON/Complex.h b/include/eigen3/Eigen/src/Core/arch/NEON/Complex.h
new file mode 100644
index 0000000..8d9255e
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/arch/NEON/Complex.h
@@ -0,0 +1,253 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_NEON_H
+#define EIGEN_COMPLEX_NEON_H
+
+namespace Eigen {
+
+namespace internal {
+
+static uint32x4_t p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000);
+static uint32x2_t p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000);
+
+//---------- float ----------
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+  Packet4f  v;
+};
+
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet2cf type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+{
+  float32x2_t r64;
+  r64 = vld1_f32((float *)&from);
+
+  return Packet2cf(vcombine_f32(r64, r64));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+  Packet4ui b = vreinterpretq_u32_f32(a.v);
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet4f v1, v2;
+
+  // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+  v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0));
+  // Get the real values of a | a1_im | a1_im | a2_im | a2_im |
+  v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1));
+  // Multiply the real a with b
+  v1 = vmulq_f32(v1, b.v);
+  // Multiply the imag a with b
+  v2 = vmulq_f32(v2, b.v);
+  // Conjugate v2 
+  v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR));
+  // Swap real/imag elements in v2.
+  v2 = vrev64q_f32(v2);
+  // Add and return the result
+  return Packet2cf(vaddq_f32(v1, v2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { EIGEN_ARM_PREFETCH((float *)addr); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> EIGEN_ALIGN16 x[2];
+  vst1q_f32((float *)x, a.v);
+  return x[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+  float32x2_t a_lo, a_hi;
+  Packet4f a_r128;
+
+  a_lo = vget_low_f32(a.v);
+  a_hi = vget_high_f32(a.v);
+  a_r128 = vcombine_f32(a_hi, a_lo);
+
+  return Packet2cf(a_r128);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a)
+{
+  return Packet2cf(vrev64q_f32(a.v));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  float32x2_t a1, a2;
+  std::complex<float> s;
+
+  a1 = vget_low_f32(a.v);
+  a2 = vget_high_f32(a.v);
+  a2 = vadd_f32(a1, a2);
+  vst1_f32((float *)&s, a2);
+
+  return s;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+  Packet4f sum1, sum2, sum;
+
+  // Add the first two 64-bit float32x2_t of vecs[0]
+  sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v));
+  sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v));
+  sum = vaddq_f32(sum1, sum2);
+
+  return Packet2cf(sum);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  float32x2_t a1, a2, v1, v2, prod;
+  std::complex<float> s;
+
+  a1 = vget_low_f32(a.v);
+  a2 = vget_high_f32(a.v);
+   // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+  v1 = vdup_lane_f32(a1, 0);
+  // Get the real values of a | a1_im | a1_im | a2_im | a2_im |
+  v2 = vdup_lane_f32(a1, 1);
+  // Multiply the real a with b
+  v1 = vmul_f32(v1, a2);
+  // Multiply the imag a with b
+  v2 = vmul_f32(v2, a2);
+  // Conjugate v2 
+  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR));
+  // Swap real/imag elements in v2.
+  v2 = vrev64_f32(v2);
+  // Add v1, v2
+  prod = vadd_f32(v1, v2);
+
+  vst1_f32((float *)&s, prod);
+
+  return s;
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet2cf>
+{
+  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset==1)
+    {
+      first.v = vextq_f32(first.v, second.v, 2);
+    }
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
+  Packet4f s, rev_s;
+
+  // this computes the norm
+  s = vmulq_f32(b.v, b.v);
+  rev_s = vrev64q_f32(s);
+
+  return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_NEON_H
diff --git a/include/eigen3/Eigen/src/Core/arch/NEON/PacketMath.h b/include/eigen3/Eigen/src/Core/arch/NEON/PacketMath.h
new file mode 100644
index 0000000..94dfab3
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -0,0 +1,419 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos at codex.gr>
+// Heavily based on Gael's SSE version.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_NEON_H
+#define EIGEN_PACKET_MATH_NEON_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+// FIXME NEON has 16 quad registers, but since the current register allocator
+// is so bad, it is much better to reduce it to 8
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
+#endif
+
+typedef float32x4_t Packet4f;
+typedef int32x4_t   Packet4i;
+typedef uint32x4_t  Packet4ui;
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+  const Packet4f p4f_##NAME = pset1<Packet4f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+
+#if defined(__llvm__) && !defined(__clang__)
+  //Special treatment for Apple's llvm-gcc, its NEON packet types are unions
+  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {{X, Y}}
+  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
+#else
+  //Default initializer for packets
+  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {X, Y}
+  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
+#endif
+
+// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
+// which available on LLVM and GCC (at least)
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || defined(__GNUC__)
+  #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#elif defined __pld
+  #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
+#elif !defined(__aarch64__)
+  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( "   pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#else
+  // by default no explicit prefetching
+  #define EIGEN_ARM_PREFETCH(ADDR)
+#endif
+
+template<> struct packet_traits<float>  : default_packet_traits
+{
+  typedef Packet4f type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+   
+    HasDiv  = 1,
+    // FIXME check the Has*
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 0,
+    HasSqrt = 0
+  };
+};
+template<> struct packet_traits<int>    : default_packet_traits
+{
+  typedef Packet4i type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4
+    // FIXME check the Has*
+  };
+};
+
+#if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__)
+// workaround gcc 4.2, 4.3 and 4.4 compilatin issue
+EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
+EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
+EIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
+EIGEN_STRONG_INLINE void        vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
+#endif
+
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return vdupq_n_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   { return vdupq_n_s32(from); }
+
+template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a)
+{
+  Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
+  return vaddq_f32(pset1<Packet4f>(a), countdown);
+}
+template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)
+{
+  Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
+  return vaddq_s32(pset1<Packet4i>(a), countdown);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f inv, restep, div;
+
+  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
+  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
+  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
+  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
+  // Newton-Raphson and vrecpsq_f32()
+  inv = vrecpeq_f32(b);
+
+  // This returns a differential, by which we will have to multiply inv to get a better
+  // approximation of 1/b.
+  restep = vrecpsq_f32(b, inv);
+  inv = vmulq_f32(restep, inv);
+
+  // Finally, multiply a by 1/b and get the wanted result of the division.
+  div = vmulq_f32(a, inv);
+
+  return div;
+}
+template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
+{ eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4i>(0);
+}
+
+// for some weird raisons, it has to be overloaded for packet of integers
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
+
+// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*   from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)   { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
+{
+  float32x2_t lo, hi;
+  lo = vld1_dup_f32(from);
+  hi = vld1_dup_f32(from+1);
+  return vcombine_f32(lo, hi);
+}
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
+{
+  int32x2_t lo, hi;
+  lo = vld1_dup_s32(from);
+  hi = vld1_dup_s32(from+1);
+  return vcombine_s32(lo, hi);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { EIGEN_ARM_PREFETCH(addr); }
+
+// FIXME only store the 2 first elements ?
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  float32x2_t a_lo, a_hi;
+  Packet4f a_r64;
+
+  a_r64 = vrev64q_f32(a);
+  a_lo = vget_low_f32(a_r64);
+  a_hi = vget_high_f32(a_r64);
+  return vcombine_f32(a_hi, a_lo);
+}
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  int32x2_t a_lo, a_hi;
+  Packet4i a_r64;
+
+  a_r64 = vrev64q_s32(a);
+  a_lo = vget_low_s32(a_r64);
+  a_hi = vget_high_s32(a_r64);
+  return vcombine_s32(a_hi, a_lo);
+}
+template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  float32x2_t a_lo, a_hi, sum;
+
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  sum = vpadd_f32(a_lo, a_hi);
+  sum = vpadd_f32(sum, sum);
+  return vget_lane_f32(sum, 0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  float32x4x2_t vtrn1, vtrn2, res1, res2;
+  Packet4f sum1, sum2, sum;
+
+  // NEON zip performs interleaving of the supplied vectors.
+  // We perform two interleaves in a row to acquire the transposed vector
+  vtrn1 = vzipq_f32(vecs[0], vecs[2]);
+  vtrn2 = vzipq_f32(vecs[1], vecs[3]);
+  res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
+  res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
+
+  // Do the addition of the resulting vectors
+  sum1 = vaddq_f32(res1.val[0], res1.val[1]);
+  sum2 = vaddq_f32(res2.val[0], res2.val[1]);
+  sum = vaddq_f32(sum1, sum2);
+
+  return sum;
+}
+
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  int32x2_t a_lo, a_hi, sum;
+
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  sum = vpadd_s32(a_lo, a_hi);
+  sum = vpadd_s32(sum, sum);
+  return vget_lane_s32(sum, 0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  int32x4x2_t vtrn1, vtrn2, res1, res2;
+  Packet4i sum1, sum2, sum;
+
+  // NEON zip performs interleaving of the supplied vectors.
+  // We perform two interleaves in a row to acquire the transposed vector
+  vtrn1 = vzipq_s32(vecs[0], vecs[2]);
+  vtrn2 = vzipq_s32(vecs[1], vecs[3]);
+  res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
+  res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
+
+  // Do the addition of the resulting vectors
+  sum1 = vaddq_s32(res1.val[0], res1.val[1]);
+  sum2 = vaddq_s32(res2.val[0], res2.val[1]);
+  sum = vaddq_s32(sum1, sum2);
+
+  return sum;
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  float32x2_t a_lo, a_hi, prod;
+
+  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
+  prod = vmul_f32(a_lo, a_hi);
+  // Multiply prod with its swapped value |a2*a4|a1*a3|
+  prod = vmul_f32(prod, vrev64_f32(prod));
+
+  return vget_lane_f32(prod, 0);
+}
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+{
+  int32x2_t a_lo, a_hi, prod;
+
+  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
+  prod = vmul_s32(a_lo, a_hi);
+  // Multiply prod with its swapped value |a2*a4|a1*a3|
+  prod = vmul_s32(prod, vrev64_s32(prod));
+
+  return vget_lane_s32(prod, 0);
+}
+
+// min
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  float32x2_t a_lo, a_hi, min;
+
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  min = vpmin_f32(a_lo, a_hi);
+  min = vpmin_f32(min, min);
+
+  return vget_lane_f32(min, 0);
+}
+
+template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+{
+  int32x2_t a_lo, a_hi, min;
+
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  min = vpmin_s32(a_lo, a_hi);
+  min = vpmin_s32(min, min);
+  
+  return vget_lane_s32(min, 0);
+}
+
+// max
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  float32x2_t a_lo, a_hi, max;
+
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  max = vpmax_f32(a_lo, a_hi);
+  max = vpmax_f32(max, max);
+
+  return vget_lane_f32(max, 0);
+}
+
+template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+{
+  int32x2_t a_lo, a_hi, max;
+
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  max = vpmax_s32(a_lo, a_hi);
+
+  return vget_lane_s32(max, 0);
+}
+
+// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
+// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
+#define PALIGN_NEON(Offset,Type,Command) \
+template<>\
+struct palign_impl<Offset,Type>\
+{\
+    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
+    {\
+        if (Offset!=0)\
+            first = Command(first, second, Offset);\
+    }\
+};\
+
+PALIGN_NEON(0,Packet4f,vextq_f32)
+PALIGN_NEON(1,Packet4f,vextq_f32)
+PALIGN_NEON(2,Packet4f,vextq_f32)
+PALIGN_NEON(3,Packet4f,vextq_f32)
+PALIGN_NEON(0,Packet4i,vextq_s32)
+PALIGN_NEON(1,Packet4i,vextq_s32)
+PALIGN_NEON(2,Packet4i,vextq_s32)
+PALIGN_NEON(3,Packet4i,vextq_s32)
+    
+#undef PALIGN_NEON
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_NEON_H
diff --git a/include/eigen3/Eigen/src/Core/arch/SSE/Complex.h b/include/eigen3/Eigen/src/Core/arch/SSE/Complex.h
new file mode 100644
index 0000000..91bba5e
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/arch/SSE/Complex.h
@@ -0,0 +1,442 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_SSE_H
+#define EIGEN_COMPLEX_SSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
+  __m128  v;
+};
+
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet2cf type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
+{
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
+  return Packet2cf(_mm_xor_ps(a.v,mask));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
+  return Packet2cf(_mm_xor_ps(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for SSE3 and 4
+  #ifdef EIGEN_VECTORIZE_SSE3
+  return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
+                                 _mm_mul_ps(_mm_movehdup_ps(a.v),
+                                            vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+//   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
+//                                  _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+//                                             vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+  #else
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
+  return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
+                              _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+                                                    vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
+  #endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+{
+  Packet2cf res;
+#if EIGEN_GNUC_AT_MOST(4,2)
+  // Workaround annoying "may be used uninitialized in this function" warning with gcc 4.2
+  res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
+#elif EIGEN_GNUC_AT_LEAST(4,6)
+  // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wuninitialized"
+  res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
+  #pragma GCC diagnostic pop
+#else
+  res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
+#endif
+  return Packet2cf(_mm_movelh_ps(res.v,res.v));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
+{
+  #if EIGEN_GNUC_AT_MOST(4,3)
+  // Workaround gcc 4.2 ICE - this is not performance wise ideal, but who cares...
+  // This workaround also fix invalid code generation with gcc 4.3
+  EIGEN_ALIGN16 std::complex<float> res[2];
+  _mm_store_ps((float*)res, a.v);
+  return res[0];
+  #else
+  std::complex<float> res;
+  _mm_storel_pi((__m64*)&res, a.v);
+  return res;
+  #endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+  return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v)));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet2cf>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset==1)
+    {
+      first.v = _mm_movehl_ps(first.v, first.v);
+      first.v = _mm_movelh_ps(first.v, second.v);
+    }
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return internal::pmul(a, pconj(b));
+    #else
+    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
+    return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
+                                _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+                                           vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+    #endif
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return internal::pmul(pconj(a), b);
+    #else
+    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
+    return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
+                                _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+                                                      vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
+    #endif
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return pconj(internal::pmul(a, b));
+    #else
+    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
+    return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
+                                _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+                                           vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+    #endif
+  }
+};
+
+template<> struct conj_helper<Packet4f, Packet2cf, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
+  { return Packet2cf(Eigen::internal::pmul(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet2cf, Packet4f, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
+  { return Packet2cf(Eigen::internal::pmul(x.v, y)); }
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for SSE3 and 4
+  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
+  __m128 s = _mm_mul_ps(b.v,b.v);
+  return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
+}
+
+EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
+{
+  return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
+}
+
+
+//---------- double ----------
+struct Packet1cd
+{
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
+  __m128d  v;
+};
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet1cd type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
+
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
+{
+  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
+  return Packet1cd(_mm_xor_pd(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for SSE3 and 4
+  #ifdef EIGEN_VECTORIZE_SSE3
+  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
+                                 _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
+                                            vec2d_swizzle1(b.v, 1, 0))));
+  #else
+  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
+  return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
+                              _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
+                                                    vec2d_swizzle1(b.v, 1, 0)), mask)));
+  #endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
+
+// FIXME force unaligned load, this is a temporary fix
+template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
+{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+
+// FIXME force unaligned store, this is a temporary fix
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
+{
+  EIGEN_ALIGN16 double res[2];
+  _mm_store_pd(res, a.v);
+  return std::complex<double>(res[0],res[1]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
+{
+  return pfirst(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
+{
+  return vecs[0];
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
+{
+  return pfirst(a);
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet1cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
+  {
+    // FIXME is it sure we never have to align a Packet1cd?
+    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return internal::pmul(a, pconj(b));
+    #else
+    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
+    return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
+                                _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
+                                           vec2d_swizzle1(b.v, 1, 0))));
+    #endif
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return internal::pmul(pconj(a), b);
+    #else
+    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
+    return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
+                                _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
+                                                      vec2d_swizzle1(b.v, 1, 0)), mask)));
+    #endif
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return pconj(internal::pmul(a, b));
+    #else
+    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
+    return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
+                                _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
+                                           vec2d_swizzle1(b.v, 1, 0))));
+    #endif
+  }
+};
+
+template<> struct conj_helper<Packet2d, Packet1cd, false,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
+  { return Packet1cd(Eigen::internal::pmul(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet1cd, Packet2d, false,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
+  { return Packet1cd(Eigen::internal::pmul(x.v, y)); }
+};
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for SSE3 and 4
+  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
+  __m128d s = _mm_mul_pd(b.v,b.v);
+  return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+  return Packet1cd(preverse(x.v));
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_SSE_H
diff --git a/include/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h b/include/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h
new file mode 100644
index 0000000..d16f30b
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -0,0 +1,475 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_SSE_H
+#define EIGEN_MATH_FUNCTIONS_SSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog<Packet4f>(const Packet4f& _x)
+{
+  Packet4f x = _x;
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+  /* the smallest non denormalized float number */
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000);//-1.f/0.f);
+  
+  /* natural logarithm computed for 4 simultaneous float
+    return NaN for x <= 0
+  */
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+
+  Packet4i emm0;
+
+  Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN
+  Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
+
+  x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
+  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
+
+  /* keep only the fractional part */
+  x = _mm_and_ps(x, p4f_inv_mant_mask);
+  x = _mm_or_ps(x, p4f_half);
+
+  emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
+  Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1);
+
+  /* part2:
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
+  Packet4f tmp = _mm_and_ps(x, mask);
+  x = psub(x, p4f_1);
+  e = psub(e, _mm_and_ps(p4f_1, mask));
+  x = padd(x, tmp);
+
+  Packet4f x2 = pmul(x,x);
+  Packet4f x3 = pmul(x2,x);
+
+  Packet4f y, y1, y2;
+  y  = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
+  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
+  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
+  y  = pmadd(y , x, p4f_cephes_log_p2);
+  y1 = pmadd(y1, x, p4f_cephes_log_p5);
+  y2 = pmadd(y2, x, p4f_cephes_log_p8);
+  y = pmadd(y, x3, y1);
+  y = pmadd(y, x3, y2);
+  y = pmul(y, x3);
+
+  y1 = pmul(e, p4f_cephes_log_q1);
+  tmp = pmul(x2, p4f_half);
+  y = padd(y, y1);
+  x = psub(x, tmp);
+  y2 = pmul(e, p4f_cephes_log_q2);
+  x = padd(x, y);
+  x = padd(x, y2);
+  // negative arg will be NAN, 0 will be -INF
+  return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)),
+                   _mm_and_ps(iszero_mask, p4f_minus_inf));
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& _x)
+{
+  Packet4f x = _x;
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+
+
+  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+
+  Packet4f tmp = _mm_setzero_ps(), fx;
+  Packet4i emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  fx = _mm_floor_ps(fx);
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  tmp  = _mm_cvtepi32_ps(emm0);
+  /* if greater, substract 1 */
+  Packet4f mask = _mm_cmpgt_ps(tmp, fx);
+  mask = _mm_and_ps(mask, p4f_1);
+  fx = psub(tmp, mask);
+#endif
+
+  tmp = pmul(fx, p4f_cephes_exp_C1);
+  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  z = pmul(x,x);
+
+  Packet4f y = p4f_cephes_exp_p0;
+  y = pmadd(y, x, p4f_cephes_exp_p1);
+  y = pmadd(y, x, p4f_cephes_exp_p2);
+  y = pmadd(y, x, p4f_cephes_exp_p3);
+  y = pmadd(y, x, p4f_cephes_exp_p4);
+  y = pmadd(y, x, p4f_cephes_exp_p5);
+  y = pmadd(y, z, x);
+  y = padd(y, p4f_1);
+
+  // build 2^n
+  emm0 = _mm_cvttps_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, p4i_0x7f);
+  emm0 = _mm_slli_epi32(emm0, 23);
+  return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x);
+}
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d pexp<Packet2d>(const Packet2d& _x)
+{
+  Packet2d x = _x;
+
+  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
+  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+
+  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
+  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
+
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+  static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
+
+  Packet2d tmp = _mm_setzero_pd(), fx;
+  Packet4i emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  fx = _mm_floor_pd(fx);
+#else
+  emm0 = _mm_cvttpd_epi32(fx);
+  tmp  = _mm_cvtepi32_pd(emm0);
+  /* if greater, substract 1 */
+  Packet2d mask = _mm_cmpgt_pd(tmp, fx);
+  mask = _mm_and_pd(mask, p2d_1);
+  fx = psub(tmp, mask);
+#endif
+
+  tmp = pmul(fx, p2d_cephes_exp_C1);
+  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet2d x2 = pmul(x,x);
+
+  Packet2d px = p2d_cephes_exp_p0;
+  px = pmadd(px, x2, p2d_cephes_exp_p1);
+  px = pmadd(px, x2, p2d_cephes_exp_p2);
+  px = pmul (px, x);
+
+  Packet2d qx = p2d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+
+  x = pdiv(px,psub(qx,px));
+  x = pmadd(p2d_2,x,p2d_1);
+
+  // build 2^n
+  emm0 = _mm_cvttpd_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, p4i_1023_0);
+  emm0 = _mm_slli_epi32(emm0, 20);
+  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
+  return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
+}
+
+/* evaluation of 4 sines at onces, using SSE2 intrinsics.
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+*/
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psin<Packet4f>(const Packet4f& _x)
+{
+  Packet4f x = _x;
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+
+  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
+  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
+  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
+  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
+
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
+
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
+
+  Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+
+  Packet4i emm0, emm2;
+  sign_bit = x;
+  /* take the absolute value */
+  x = pabs(x);
+
+  /* take the modulo */
+
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
+
+  /* scale by 4/Pi */
+  y = pmul(x, p4f_cephes_FOPI);
+
+  /* store the integer part of y in mm0 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, p4i_1);
+  emm2 = _mm_and_si128(emm2, p4i_not1);
+  y = _mm_cvtepi32_ps(emm2);
+  /* get the swap sign flag */
+  emm0 = _mm_and_si128(emm2, p4i_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynom selection mask
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+
+     Both branches will be computed.
+  */
+  emm2 = _mm_and_si128(emm2, p4i_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+
+  Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
+  Packet4f poly_mask = _mm_castsi128_ps(emm2);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = pmul(y, p4f_minus_cephes_DP1);
+  xmm2 = pmul(y, p4f_minus_cephes_DP2);
+  xmm3 = pmul(y, p4f_minus_cephes_DP3);
+  x = padd(x, xmm1);
+  x = padd(x, xmm2);
+  x = padd(x, xmm3);
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = p4f_coscof_p0;
+  Packet4f z = _mm_mul_ps(x,x);
+
+  y = pmadd(y, z, p4f_coscof_p1);
+  y = pmadd(y, z, p4f_coscof_p2);
+  y = pmul(y, z);
+  y = pmul(y, z);
+  Packet4f tmp = pmul(z, p4f_half);
+  y = psub(y, tmp);
+  y = padd(y, p4f_1);
+
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  Packet4f y2 = p4f_sincof_p0;
+  y2 = pmadd(y2, z, p4f_sincof_p1);
+  y2 = pmadd(y2, z, p4f_sincof_p2);
+  y2 = pmul(y2, z);
+  y2 = pmul(y2, x);
+  y2 = padd(y2, x);
+
+  /* select the correct result from the two polynoms */
+  y2 = _mm_and_ps(poly_mask, y2);
+  y = _mm_andnot_ps(poly_mask, y);
+  y = _mm_or_ps(y,y2);
+  /* update the sign */
+  return _mm_xor_ps(y, sign_bit);
+}
+
+/* almost the same as psin */
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pcos<Packet4f>(const Packet4f& _x)
+{
+  Packet4f x = _x;
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+
+  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
+  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
+  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
+  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
+
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
+
+  Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+  Packet4i emm0, emm2;
+
+  x = pabs(x);
+
+  /* scale by 4/Pi */
+  y = pmul(x, p4f_cephes_FOPI);
+
+  /* get the integer part of y */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, p4i_1);
+  emm2 = _mm_and_si128(emm2, p4i_not1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm2 = _mm_sub_epi32(emm2, p4i_2);
+
+  /* get the swap sign flag */
+  emm0 = _mm_andnot_si128(emm2, p4i_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynom selection mask */
+  emm2 = _mm_and_si128(emm2, p4i_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+
+  Packet4f sign_bit = _mm_castsi128_ps(emm0);
+  Packet4f poly_mask = _mm_castsi128_ps(emm2);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = pmul(y, p4f_minus_cephes_DP1);
+  xmm2 = pmul(y, p4f_minus_cephes_DP2);
+  xmm3 = pmul(y, p4f_minus_cephes_DP3);
+  x = padd(x, xmm1);
+  x = padd(x, xmm2);
+  x = padd(x, xmm3);
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = p4f_coscof_p0;
+  Packet4f z = pmul(x,x);
+
+  y = pmadd(y,z,p4f_coscof_p1);
+  y = pmadd(y,z,p4f_coscof_p2);
+  y = pmul(y, z);
+  y = pmul(y, z);
+  Packet4f tmp = _mm_mul_ps(z, p4f_half);
+  y = psub(y, tmp);
+  y = padd(y, p4f_1);
+
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+  Packet4f y2 = p4f_sincof_p0;
+  y2 = pmadd(y2, z, p4f_sincof_p1);
+  y2 = pmadd(y2, z, p4f_sincof_p2);
+  y2 = pmul(y2, z);
+  y2 = pmadd(y2, x, x);
+
+  /* select the correct result from the two polynoms */
+  y2 = _mm_and_ps(poly_mask, y2);
+  y  = _mm_andnot_ps(poly_mask, y);
+  y  = _mm_or_ps(y,y2);
+
+  /* update the sign */
+  return _mm_xor_ps(y, sign_bit);
+}
+
+#if EIGEN_FAST_MATH
+
+// This is based on Quake3's fast inverse square root.
+// For detail see here: http://www.beyond3d.com/content/articles/8/
+// It lacks 1 (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly.
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& _x)
+{
+  Packet4f half = pmul(_x, pset1<Packet4f>(.5f));
+
+  /* select only the inverse sqrt of non-zero inputs */
+  Packet4f non_zero_mask = _mm_cmpge_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)()));
+  Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));
+
+  x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x))));
+  return pmul(_x,x);
+}
+
+#else
+
+template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
+
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_SSE_H
diff --git a/include/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h b/include/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h
new file mode 100644
index 0000000..fc8ae50
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -0,0 +1,649 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_SSE_H
+#define EIGEN_PACKET_MATH_SSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#endif
+
+typedef __m128  Packet4f;
+typedef __m128i Packet4i;
+typedef __m128d Packet2d;
+
+template<> struct is_arithmetic<__m128>  { enum { value = true }; };
+template<> struct is_arithmetic<__m128i> { enum { value = true }; };
+template<> struct is_arithmetic<__m128d> { enum { value = true }; };
+
+#define vec4f_swizzle1(v,p,q,r,s) \
+  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
+
+#define vec4i_swizzle1(v,p,q,r,s) \
+  (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
+
+#define vec2d_swizzle1(v,p,q) \
+  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
+  
+#define vec4f_swizzle2(a,b,p,q,r,s) \
+  (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
+
+#define vec4i_swizzle2(a,b,p,q,r,s) \
+  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+  const Packet4f p4f_##NAME = pset1<Packet4f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
+  const Packet2d p2d_##NAME = pset1<Packet2d>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+
+
+template<> struct packet_traits<float>  : default_packet_traits
+{
+  typedef Packet4f type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+
+    HasDiv  = 1,
+    HasSin  = EIGEN_FAST_MATH,
+    HasCos  = EIGEN_FAST_MATH,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1
+  };
+};
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet2d type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+
+    HasDiv  = 1,
+    HasExp  = 1,
+    HasSqrt = 1
+  };
+};
+template<> struct packet_traits<int>    : default_packet_traits
+{
+  typedef Packet4i type;
+  enum {
+    // FIXME check the Has*
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4
+  };
+};
+
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+
+#if defined(_MSC_VER) && (_MSC_VER==1500)
+// Workaround MSVC 9 internal compiler error.
+// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
+// TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)).
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps(from,from,from,from); }
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set_epi32(from,from,from,from); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
+template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
+{
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
+  return _mm_xor_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
+{
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
+  return _mm_xor_pd(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
+{
+  return psub(_mm_setr_epi32(0,0,0,0), a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_mullo_epi32(a,b);
+#else
+  // this version is slightly faster than 4 scalar products
+  return vec4i_swizzle1(
+            vec4i_swizzle2(
+              _mm_mul_epu32(a,b),
+              _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
+                            vec4i_swizzle1(b,1,0,3,2)),
+              0,2,0,2),
+            0,2,1,3);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
+{ eigen_assert(false && "packet integer division are not supported by SSE");
+  return pset1<Packet4i>(0);
+}
+
+// for some weird raisons, it has to be overloaded for packet of integers
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_min_epi32(a,b);
+#else
+  // after some bench, this version *is* faster than a scalar implementation
+  Packet4i mask = _mm_cmplt_epi32(a,b);
+  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_max_epi32(a,b);
+#else
+  // after some bench, this version *is* faster than a scalar implementation
+  Packet4i mask = _mm_cmpgt_epi32(a,b);
+  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
+
+#if defined(_MSC_VER)
+  template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) {
+    EIGEN_DEBUG_UNALIGNED_LOAD
+    #if (_MSC_VER==1600)
+    // NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps
+    // (i.e., it does not generate an unaligned load!!
+    // TODO On most architectures this version should also be faster than a single _mm_loadu_ps
+    // so we could also enable it for MSVC08 but first we have to make this later does not generate crap when doing so...
+    __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
+    res = _mm_loadh_pi(res, (const __m64*)(from+2));
+    return res;
+    #else
+    return _mm_loadu_ps(from);
+    #endif
+  }
+  template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
+  template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
+#else
+// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
+// require pointer casting to incompatible pointer types and leads to invalid code
+// because of the strict aliasing rule. The "dummy" stuff are required to enforce
+// a correct instruction dependency.
+// TODO: do the same for MSVC (ICC is compatible)
+// NOTE: with the code below, MSVC's compiler crashes!
+
+#if defined(__GNUC__) && defined(__i386__)
+  // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
+  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
+#elif defined(__clang__)
+  // bug 201: Segfaults in __mm_loadh_pd with clang 2.8
+  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
+#else
+  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
+  return _mm_loadu_ps(from);
+#else
+  __m128d res;
+  res =  _mm_load_sd((const double*)(from)) ;
+  res =  _mm_loadh_pd(res, (const double*)(from+2)) ;
+  return _mm_castpd_ps(res);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
+  return _mm_loadu_pd(from);
+#else
+  __m128d res;
+  res = _mm_load_sd(from) ;
+  res = _mm_loadh_pd(res,from+1);
+  return res;
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
+  return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
+#else
+  __m128d res;
+  res =  _mm_load_sd((const double*)(from)) ;
+  res =  _mm_loadh_pd(res, (const double*)(from+2)) ;
+  return _mm_castpd_si128(res);
+#endif
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
+{
+  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
+}
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*  from)
+{ return pset1<Packet2d>(from[0]); }
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
+{
+  Packet4i tmp;
+  tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
+  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+  _mm_storel_pd((to), from);
+  _mm_storeh_pd((to+1), from);
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); }
+
+// some compilers might be tempted to perform multiple moves instead of using a vector path.
+template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
+{
+  Packet4f pa = _mm_set_ss(a);
+  pstore(to, vec4f_swizzle1(pa,0,0,0,0));
+}
+// some compilers might be tempted to perform multiple moves instead of using a vector path.
+template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
+{
+  Packet2d pa = _mm_set_sd(a);
+  pstore(to, vec2d_swizzle1(pa,0,0));
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+
+#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
+// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
+// Direct of the struct members fixed bug #62.
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
+#elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
+#else
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{ return _mm_shuffle_ps(a,a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{ return _mm_shuffle_pd(a,a,0x1); }
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{ return _mm_shuffle_epi32(a,0x1B); }
+
+
+template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
+{
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
+  return _mm_and_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
+{
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
+  return _mm_and_pd(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
+{
+  #ifdef EIGEN_VECTORIZE_SSSE3
+  return _mm_abs_epi32(a);
+  #else
+  Packet4i aux = _mm_srai_epi32(a,31);
+  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
+  #endif
+}
+
+EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
+{
+  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
+  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
+  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
+  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
+}
+
+#ifdef EIGEN_VECTORIZE_SSE3
+// TODO implement SSE2 versions as well as integer versions
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
+}
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  return _mm_hadd_pd(vecs[0], vecs[1]);
+}
+// SSSE3 version:
+// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs)
+// {
+//   return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
+// }
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp0 = _mm_hadd_ps(a,a);
+  return pfirst(_mm_hadd_ps(tmp0, tmp0));
+}
+
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); }
+
+// SSSE3 version:
+// EIGEN_STRONG_INLINE float predux(const Packet4i& a)
+// {
+//   Packet4i tmp0 = _mm_hadd_epi32(a,a);
+//   return pfirst(_mm_hadd_epi32(tmp0, tmp0));
+// }
+#else
+// SSE2 versions
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
+  return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  Packet4f tmp0, tmp1, tmp2;
+  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
+  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
+  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
+  tmp0 = _mm_add_ps(tmp0, tmp1);
+  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
+  tmp1 = _mm_add_ps(tmp1, tmp2);
+  tmp2 = _mm_movehl_ps(tmp1, tmp0);
+  tmp0 = _mm_movelh_ps(tmp0, tmp1);
+  return _mm_add_ps(tmp0, tmp2);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
+}
+#endif  // SSE3
+
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
+  return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  Packet4i tmp0, tmp1, tmp2;
+  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+  tmp0 = _mm_add_epi32(tmp0, tmp1);
+  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  tmp1 = _mm_add_epi32(tmp1, tmp2);
+  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
+  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
+  return _mm_add_epi32(tmp0, tmp2);
+}
+
+// Other reduction functions:
+
+// mul
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
+  return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{
+  return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+{
+  // after some experiments, it is seems this is the fastest way to implement it
+  // for GCC (eg., reusing pmul is very slow !)
+  // TODO try to call _mm_mul_epu32 directly
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);;
+}
+
+// min
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
+  return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{
+  return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+{
+  // after some experiments, it is seems this is the fastest way to implement it
+  // for GCC (eg., it does not like using std::min after the pstore !!)
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
+  return aux0<aux2 ? aux0 : aux2;
+}
+
+// max
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
+  return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{
+  return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+{
+  // after some experiments, it is seems this is the fastest way to implement it
+  // for GCC (eg., it does not like using std::min after the pstore !!)
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
+  return aux0>aux2 ? aux0 : aux2;
+}
+
+#if (defined __GNUC__)
+// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&  a, const Packet4f&  b, const Packet4f&  c)
+// {
+//   Packet4f res = b;
+//   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
+//   return res;
+// }
+// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i&  a, const Packet4i&  b, const int i)
+// {
+//   Packet4i res = a;
+//   asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
+//   return res;
+// }
+#endif
+
+#ifdef EIGEN_VECTORIZE_SSSE3
+// SSSE3 versions
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+  {
+    if (Offset!=0)
+      first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet4i>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
+  {
+    if (Offset!=0)
+      first = _mm_alignr_epi8(second,first, Offset*4);
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet2d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
+  {
+    if (Offset==1)
+      first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
+  }
+};
+#else
+// SSE2 versions
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_move_ss(first,second);
+      first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
+    }
+    else if (Offset==2)
+    {
+      first = _mm_movehl_ps(first,first);
+      first = _mm_movelh_ps(first,second);
+    }
+    else if (Offset==3)
+    {
+      first = _mm_move_ss(first,second);
+      first = _mm_shuffle_ps(first,second,0x93);
+    }
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet4i>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+      first = _mm_shuffle_epi32(first,0x39);
+    }
+    else if (Offset==2)
+    {
+      first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
+      first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+    }
+    else if (Offset==3)
+    {
+      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+      first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
+    }
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet2d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
+      first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
+    }
+  }
+};
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_SSE_H
diff --git a/include/eigen3/Eigen/src/Core/products/CoeffBasedProduct.h b/include/eigen3/Eigen/src/Core/products/CoeffBasedProduct.h
new file mode 100644
index 0000000..421f925
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/CoeffBasedProduct.h
@@ -0,0 +1,439 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COEFFBASED_PRODUCT_H
+#define EIGEN_COEFFBASED_PRODUCT_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/*********************************************************************************
+*  Coefficient based product implementation.
+*  It is designed for the following use cases:
+*  - small fixed sizes
+*  - lazy products
+*********************************************************************************/
+
+/* Since the all the dimensions of the product are small, here we can rely
+ * on the generic Assign mechanism to evaluate the product per coeff (or packet).
+ *
+ * Note that here the inner-loops should always be unrolled.
+ */
+
+template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl;
+
+template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl;
+
+template<typename LhsNested, typename RhsNested, int NestingFlags>
+struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
+{
+  typedef MatrixXpr XprKind;
+  typedef typename remove_all<LhsNested>::type _LhsNested;
+  typedef typename remove_all<RhsNested>::type _RhsNested;
+  typedef typename scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;
+  typedef typename promote_storage_type<typename traits<_LhsNested>::StorageKind,
+                                           typename traits<_RhsNested>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<_LhsNested>::Index,
+                                         typename traits<_RhsNested>::Index>::type Index;
+
+  enum {
+      LhsCoeffReadCost = _LhsNested::CoeffReadCost,
+      RhsCoeffReadCost = _RhsNested::CoeffReadCost,
+      LhsFlags = _LhsNested::Flags,
+      RhsFlags = _RhsNested::Flags,
+
+      RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
+      ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
+      InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
+
+      MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
+
+      LhsRowMajor = LhsFlags & RowMajorBit,
+      RhsRowMajor = RhsFlags & RowMajorBit,
+
+      SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
+
+      CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
+                      && (ColsAtCompileTime == Dynamic
+                          || ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
+                              && (RhsFlags&AlignedBit)
+                             )
+                         ),
+
+      CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
+                      && (RowsAtCompileTime == Dynamic
+                          || ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
+                              && (LhsFlags&AlignedBit)
+                             )
+                         ),
+
+      EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+                     : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+                     : (RhsRowMajor && !CanVectorizeLhs),
+
+      Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
+            | (EvalToRowMajor ? RowMajorBit : 0)
+            | NestingFlags
+            | (LhsFlags & RhsFlags & AlignedBit)
+            // TODO enable vectorization for mixed types
+            | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
+
+      CoeffReadCost = InnerSize == Dynamic ? Dynamic
+                    : InnerSize == 0 ? 0
+                    : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
+                      + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
+
+      /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
+      * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
+      * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
+      * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
+      */
+      CanVectorizeInner =    SameType
+                          && LhsRowMajor
+                          && (!RhsRowMajor)
+                          && (LhsFlags & RhsFlags & ActualPacketAccessBit)
+                          && (LhsFlags & RhsFlags & AlignedBit)
+                          && (InnerSize % packet_traits<Scalar>::size == 0)
+    };
+};
+
+} // end namespace internal
+
+template<typename LhsNested, typename RhsNested, int NestingFlags>
+class CoeffBasedProduct
+  : internal::no_assignment_operator,
+    public MatrixBase<CoeffBasedProduct<LhsNested, RhsNested, NestingFlags> >
+{
+  public:
+
+    typedef MatrixBase<CoeffBasedProduct> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(CoeffBasedProduct)
+    typedef typename Base::PlainObject PlainObject;
+
+  private:
+
+    typedef typename internal::traits<CoeffBasedProduct>::_LhsNested _LhsNested;
+    typedef typename internal::traits<CoeffBasedProduct>::_RhsNested _RhsNested;
+
+    enum {
+      PacketSize = internal::packet_traits<Scalar>::size,
+      InnerSize  = internal::traits<CoeffBasedProduct>::InnerSize,
+      Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
+      CanVectorizeInner = internal::traits<CoeffBasedProduct>::CanVectorizeInner
+    };
+
+    typedef internal::product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
+                                   Unroll ? (InnerSize==0 ? 0 : InnerSize-1) : Dynamic,
+                                   _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;
+
+    typedef CoeffBasedProduct<LhsNested,RhsNested,NestByRefBit> LazyCoeffBasedProductType;
+
+  public:
+
+    inline CoeffBasedProduct(const CoeffBasedProduct& other)
+      : Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs)
+    {}
+
+    template<typename Lhs, typename Rhs>
+    inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs)
+      : m_lhs(lhs), m_rhs(rhs)
+    {
+      // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
+      // We still allow to mix T and complex<T>.
+      EIGEN_STATIC_ASSERT((internal::scalar_product_traits<typename Lhs::RealScalar, typename Rhs::RealScalar>::Defined),
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+      eigen_assert(lhs.cols() == rhs.rows()
+        && "invalid matrix product"
+        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
+    }
+
+    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
+
+    EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+    {
+      Scalar res;
+      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
+      return res;
+    }
+
+    /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
+     * which is why we don't set the LinearAccessBit.
+     */
+    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+    {
+      Scalar res;
+      const Index row = RowsAtCompileTime == 1 ? 0 : index;
+      const Index col = RowsAtCompileTime == 1 ? index : 0;
+      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
+      return res;
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE const PacketScalar packet(Index row, Index col) const
+    {
+      PacketScalar res;
+      internal::product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
+                              Unroll ? (InnerSize==0 ? 0 : InnerSize-1) : Dynamic,
+                              _LhsNested, _RhsNested, PacketScalar, LoadMode>
+        ::run(row, col, m_lhs, m_rhs, res);
+      return res;
+    }
+
+    // Implicit conversion to the nested type (trigger the evaluation of the product)
+    EIGEN_STRONG_INLINE operator const PlainObject& () const
+    {
+      m_result.lazyAssign(*this);
+      return m_result;
+    }
+
+    const _LhsNested& lhs() const { return m_lhs; }
+    const _RhsNested& rhs() const { return m_rhs; }
+
+    const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const
+    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
+
+    template<int DiagonalIndex>
+    const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const
+    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
+
+    const Diagonal<const LazyCoeffBasedProductType,Dynamic> diagonal(Index index) const
+    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this).diagonal(index); }
+
+  protected:
+    typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
+    typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
+
+    mutable PlainObject m_result;
+};
+
+namespace internal {
+
+// here we need to overload the nested rule for products
+// such that the nested type is a const reference to a plain matrix
+template<typename Lhs, typename Rhs, int N, typename PlainObject>
+struct nested<CoeffBasedProduct<Lhs,Rhs,EvalBeforeNestingBit|EvalBeforeAssigningBit>, N, PlainObject>
+{
+  typedef PlainObject const& type;
+};
+
+/***************************************************************************
+* Normal product .coeff() implementation (with meta-unrolling)
+***************************************************************************/
+
+/**************************************
+*** Scalar path  - no vectorization ***
+**************************************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
+  {
+    product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
+    res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
+  {
+    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
+  {
+    res = (lhs.row(row).transpose().cwiseProduct( rhs.col(col) )).sum();
+  }
+};
+
+/*******************************************
+*** Scalar path with inner vectorization ***
+*******************************************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
+struct product_coeff_vectorized_unroller
+{
+  typedef typename Lhs::Index Index;
+  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
+  {
+    product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
+    pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet>
+struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
+  {
+    pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
+  }
+};
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::PacketScalar Packet;
+  typedef typename Lhs::Index Index;
+  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
+  {
+    Packet pres;
+    product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
+    product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res);
+    res = predux(pres);
+  }
+};
+
+template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
+struct product_coeff_vectorized_dyn_selector
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+  {
+    res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
+  }
+};
+
+// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower
+// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
+template<typename Lhs, typename Rhs, int RhsCols>
+struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+  {
+    res = lhs.transpose().cwiseProduct(rhs.col(col)).sum();
+  }
+};
+
+template<typename Lhs, typename Rhs, int LhsRows>
+struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+  {
+    res = lhs.row(row).transpose().cwiseProduct(rhs).sum();
+  }
+};
+
+template<typename Lhs, typename Rhs>
+struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+  {
+    res = lhs.transpose().cwiseProduct(rhs).sum();
+  }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+  {
+    product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
+  }
+};
+
+/*******************
+*** Packet path  ***
+*******************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
+  {
+    product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
+    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
+  }
+};
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
+  {
+    product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
+    res =  pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
+  {
+    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
+  {
+    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
+  {
+    eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
+    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
+      for(Index i = 1; i < lhs.cols(); ++i)
+        res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
+  {
+    eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
+    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+      for(Index i = 1; i < lhs.cols(); ++i)
+        res =  pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COEFFBASED_PRODUCT_H
diff --git a/include/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/include/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h
new file mode 100644
index 0000000..bcdca5b
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -0,0 +1,1341 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
+#define EIGEN_GENERAL_BLOCK_PANEL_H
+
+namespace Eigen { 
+  
+namespace internal {
+
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
+class gebp_traits;
+
+
+/** \internal \returns b if a<=0, and returns a otherwise. */
+inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
+{
+  return a<=0 ? b : a;
+}
+
+/** \internal */
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
+{
+  static std::ptrdiff_t m_l1CacheSize = 0;
+  static std::ptrdiff_t m_l2CacheSize = 0;
+  if(m_l2CacheSize==0)
+  {
+    m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
+    m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
+  }
+  
+  if(action==SetAction)
+  {
+    // set the cpu cache size and cache all block sizes from a global cache size in byte
+    eigen_internal_assert(l1!=0 && l2!=0);
+    m_l1CacheSize = *l1;
+    m_l2CacheSize = *l2;
+  }
+  else if(action==GetAction)
+  {
+    eigen_internal_assert(l1!=0 && l2!=0);
+    *l1 = m_l1CacheSize;
+    *l2 = m_l2CacheSize;
+  }
+  else
+  {
+    eigen_internal_assert(false);
+  }
+}
+
+/** \brief Computes the blocking parameters for a m x k times k x n matrix product
+  *
+  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
+  * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
+  * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
+  *
+  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+  * this function computes the blocking size parameters along the respective dimensions
+  * for matrix products and related algorithms. The blocking sizes depends on various
+  * parameters:
+  * - the L1 and L2 cache sizes,
+  * - the register level blocking sizes defined by gebp_traits,
+  * - the number of scalars that fit into a packet (when vectorization is enabled).
+  *
+  * \sa setCpuCacheSizes */
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
+void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+{
+  EIGEN_UNUSED_VARIABLE(n);
+  // Explanations:
+  // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
+  // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
+  // per kc x nr vertical small panels where nr is the blocking size along the n dimension
+  // at the register level. For vectorization purpose, these small vertical panels are unpacked,
+  // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
+  // stay in L1 cache.
+  std::ptrdiff_t l1, l2;
+
+  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+  enum {
+    kdiv = KcFactor * 2 * Traits::nr
+         * Traits::RhsProgress * sizeof(RhsScalar),
+    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
+    mr_mask = (0xffffffff/mr)*mr
+  };
+
+  manage_caching_sizes(GetAction, &l1, &l2);
+  k = std::min<SizeType>(k, l1/kdiv);
+  SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
+  if(_m<m) m = _m & mr_mask;
+}
+
+template<typename LhsScalar, typename RhsScalar, typename SizeType>
+inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+{
+  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
+}
+
+#ifdef EIGEN_HAS_FUSE_CJMADD
+  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
+#else
+
+  // FIXME (a bit overkill maybe ?)
+
+  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
+    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
+    {
+      c = cj.pmadd(a,b,c);
+    }
+  };
+
+  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
+    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
+    {
+      t = b; t = cj.pmul(a,t); c = padd(c,t);
+    }
+  };
+
+  template<typename CJ, typename A, typename B, typename C, typename T>
+  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
+  {
+    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
+  }
+
+  #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
+//   #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
+#endif
+
+/* Vectorization logic
+ *  real*real: unpack rhs to constant packets, ...
+ * 
+ *  cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
+ *          storing each res packet into two packets (2x2),
+ *          at the end combine them: swap the second and addsub them 
+ *  cf*cf : same but with 2x4 blocks
+ *  cplx*real : unpack rhs to constant packets, ...
+ *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
+ */
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
+class gebp_traits
+{
+public:
+  typedef _LhsScalar LhsScalar;
+  typedef _RhsScalar RhsScalar;
+  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  enum {
+    ConjLhs = _ConjLhs,
+    ConjRhs = _ConjRhs,
+    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
+    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+    
+    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+
+    // register block size along the N direction (must be either 2 or 4)
+    nr = NumberOfRegisters/4,
+
+    // register block size along the M direction (currently, this one cannot be modified)
+    mr = 2 * LhsPacketSize,
+    
+    WorkSpaceFactor = nr * RhsPacketSize,
+
+    LhsProgress = LhsPacketSize,
+    RhsProgress = RhsPacketSize
+  };
+
+  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
+  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
+  typedef typename packet_traits<ResScalar>::type  _ResPacket;
+
+  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+  typedef ResPacket AccPacket;
+  
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+  {
+    p = pset1<ResPacket>(ResScalar(0));
+  }
+
+  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+  {
+    for(DenseIndex k=0; k<n; k++)
+      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = pload<RhsPacket>(b);
+  }
+
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = pload<LhsPacket>(a);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
+  {
+    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+  }
+
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  {
+    r = pmadd(c,alpha,r);
+  }
+
+protected:
+//   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
+//   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
+};
+
+template<typename RealScalar, bool _ConjLhs>
+class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
+{
+public:
+  typedef std::complex<RealScalar> LhsScalar;
+  typedef RealScalar RhsScalar;
+  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  enum {
+    ConjLhs = _ConjLhs,
+    ConjRhs = false,
+    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
+    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+    
+    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+    nr = NumberOfRegisters/4,
+    mr = 2 * LhsPacketSize,
+    WorkSpaceFactor = nr*RhsPacketSize,
+
+    LhsProgress = LhsPacketSize,
+    RhsProgress = RhsPacketSize
+  };
+
+  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
+  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
+  typedef typename packet_traits<ResScalar>::type  _ResPacket;
+
+  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+  typedef ResPacket AccPacket;
+
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+  {
+    p = pset1<ResPacket>(ResScalar(0));
+  }
+
+  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+  {
+    for(DenseIndex k=0; k<n; k++)
+      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = pload<RhsPacket>(b);
+  }
+
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = pload<LhsPacket>(a);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+  {
+    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+  }
+
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+  {
+    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
+  }
+
+  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
+  {
+    c += a * b;
+  }
+
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  {
+    r = cj.pmadd(c,alpha,r);
+  }
+
+protected:
+  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
+};
+
+template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
+{
+public:
+  typedef std::complex<RealScalar>  Scalar;
+  typedef std::complex<RealScalar>  LhsScalar;
+  typedef std::complex<RealScalar>  RhsScalar;
+  typedef std::complex<RealScalar>  ResScalar;
+  
+  enum {
+    ConjLhs = _ConjLhs,
+    ConjRhs = _ConjRhs,
+    Vectorizable = packet_traits<RealScalar>::Vectorizable
+                && packet_traits<Scalar>::Vectorizable,
+    RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
+    ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,
+    
+    nr = 2,
+    mr = 2 * ResPacketSize,
+    WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
+
+    LhsProgress = ResPacketSize,
+    RhsProgress = Vectorizable ? 2*ResPacketSize : 1
+  };
+  
+  typedef typename packet_traits<RealScalar>::type RealPacket;
+  typedef typename packet_traits<Scalar>::type     ScalarPacket;
+  struct DoublePacket
+  {
+    RealPacket first;
+    RealPacket second;
+  };
+
+  typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
+  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;
+  
+  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
+
+  EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
+  {
+    p.first   = pset1<RealPacket>(RealScalar(0));
+    p.second  = pset1<RealPacket>(RealScalar(0));
+  }
+
+  /* Unpack the rhs coeff such that each complex coefficient is spread into
+   * two packects containing respectively the real and imaginary coefficient
+   * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y]
+   */
+  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
+  {
+    for(DenseIndex k=0; k<n; k++)
+    {
+      if(Vectorizable)
+      {
+        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0],             real(rhs[k]));
+        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
+      }
+      else
+        b[k] = rhs[k];
+    }
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
+  {
+    dest.first  = pload<RealPacket>((const RealScalar*)b);
+    dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
+  }
+
+  // nothing special here
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
+  {
+    c.first   = padd(pmul(a,b.first), c.first);
+    c.second  = padd(pmul(a,b.second),c.second);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
+  {
+    c = cj.pmadd(a,b,c);
+  }
+  
+  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
+  
+  EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
+  {
+    // assemble c
+    ResPacket tmp;
+    if((!ConjLhs)&&(!ConjRhs))
+    {
+      tmp = pcplxflip(pconj(ResPacket(c.second)));
+      tmp = padd(ResPacket(c.first),tmp);
+    }
+    else if((!ConjLhs)&&(ConjRhs))
+    {
+      tmp = pconj(pcplxflip(ResPacket(c.second)));
+      tmp = padd(ResPacket(c.first),tmp);
+    }
+    else if((ConjLhs)&&(!ConjRhs))
+    {
+      tmp = pcplxflip(ResPacket(c.second));
+      tmp = padd(pconj(ResPacket(c.first)),tmp);
+    }
+    else if((ConjLhs)&&(ConjRhs))
+    {
+      tmp = pcplxflip(ResPacket(c.second));
+      tmp = psub(pconj(ResPacket(c.first)),tmp);
+    }
+    
+    r = pmadd(tmp,alpha,r);
+  }
+
+protected:
+  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
+};
+
+template<typename RealScalar, bool _ConjRhs>
+class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
+{
+public:
+  typedef std::complex<RealScalar>  Scalar;
+  typedef RealScalar  LhsScalar;
+  typedef Scalar      RhsScalar;
+  typedef Scalar      ResScalar;
+
+  enum {
+    ConjLhs = false,
+    ConjRhs = _ConjRhs,
+    Vectorizable = packet_traits<RealScalar>::Vectorizable
+                && packet_traits<Scalar>::Vectorizable,
+    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+    
+    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+    nr = 4,
+    mr = 2*ResPacketSize,
+    WorkSpaceFactor = nr*RhsPacketSize,
+
+    LhsProgress = ResPacketSize,
+    RhsProgress = ResPacketSize
+  };
+
+  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
+  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
+  typedef typename packet_traits<ResScalar>::type  _ResPacket;
+
+  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+  typedef ResPacket AccPacket;
+
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+  {
+    p = pset1<ResPacket>(ResScalar(0));
+  }
+
+  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+  {
+    for(DenseIndex k=0; k<n; k++)
+      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = pload<RhsPacket>(b);
+  }
+
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = ploaddup<LhsPacket>(a);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+  {
+    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+  }
+
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+  {
+    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
+  }
+
+  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
+  {
+    c += a * b;
+  }
+
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  {
+    r = cj.pmadd(alpha,c,r);
+  }
+
+protected:
+  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
+};
+
+/* optimized GEneral packed Block * packed Panel product kernel
+ *
+ * Mixing type logic: C += A * B
+ *  |  A  |  B  | comments
+ *  |real |cplx | no vectorization yet, would require to pack A with duplication
+ *  |cplx |real | easy vectorization
+ */
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel
+{
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
+  typedef typename Traits::ResScalar ResScalar;
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
+  typedef typename Traits::AccPacket AccPacket;
+
+  enum {
+    Vectorizable  = Traits::Vectorizable,
+    LhsProgress   = Traits::LhsProgress,
+    RhsProgress   = Traits::RhsProgress,
+    ResPacketSize = Traits::ResPacketSize
+  };
+
+  EIGEN_DONT_INLINE
+  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0);
+};
+
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE
+void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
+  ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB)
+  {
+    Traits traits;
+    
+    if(strideA==-1) strideA = depth;
+    if(strideB==-1) strideB = depth;
+    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+//     conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
+    Index packet_cols = (cols/nr) * nr;
+    const Index peeled_mc = (rows/mr)*mr;
+    // FIXME:
+    const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
+    const Index peeled_kc = (depth/4)*4;
+
+    if(unpackedB==0)
+      unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);
+
+    // loops on each micro vertical panel of rhs (depth x nr)
+    for(Index j2=0; j2<packet_cols; j2+=nr)
+    {
+      traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB); 
+
+      // loops on each largest micro horizontal panel of lhs (mr x depth)
+      // => we select a mr x nr micro block of res which is entirely
+      //    stored into mr/packet_size x nr registers.
+      for(Index i=0; i<peeled_mc; i+=mr)
+      {
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
+                  traits.initAcc(C0);
+                  traits.initAcc(C1);
+        if(nr==4) traits.initAcc(C2);
+        if(nr==4) traits.initAcc(C3);
+                  traits.initAcc(C4);
+                  traits.initAcc(C5);
+        if(nr==4) traits.initAcc(C6);
+        if(nr==4) traits.initAcc(C7);
+
+        ResScalar* r0 = &res[(j2+0)*resStride + i];
+        ResScalar* r1 = r0 + resStride;
+        ResScalar* r2 = r1 + resStride;
+        ResScalar* r3 = r2 + resStride;
+
+        prefetch(r0+16);
+        prefetch(r1+16);
+        prefetch(r2+16);
+        prefetch(r3+16);
+
+        // performs "inner" product
+        // TODO let's check wether the folowing peeled loop could not be
+        //      optimized via optimal prefetching from one loop to the other
+        const RhsScalar* blB = unpackedB;
+        for(Index k=0; k<peeled_kc; k+=4)
+        {
+          if(nr==2)
+          {
+            LhsPacket A0, A1;
+            RhsPacket B_0;
+            RhsPacket T0;
+            
+EIGEN_ASM_COMMENT("mybegin2");
+            traits.loadLhs(&blA[0*LhsProgress], A0);
+            traits.loadLhs(&blA[1*LhsProgress], A1);
+            traits.loadRhs(&blB[0*RhsProgress], B_0);
+            traits.madd(A0,B_0,C0,T0);
+            traits.madd(A1,B_0,C4,B_0);
+            traits.loadRhs(&blB[1*RhsProgress], B_0);
+            traits.madd(A0,B_0,C1,T0);
+            traits.madd(A1,B_0,C5,B_0);
+
+            traits.loadLhs(&blA[2*LhsProgress], A0);
+            traits.loadLhs(&blA[3*LhsProgress], A1);
+            traits.loadRhs(&blB[2*RhsProgress], B_0);
+            traits.madd(A0,B_0,C0,T0);
+            traits.madd(A1,B_0,C4,B_0);
+            traits.loadRhs(&blB[3*RhsProgress], B_0);
+            traits.madd(A0,B_0,C1,T0);
+            traits.madd(A1,B_0,C5,B_0);
+
+            traits.loadLhs(&blA[4*LhsProgress], A0);
+            traits.loadLhs(&blA[5*LhsProgress], A1);
+            traits.loadRhs(&blB[4*RhsProgress], B_0);
+            traits.madd(A0,B_0,C0,T0);
+            traits.madd(A1,B_0,C4,B_0);
+            traits.loadRhs(&blB[5*RhsProgress], B_0);
+            traits.madd(A0,B_0,C1,T0);
+            traits.madd(A1,B_0,C5,B_0);
+
+            traits.loadLhs(&blA[6*LhsProgress], A0);
+            traits.loadLhs(&blA[7*LhsProgress], A1);
+            traits.loadRhs(&blB[6*RhsProgress], B_0);
+            traits.madd(A0,B_0,C0,T0);
+            traits.madd(A1,B_0,C4,B_0);
+            traits.loadRhs(&blB[7*RhsProgress], B_0);
+            traits.madd(A0,B_0,C1,T0);
+            traits.madd(A1,B_0,C5,B_0);
+EIGEN_ASM_COMMENT("myend");
+          }
+          else
+          {
+EIGEN_ASM_COMMENT("mybegin4");
+            LhsPacket A0, A1;
+            RhsPacket B_0, B1, B2, B3;
+            RhsPacket T0;
+            
+            traits.loadLhs(&blA[0*LhsProgress], A0);
+            traits.loadLhs(&blA[1*LhsProgress], A1);
+            traits.loadRhs(&blB[0*RhsProgress], B_0);
+            traits.loadRhs(&blB[1*RhsProgress], B1);
+
+            traits.madd(A0,B_0,C0,T0);
+            traits.loadRhs(&blB[2*RhsProgress], B2);
+            traits.madd(A1,B_0,C4,B_0);
+            traits.loadRhs(&blB[3*RhsProgress], B3);
+            traits.loadRhs(&blB[4*RhsProgress], B_0);
+            traits.madd(A0,B1,C1,T0);
+            traits.madd(A1,B1,C5,B1);
+            traits.loadRhs(&blB[5*RhsProgress], B1);
+            traits.madd(A0,B2,C2,T0);
+            traits.madd(A1,B2,C6,B2);
+            traits.loadRhs(&blB[6*RhsProgress], B2);
+            traits.madd(A0,B3,C3,T0);
+            traits.loadLhs(&blA[2*LhsProgress], A0);
+            traits.madd(A1,B3,C7,B3);
+            traits.loadLhs(&blA[3*LhsProgress], A1);
+            traits.loadRhs(&blB[7*RhsProgress], B3);
+            traits.madd(A0,B_0,C0,T0);
+            traits.madd(A1,B_0,C4,B_0);
+            traits.loadRhs(&blB[8*RhsProgress], B_0);
+            traits.madd(A0,B1,C1,T0);
+            traits.madd(A1,B1,C5,B1);
+            traits.loadRhs(&blB[9*RhsProgress], B1);
+            traits.madd(A0,B2,C2,T0);
+            traits.madd(A1,B2,C6,B2);
+            traits.loadRhs(&blB[10*RhsProgress], B2);
+            traits.madd(A0,B3,C3,T0);
+            traits.loadLhs(&blA[4*LhsProgress], A0);
+            traits.madd(A1,B3,C7,B3);
+            traits.loadLhs(&blA[5*LhsProgress], A1);
+            traits.loadRhs(&blB[11*RhsProgress], B3);
+
+            traits.madd(A0,B_0,C0,T0);
+            traits.madd(A1,B_0,C4,B_0);
+            traits.loadRhs(&blB[12*RhsProgress], B_0);
+            traits.madd(A0,B1,C1,T0);
+            traits.madd(A1,B1,C5,B1);
+            traits.loadRhs(&blB[13*RhsProgress], B1);
+            traits.madd(A0,B2,C2,T0);
+            traits.madd(A1,B2,C6,B2);
+            traits.loadRhs(&blB[14*RhsProgress], B2);
+            traits.madd(A0,B3,C3,T0);
+            traits.loadLhs(&blA[6*LhsProgress], A0);
+            traits.madd(A1,B3,C7,B3);
+            traits.loadLhs(&blA[7*LhsProgress], A1);
+            traits.loadRhs(&blB[15*RhsProgress], B3);
+            traits.madd(A0,B_0,C0,T0);
+            traits.madd(A1,B_0,C4,B_0);
+            traits.madd(A0,B1,C1,T0);
+            traits.madd(A1,B1,C5,B1);
+            traits.madd(A0,B2,C2,T0);
+            traits.madd(A1,B2,C6,B2);
+            traits.madd(A0,B3,C3,T0);
+            traits.madd(A1,B3,C7,B3);
+          }
+
+          blB += 4*nr*RhsProgress;
+          blA += 4*mr;
+        }
+        // process remaining peeled loop
+        for(Index k=peeled_kc; k<depth; k++)
+        {
+          if(nr==2)
+          {
+            LhsPacket A0, A1;
+            RhsPacket B_0;
+            RhsPacket T0;
+
+            traits.loadLhs(&blA[0*LhsProgress], A0);
+            traits.loadLhs(&blA[1*LhsProgress], A1);
+            traits.loadRhs(&blB[0*RhsProgress], B_0);
+            traits.madd(A0,B_0,C0,T0);
+            traits.madd(A1,B_0,C4,B_0);
+            traits.loadRhs(&blB[1*RhsProgress], B_0);
+            traits.madd(A0,B_0,C1,T0);
+            traits.madd(A1,B_0,C5,B_0);
+          }
+          else
+          {
+            LhsPacket A0, A1;
+            RhsPacket B_0, B1, B2, B3;
+            RhsPacket T0;
+
+            traits.loadLhs(&blA[0*LhsProgress], A0);
+            traits.loadLhs(&blA[1*LhsProgress], A1);
+            traits.loadRhs(&blB[0*RhsProgress], B_0);
+            traits.loadRhs(&blB[1*RhsProgress], B1);
+
+            traits.madd(A0,B_0,C0,T0);
+            traits.loadRhs(&blB[2*RhsProgress], B2);
+            traits.madd(A1,B_0,C4,B_0);
+            traits.loadRhs(&blB[3*RhsProgress], B3);
+            traits.madd(A0,B1,C1,T0);
+            traits.madd(A1,B1,C5,B1);
+            traits.madd(A0,B2,C2,T0);
+            traits.madd(A1,B2,C6,B2);
+            traits.madd(A0,B3,C3,T0);
+            traits.madd(A1,B3,C7,B3);
+          }
+
+          blB += nr*RhsProgress;
+          blA += mr;
+        }
+
+        if(nr==4)
+        {
+          ResPacket R0, R1, R2, R3, R4, R5, R6;
+          ResPacket alphav = pset1<ResPacket>(alpha);
+
+          R0 = ploadu<ResPacket>(r0);
+          R1 = ploadu<ResPacket>(r1);
+          R2 = ploadu<ResPacket>(r2);
+          R3 = ploadu<ResPacket>(r3);
+          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
+          R5 = ploadu<ResPacket>(r1 + ResPacketSize);
+          R6 = ploadu<ResPacket>(r2 + ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          pstoreu(r0, R0);
+          R0 = ploadu<ResPacket>(r3 + ResPacketSize);
+
+          traits.acc(C1, alphav, R1);
+          traits.acc(C2, alphav, R2);
+          traits.acc(C3, alphav, R3);
+          traits.acc(C4, alphav, R4);
+          traits.acc(C5, alphav, R5);
+          traits.acc(C6, alphav, R6);
+          traits.acc(C7, alphav, R0);
+          
+          pstoreu(r1, R1);
+          pstoreu(r2, R2);
+          pstoreu(r3, R3);
+          pstoreu(r0 + ResPacketSize, R4);
+          pstoreu(r1 + ResPacketSize, R5);
+          pstoreu(r2 + ResPacketSize, R6);
+          pstoreu(r3 + ResPacketSize, R0);
+        }
+        else
+        {
+          ResPacket R0, R1, R4;
+          ResPacket alphav = pset1<ResPacket>(alpha);
+
+          R0 = ploadu<ResPacket>(r0);
+          R1 = ploadu<ResPacket>(r1);
+          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          pstoreu(r0, R0);
+          R0 = ploadu<ResPacket>(r1 + ResPacketSize);
+          traits.acc(C1, alphav, R1);
+          traits.acc(C4, alphav, R4);
+          traits.acc(C5, alphav, R0);
+          pstoreu(r1, R1);
+          pstoreu(r0 + ResPacketSize, R4);
+          pstoreu(r1 + ResPacketSize, R0);
+        }
+        
+      }
+      
+      if(rows-peeled_mc>=LhsProgress)
+      {
+        Index i = peeled_mc;
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0, C1, C2, C3;
+                  traits.initAcc(C0);
+                  traits.initAcc(C1);
+        if(nr==4) traits.initAcc(C2);
+        if(nr==4) traits.initAcc(C3);
+
+        // performs "inner" product
+        const RhsScalar* blB = unpackedB;
+        for(Index k=0; k<peeled_kc; k+=4)
+        {
+          if(nr==2)
+          {
+            LhsPacket A0;
+            RhsPacket B_0, B1;
+
+            traits.loadLhs(&blA[0*LhsProgress], A0);
+            traits.loadRhs(&blB[0*RhsProgress], B_0);
+            traits.loadRhs(&blB[1*RhsProgress], B1);
+            traits.madd(A0,B_0,C0,B_0);
+            traits.loadRhs(&blB[2*RhsProgress], B_0);
+            traits.madd(A0,B1,C1,B1);
+            traits.loadLhs(&blA[1*LhsProgress], A0);
+            traits.loadRhs(&blB[3*RhsProgress], B1);
+            traits.madd(A0,B_0,C0,B_0);
+            traits.loadRhs(&blB[4*RhsProgress], B_0);
+            traits.madd(A0,B1,C1,B1);
+            traits.loadLhs(&blA[2*LhsProgress], A0);
+            traits.loadRhs(&blB[5*RhsProgress], B1);
+            traits.madd(A0,B_0,C0,B_0);
+            traits.loadRhs(&blB[6*RhsProgress], B_0);
+            traits.madd(A0,B1,C1,B1);
+            traits.loadLhs(&blA[3*LhsProgress], A0);
+            traits.loadRhs(&blB[7*RhsProgress], B1);
+            traits.madd(A0,B_0,C0,B_0);
+            traits.madd(A0,B1,C1,B1);
+          }
+          else
+          {
+            LhsPacket A0;
+            RhsPacket B_0, B1, B2, B3;
+
+            traits.loadLhs(&blA[0*LhsProgress], A0);
+            traits.loadRhs(&blB[0*RhsProgress], B_0);
+            traits.loadRhs(&blB[1*RhsProgress], B1);
+
+            traits.madd(A0,B_0,C0,B_0);
+            traits.loadRhs(&blB[2*RhsProgress], B2);
+            traits.loadRhs(&blB[3*RhsProgress], B3);
+            traits.loadRhs(&blB[4*RhsProgress], B_0);
+            traits.madd(A0,B1,C1,B1);
+            traits.loadRhs(&blB[5*RhsProgress], B1);
+            traits.madd(A0,B2,C2,B2);
+            traits.loadRhs(&blB[6*RhsProgress], B2);
+            traits.madd(A0,B3,C3,B3);
+            traits.loadLhs(&blA[1*LhsProgress], A0);
+            traits.loadRhs(&blB[7*RhsProgress], B3);
+            traits.madd(A0,B_0,C0,B_0);
+            traits.loadRhs(&blB[8*RhsProgress], B_0);
+            traits.madd(A0,B1,C1,B1);
+            traits.loadRhs(&blB[9*RhsProgress], B1);
+            traits.madd(A0,B2,C2,B2);
+            traits.loadRhs(&blB[10*RhsProgress], B2);
+            traits.madd(A0,B3,C3,B3);
+            traits.loadLhs(&blA[2*LhsProgress], A0);
+            traits.loadRhs(&blB[11*RhsProgress], B3);
+
+            traits.madd(A0,B_0,C0,B_0);
+            traits.loadRhs(&blB[12*RhsProgress], B_0);
+            traits.madd(A0,B1,C1,B1);
+            traits.loadRhs(&blB[13*RhsProgress], B1);
+            traits.madd(A0,B2,C2,B2);
+            traits.loadRhs(&blB[14*RhsProgress], B2);
+            traits.madd(A0,B3,C3,B3);
+
+            traits.loadLhs(&blA[3*LhsProgress], A0);
+            traits.loadRhs(&blB[15*RhsProgress], B3);
+            traits.madd(A0,B_0,C0,B_0);
+            traits.madd(A0,B1,C1,B1);
+            traits.madd(A0,B2,C2,B2);
+            traits.madd(A0,B3,C3,B3);
+          }
+
+          blB += nr*4*RhsProgress;
+          blA += 4*LhsProgress;
+        }
+        // process remaining peeled loop
+        for(Index k=peeled_kc; k<depth; k++)
+        {
+          if(nr==2)
+          {
+            LhsPacket A0;
+            RhsPacket B_0, B1;
+
+            traits.loadLhs(&blA[0*LhsProgress], A0);
+            traits.loadRhs(&blB[0*RhsProgress], B_0);
+            traits.loadRhs(&blB[1*RhsProgress], B1);
+            traits.madd(A0,B_0,C0,B_0);
+            traits.madd(A0,B1,C1,B1);
+          }
+          else
+          {
+            LhsPacket A0;
+            RhsPacket B_0, B1, B2, B3;
+
+            traits.loadLhs(&blA[0*LhsProgress], A0);
+            traits.loadRhs(&blB[0*RhsProgress], B_0);
+            traits.loadRhs(&blB[1*RhsProgress], B1);
+            traits.loadRhs(&blB[2*RhsProgress], B2);
+            traits.loadRhs(&blB[3*RhsProgress], B3);
+
+            traits.madd(A0,B_0,C0,B_0);
+            traits.madd(A0,B1,C1,B1);
+            traits.madd(A0,B2,C2,B2);
+            traits.madd(A0,B3,C3,B3);
+          }
+
+          blB += nr*RhsProgress;
+          blA += LhsProgress;
+        }
+
+        ResPacket R0, R1, R2, R3;
+        ResPacket alphav = pset1<ResPacket>(alpha);
+
+        ResScalar* r0 = &res[(j2+0)*resStride + i];
+        ResScalar* r1 = r0 + resStride;
+        ResScalar* r2 = r1 + resStride;
+        ResScalar* r3 = r2 + resStride;
+
+                  R0 = ploadu<ResPacket>(r0);
+                  R1 = ploadu<ResPacket>(r1);
+        if(nr==4) R2 = ploadu<ResPacket>(r2);
+        if(nr==4) R3 = ploadu<ResPacket>(r3);
+
+                  traits.acc(C0, alphav, R0);
+                  traits.acc(C1, alphav, R1);
+        if(nr==4) traits.acc(C2, alphav, R2);
+        if(nr==4) traits.acc(C3, alphav, R3);
+
+                  pstoreu(r0, R0);
+                  pstoreu(r1, R1);
+        if(nr==4) pstoreu(r2, R2);
+        if(nr==4) pstoreu(r3, R3);
+      }
+      for(Index i=peeled_mc2; i<rows; i++)
+      {
+        const LhsScalar* blA = &blockA[i*strideA+offsetA];
+        prefetch(&blA[0]);
+
+        // gets a 1 x nr res block as registers
+        ResScalar C0(0), C1(0), C2(0), C3(0);
+        // TODO directly use blockB ???
+        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+        for(Index k=0; k<depth; k++)
+        {
+          if(nr==2)
+          {
+            LhsScalar A0;
+            RhsScalar B_0, B1;
+
+            A0 = blA[k];
+            B_0 = blB[0];
+            B1 = blB[1];
+            MADD(cj,A0,B_0,C0,B_0);
+            MADD(cj,A0,B1,C1,B1);
+          }
+          else
+          {
+            LhsScalar A0;
+            RhsScalar B_0, B1, B2, B3;
+
+            A0 = blA[k];
+            B_0 = blB[0];
+            B1 = blB[1];
+            B2 = blB[2];
+            B3 = blB[3];
+
+            MADD(cj,A0,B_0,C0,B_0);
+            MADD(cj,A0,B1,C1,B1);
+            MADD(cj,A0,B2,C2,B2);
+            MADD(cj,A0,B3,C3,B3);
+          }
+
+          blB += nr;
+        }
+                  res[(j2+0)*resStride + i] += alpha*C0;
+                  res[(j2+1)*resStride + i] += alpha*C1;
+        if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
+        if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
+      }
+    }
+    // process remaining rhs/res columns one at a time
+    // => do the same but with nr==1
+    for(Index j2=packet_cols; j2<cols; j2++)
+    {
+      // unpack B
+      traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);
+
+      for(Index i=0; i<peeled_mc; i+=mr)
+      {
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
+        prefetch(&blA[0]);
+
+        // TODO move the res loads to the stores
+
+        // get res block as registers
+        AccPacket C0, C4;
+        traits.initAcc(C0);
+        traits.initAcc(C4);
+
+        const RhsScalar* blB = unpackedB;
+        for(Index k=0; k<depth; k++)
+        {
+          LhsPacket A0, A1;
+          RhsPacket B_0;
+          RhsPacket T0;
+
+          traits.loadLhs(&blA[0*LhsProgress], A0);
+          traits.loadLhs(&blA[1*LhsProgress], A1);
+          traits.loadRhs(&blB[0*RhsProgress], B_0);
+          traits.madd(A0,B_0,C0,T0);
+          traits.madd(A1,B_0,C4,B_0);
+
+          blB += RhsProgress;
+          blA += 2*LhsProgress;
+        }
+        ResPacket R0, R4;
+        ResPacket alphav = pset1<ResPacket>(alpha);
+
+        ResScalar* r0 = &res[(j2+0)*resStride + i];
+
+        R0 = ploadu<ResPacket>(r0);
+        R4 = ploadu<ResPacket>(r0+ResPacketSize);
+
+        traits.acc(C0, alphav, R0);
+        traits.acc(C4, alphav, R4);
+
+        pstoreu(r0,               R0);
+        pstoreu(r0+ResPacketSize, R4);
+      }
+      if(rows-peeled_mc>=LhsProgress)
+      {
+        Index i = peeled_mc;
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
+        prefetch(&blA[0]);
+
+        AccPacket C0;
+        traits.initAcc(C0);
+
+        const RhsScalar* blB = unpackedB;
+        for(Index k=0; k<depth; k++)
+        {
+          LhsPacket A0;
+          RhsPacket B_0;
+          traits.loadLhs(blA, A0);
+          traits.loadRhs(blB, B_0);
+          traits.madd(A0, B_0, C0, B_0);
+          blB += RhsProgress;
+          blA += LhsProgress;
+        }
+
+        ResPacket alphav = pset1<ResPacket>(alpha);
+        ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
+        traits.acc(C0, alphav, R0);
+        pstoreu(&res[(j2+0)*resStride + i], R0);
+      }
+      for(Index i=peeled_mc2; i<rows; i++)
+      {
+        const LhsScalar* blA = &blockA[i*strideA+offsetA];
+        prefetch(&blA[0]);
+
+        // gets a 1 x 1 res block as registers
+        ResScalar C0(0);
+        // FIXME directly use blockB ??
+        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+        for(Index k=0; k<depth; k++)
+        {
+          LhsScalar A0 = blA[k];
+          RhsScalar B_0 = blB[k];
+          MADD(cj, A0, B_0, C0, B_0);
+        }
+        res[(j2+0)*resStride + i] += alpha*C0;
+      }
+    }
+  }
+
+
+#undef CJMADD
+
+// pack a block of the lhs
+// The traversal is as follow (mr==4):
+//   0  4  8 12 ...
+//   1  5  9 13 ...
+//   2  6 10 14 ...
+//   3  7 11 15 ...
+//
+//  16 20 24 28 ...
+//  17 21 25 29 ...
+//  18 22 26 30 ...
+//  19 23 27 31 ...
+//
+//  32 33 34 35 ...
+//  36 36 38 39 ...
+template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs
+{
+  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder, Conjugate, PanelMode>
+  ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+  enum { PacketSize = packet_traits<Scalar>::size };
+
+  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
+  EIGEN_UNUSED_VARIABLE(stride)
+  EIGEN_UNUSED_VARIABLE(offset)
+  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+  eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) );
+  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+  const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
+  Index count = 0;
+  Index peeled_mc = (rows/Pack1)*Pack1;
+  for(Index i=0; i<peeled_mc; i+=Pack1)
+  {
+    if(PanelMode) count += Pack1 * offset;
+
+    if(StorageOrder==ColMajor)
+    {
+      for(Index k=0; k<depth; k++)
+      {
+        Packet A, B, C, D;
+        if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
+        if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
+        if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
+        if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
+        if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; }
+        if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; }
+        if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; }
+        if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; }
+      }
+    }
+    else
+    {
+      for(Index k=0; k<depth; k++)
+      {
+        // TODO add a vectorized transpose here
+        Index w=0;
+        for(; w<Pack1-3; w+=4)
+        {
+          Scalar a(cj(lhs(i+w+0, k))),
+                  b(cj(lhs(i+w+1, k))),
+                  c(cj(lhs(i+w+2, k))),
+                  d(cj(lhs(i+w+3, k)));
+          blockA[count++] = a;
+          blockA[count++] = b;
+          blockA[count++] = c;
+          blockA[count++] = d;
+        }
+        if(Pack1%4)
+          for(;w<Pack1;++w)
+            blockA[count++] = cj(lhs(i+w, k));
+      }
+    }
+    if(PanelMode) count += Pack1 * (stride-offset-depth);
+  }
+  if(rows-peeled_mc>=Pack2)
+  {
+    if(PanelMode) count += Pack2*offset;
+    for(Index k=0; k<depth; k++)
+      for(Index w=0; w<Pack2; w++)
+        blockA[count++] = cj(lhs(peeled_mc+w, k));
+    if(PanelMode) count += Pack2 * (stride-offset-depth);
+    peeled_mc += Pack2;
+  }
+  for(Index i=peeled_mc; i<rows; i++)
+  {
+    if(PanelMode) count += offset;
+    for(Index k=0; k<depth; k++)
+      blockA[count++] = cj(lhs(i, k));
+    if(PanelMode) count += (stride-offset-depth);
+  }
+}
+
+// copy a complete panel of the rhs
+// this version is optimized for column major matrices
+// The traversal order is as follow: (nr==4):
+//  0  1  2  3   12 13 14 15   24 27
+//  4  5  6  7   16 17 18 19   25 28
+//  8  9 10 11   20 21 22 23   26 29
+//  .  .  .  .    .  .  .  .    .  .
+template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+  enum { PacketSize = packet_traits<Scalar>::size };
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
+{
+  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
+  EIGEN_UNUSED_VARIABLE(stride)
+  EIGEN_UNUSED_VARIABLE(offset)
+  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+  Index packet_cols = (cols/nr) * nr;
+  Index count = 0;
+  for(Index j2=0; j2<packet_cols; j2+=nr)
+  {
+    // skip what we have before
+    if(PanelMode) count += nr * offset;
+    const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+    const Scalar* b1 = &rhs[(j2+1)*rhsStride];
+    const Scalar* b2 = &rhs[(j2+2)*rhsStride];
+    const Scalar* b3 = &rhs[(j2+3)*rhsStride];
+    for(Index k=0; k<depth; k++)
+    {
+                blockB[count+0] = cj(b0[k]);
+                blockB[count+1] = cj(b1[k]);
+      if(nr==4) blockB[count+2] = cj(b2[k]);
+      if(nr==4) blockB[count+3] = cj(b3[k]);
+      count += nr;
+    }
+    // skip what we have after
+    if(PanelMode) count += nr * (stride-offset-depth);
+  }
+
+  // copy the remaining columns one at a time (nr==1)
+  for(Index j2=packet_cols; j2<cols; ++j2)
+  {
+    if(PanelMode) count += offset;
+    const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+    for(Index k=0; k<depth; k++)
+    {
+      blockB[count] = cj(b0[k]);
+      count += 1;
+    }
+    if(PanelMode) count += (stride-offset-depth);
+  }
+}
+
+// this version is optimized for row major matrices
+template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
+{
+  enum { PacketSize = packet_traits<Scalar>::size };
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
+{
+  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
+  EIGEN_UNUSED_VARIABLE(stride)
+  EIGEN_UNUSED_VARIABLE(offset)
+  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+  Index packet_cols = (cols/nr) * nr;
+  Index count = 0;
+  for(Index j2=0; j2<packet_cols; j2+=nr)
+  {
+    // skip what we have before
+    if(PanelMode) count += nr * offset;
+    for(Index k=0; k<depth; k++)
+    {
+      const Scalar* b0 = &rhs[k*rhsStride + j2];
+                blockB[count+0] = cj(b0[0]);
+                blockB[count+1] = cj(b0[1]);
+      if(nr==4) blockB[count+2] = cj(b0[2]);
+      if(nr==4) blockB[count+3] = cj(b0[3]);
+      count += nr;
+    }
+    // skip what we have after
+    if(PanelMode) count += nr * (stride-offset-depth);
+  }
+  // copy the remaining columns one at a time (nr==1)
+  for(Index j2=packet_cols; j2<cols; ++j2)
+  {
+    if(PanelMode) count += offset;
+    const Scalar* b0 = &rhs[j2];
+    for(Index k=0; k<depth; k++)
+    {
+      blockB[count] = cj(b0[k*rhsStride]);
+      count += 1;
+    }
+    if(PanelMode) count += stride-offset-depth;
+  }
+}
+
+} // end namespace internal
+
+/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+  * \sa setCpuCacheSize */
+inline std::ptrdiff_t l1CacheSize()
+{
+  std::ptrdiff_t l1, l2;
+  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  return l1;
+}
+
+/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+  * \sa setCpuCacheSize */
+inline std::ptrdiff_t l2CacheSize()
+{
+  std::ptrdiff_t l1, l2;
+  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  return l2;
+}
+
+/** Set the cpu L1 and L2 cache sizes (in bytes).
+  * These values are use to adjust the size of the blocks
+  * for the algorithms working per blocks.
+  *
+  * \sa computeProductBlockingSizes */
+inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
+{
+  internal::manage_caching_sizes(SetAction, &l1, &l2);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_BLOCK_PANEL_H
diff --git a/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h b/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h
new file mode 100644
index 0000000..3f5ffcf
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -0,0 +1,427 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename _LhsScalar, typename _RhsScalar> class level3_blocking;
+
+/* Specialization for a row-major destination matrix => simple transposition of the product */
+template<
+  typename Index,
+  typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
+struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
+{
+  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  static EIGEN_STRONG_INLINE void run(
+    Index rows, Index cols, Index depth,
+    const LhsScalar* lhs, Index lhsStride,
+    const RhsScalar* rhs, Index rhsStride,
+    ResScalar* res, Index resStride,
+    ResScalar alpha,
+    level3_blocking<RhsScalar,LhsScalar>& blocking,
+    GemmParallelInfo<Index>* info = 0)
+  {
+    // transpose the product such that the result is column major
+    general_matrix_matrix_product<Index,
+      RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
+      LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
+      ColMajor>
+    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info);
+  }
+};
+
+/*  Specialization for a col-major destination matrix
+ *    => Blocking algorithm following Goto's paper */
+template<
+  typename Index,
+  typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
+struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
+{
+
+typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+static void run(Index rows, Index cols, Index depth,
+  const LhsScalar* _lhs, Index lhsStride,
+  const RhsScalar* _rhs, Index rhsStride,
+  ResScalar* res, Index resStride,
+  ResScalar alpha,
+  level3_blocking<LhsScalar,RhsScalar>& blocking,
+  GemmParallelInfo<Index>* info = 0)
+{
+  const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
+  const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+
+  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+  Index kc = blocking.kc();                   // cache block size along the K direction
+  Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+  //Index nc = blocking.nc(); // cache block size along the N direction
+
+  gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+  gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
+  gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+
+#ifdef EIGEN_HAS_OPENMP
+  if(info)
+  {
+    // this is the parallel version!
+    Index tid = omp_get_thread_num();
+    Index threads = omp_get_num_threads();
+    
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0);
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0);
+    
+    RhsScalar* blockB = blocking.blockB();
+    eigen_internal_assert(blockB!=0);
+
+    // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
+    for(Index k=0; k<depth; k+=kc)
+    {
+      const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'
+
+      // In order to reduce the chance that a thread has to wait for the other,
+      // let's start by packing A'.
+      pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc);
+
+      // Pack B_k to B' in a parallel fashion:
+      // each thread packs the sub block B_k,j to B'_j where j is the thread id.
+
+      // However, before copying to B'_j, we have to make sure that no other thread is still using it,
+      // i.e., we test that info[tid].users equals 0.
+      // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
+      while(info[tid].users!=0) {}
+      info[tid].users += threads;
+
+      pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length);
+
+      // Notify the other threads that the part B'_j is ready to go.
+      info[tid].sync = k;
+
+      // Computes C_i += A' * B' per B'_j
+      for(Index shift=0; shift<threads; ++shift)
+      {
+        Index j = (tid+shift)%threads;
+
+        // At this point we have to make sure that B'_j has been updated by the thread j,
+        // we use testAndSetOrdered to mimic a volatile access.
+        // However, no need to wait for the B' part which has been updated by the current thread!
+        if(shift>0)
+          while(info[j].sync!=k) {}
+
+        gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w);
+      }
+
+      // Then keep going as usual with the remaining A'
+      for(Index i=mc; i<rows; i+=mc)
+      {
+        const Index actual_mc = (std::min)(i+mc,rows)-i;
+
+        // pack A_i,k to A'
+        pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
+
+        // C_i += A' * B'
+        gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w);
+      }
+
+      // Release all the sub blocks B'_j of B' for the current thread,
+      // i.e., we simply decrement the number of users by 1
+      for(Index j=0; j<threads; ++j)
+        #pragma omp atomic
+        --(info[j].users);
+    }
+  }
+  else
+#endif // EIGEN_HAS_OPENMP
+  {
+    EIGEN_UNUSED_VARIABLE(info);
+
+    // this is the sequential version!
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*cols;
+    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+
+    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW());
+
+    // For each horizontal panel of the rhs, and corresponding panel of the lhs...
+    // (==GEMM_VAR1)
+    for(Index k2=0; k2<depth; k2+=kc)
+    {
+      const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+      // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+      // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
+      // Note that this panel will be read as many times as the number of blocks in the lhs's
+      // vertical panel which is, in practice, a very low number.
+      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
+
+      // For each mc x kc block of the lhs's vertical panel...
+      // (==GEPP_VAR1)
+      for(Index i2=0; i2<rows; i2+=mc)
+      {
+        const Index actual_mc = (std::min)(i2+mc,rows)-i2;
+
+        // We pack the lhs's block into a sequential chunk of memory (L1 caching)
+        // Note that this block will be read a very high number of times, which is equal to the number of
+        // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
+        pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
+
+        // Everything is packed, we can now call the block * panel kernel:
+        gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
+      }
+    }
+  }
+}
+
+};
+
+/*********************************************************************************
+*  Specialization of GeneralProduct<> for "large" GEMM, i.e.,
+*  implementation of the high level wrapper to general_matrix_matrix_product
+**********************************************************************************/
+
+template<typename Lhs, typename Rhs>
+struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
+ : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
+{};
+
+template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
+struct gemm_functor
+{
+  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha,
+                  BlockingType& blocking)
+    : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
+  {}
+
+  void initParallelSession() const
+  {
+    m_blocking.allocateB();
+  }
+
+  void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
+  {
+    if(cols==-1)
+      cols = m_rhs.cols();
+
+    Gemm::run(rows, cols, m_lhs.cols(),
+              /*(const Scalar*)*/&m_lhs.coeffRef(row,0), m_lhs.outerStride(),
+              /*(const Scalar*)*/&m_rhs.coeffRef(0,col), m_rhs.outerStride(),
+              (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
+              m_actualAlpha, m_blocking, info);
+  }
+
+  protected:
+    const Lhs& m_lhs;
+    const Rhs& m_rhs;
+    Dest& m_dest;
+    Scalar m_actualAlpha;
+    BlockingType& m_blocking;
+};
+
+template<int StorageOrder, typename LhsScalar, typename RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor=1,
+bool FiniteAtCompileTime = MaxRows!=Dynamic && MaxCols!=Dynamic && MaxDepth != Dynamic> class gemm_blocking_space;
+
+template<typename _LhsScalar, typename _RhsScalar>
+class level3_blocking
+{
+    typedef _LhsScalar LhsScalar;
+    typedef _RhsScalar RhsScalar;
+
+  protected:
+    LhsScalar* m_blockA;
+    RhsScalar* m_blockB;
+    RhsScalar* m_blockW;
+
+    DenseIndex m_mc;
+    DenseIndex m_nc;
+    DenseIndex m_kc;
+
+  public:
+
+    level3_blocking()
+      : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0)
+    {}
+
+    inline DenseIndex mc() const { return m_mc; }
+    inline DenseIndex nc() const { return m_nc; }
+    inline DenseIndex kc() const { return m_kc; }
+
+    inline LhsScalar* blockA() { return m_blockA; }
+    inline RhsScalar* blockB() { return m_blockB; }
+    inline RhsScalar* blockW() { return m_blockW; }
+};
+
+template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true>
+  : public level3_blocking<
+      typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
+      typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
+{
+    enum {
+      Transpose = StorageOrder==RowMajor,
+      ActualRows = Transpose ? MaxCols : MaxRows,
+      ActualCols = Transpose ? MaxRows : MaxCols
+    };
+    typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar;
+    typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
+    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+    enum {
+      SizeA = ActualRows * MaxDepth,
+      SizeB = ActualCols * MaxDepth,
+      SizeW = MaxDepth * Traits::WorkSpaceFactor
+    };
+
+    EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
+    EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
+    EIGEN_ALIGN16 RhsScalar m_staticW[SizeW];
+
+  public:
+
+    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/)
+    {
+      this->m_mc = ActualRows;
+      this->m_nc = ActualCols;
+      this->m_kc = MaxDepth;
+      this->m_blockA = m_staticA;
+      this->m_blockB = m_staticB;
+      this->m_blockW = m_staticW;
+    }
+
+    inline void allocateA() {}
+    inline void allocateB() {}
+    inline void allocateW() {}
+    inline void allocateAll() {}
+};
+
+template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, false>
+  : public level3_blocking<
+      typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
+      typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
+{
+    enum {
+      Transpose = StorageOrder==RowMajor
+    };
+    typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar;
+    typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
+    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+    DenseIndex m_sizeA;
+    DenseIndex m_sizeB;
+    DenseIndex m_sizeW;
+
+  public:
+
+    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth)
+    {
+      this->m_mc = Transpose ? cols : rows;
+      this->m_nc = Transpose ? rows : cols;
+      this->m_kc = depth;
+
+      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc);
+      m_sizeA = this->m_mc * this->m_kc;
+      m_sizeB = this->m_kc * this->m_nc;
+      m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
+    }
+
+    void allocateA()
+    {
+      if(this->m_blockA==0)
+        this->m_blockA = aligned_new<LhsScalar>(m_sizeA);
+    }
+
+    void allocateB()
+    {
+      if(this->m_blockB==0)
+        this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
+    }
+
+    void allocateW()
+    {
+      if(this->m_blockW==0)
+        this->m_blockW = aligned_new<RhsScalar>(m_sizeW);
+    }
+
+    void allocateAll()
+    {
+      allocateA();
+      allocateB();
+      allocateW();
+    }
+
+    ~gemm_blocking_space()
+    {
+      aligned_delete(this->m_blockA, m_sizeA);
+      aligned_delete(this->m_blockB, m_sizeB);
+      aligned_delete(this->m_blockW, m_sizeW);
+    }
+};
+
+} // end namespace internal
+
+template<typename Lhs, typename Rhs>
+class GeneralProduct<Lhs, Rhs, GemmProduct>
+  : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
+{
+    enum {
+      MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
+    };
+  public:
+    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
+    
+    typedef typename  Lhs::Scalar LhsScalar;
+    typedef typename  Rhs::Scalar RhsScalar;
+    typedef           Scalar      ResScalar;
+
+    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+    {
+      typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp;
+      EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
+    }
+
+    template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+    {
+      eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+
+      typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
+      typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+
+      Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
+                                 * RhsBlasTraits::extractScalarFactor(m_rhs);
+
+      typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
+              Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
+
+      typedef internal::gemm_functor<
+        Scalar, Index,
+        internal::general_matrix_matrix_product<
+          Index,
+          LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
+          RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
+          (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
+        _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
+
+      BlockingType blocking(dst.rows(), dst.cols(), lhs.cols());
+
+      internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
+    }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
diff --git a/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
new file mode 100644
index 0000000..5c37639
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -0,0 +1,278 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
+
+namespace Eigen { 
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update;
+
+namespace internal {
+
+/**********************************************************************
+* This file implements a general A * B product while
+* evaluating only one triangular part of the product.
+* This is more general version of self adjoint product (C += A A^T)
+* as the level 3 SYRK Blas routine.
+**********************************************************************/
+
+// forward declarations (defined at the end of this file)
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
+struct tribb_kernel;
+  
+/* Optimized matrix-matrix product evaluating only one triangular half */
+template <typename Index,
+          typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+                              int ResStorageOrder, int  UpLo, int Version = Specialized>
+struct general_matrix_matrix_triangular_product;
+
+// as usual if the result is row major => we transpose the product
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+                          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int  UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version>
+{
+  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
+                                      const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
+  {
+    general_matrix_matrix_triangular_product<Index,
+        RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
+        LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
+        ColMajor, UpLo==Lower?Upper:Lower>
+      ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
+  }
+};
+
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+                          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int  UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version>
+{
+  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
+                                      const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
+  {
+    const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
+    const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+
+    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+    Index kc = depth; // cache block size along the K direction
+    Index mc = size;  // cache block size along the M direction
+    Index nc = size;  // cache block size along the N direction
+    computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc);
+    // !!! mc must be a multiple of nr:
+    if(mc > Traits::nr)
+      mc = (mc/Traits::nr)*Traits::nr;
+
+    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+    std::size_t sizeB = sizeW + kc*size;
+    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0);
+    RhsScalar* blockB = allocatedBlockB + sizeW;
+    
+    gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
+    gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+    tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
+
+    for(Index k2=0; k2<depth; k2+=kc)
+    {
+      const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+      // note that the actual rhs is the transpose/adjoint of mat
+      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size);
+
+      for(Index i2=0; i2<size; i2+=mc)
+      {
+        const Index actual_mc = (std::min)(i2+mc,size)-i2;
+
+        pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+
+        // the selected actual_mc * size panel of res is split into three different part:
+        //  1 - before the diagonal => processed with gebp or skipped
+        //  2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
+        //  3 - after the diagonal => processed with gebp or skipped
+        if (UpLo==Lower)
+          gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
+               -1, -1, 0, 0, allocatedBlockB);
+
+        sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB);
+
+        if (UpLo==Upper)
+        {
+          Index j2 = i2+actual_mc;
+          gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
+               -1, -1, 0, 0, allocatedBlockB);
+        }
+      }
+    }
+  }
+};
+
+// Optimized packed Block * packed Block product kernel evaluating only one given triangular part
+// This kernel is built on top of the gebp kernel:
+// - the current destination block is processed per panel of actual_mc x BlockSize
+//   where BlockSize is set to the minimal value allowing gebp to be as fast as possible
+// - then, as usual, each panel is split into three parts along the diagonal,
+//   the sub blocks above and below the diagonal are processed as usual,
+//   while the triangular block overlapping the diagonal is evaluated into a
+//   small temporary buffer which is then accumulated into the result using a
+//   triangular traversal.
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
+struct tribb_kernel
+{
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
+  typedef typename Traits::ResScalar ResScalar;
+  
+  enum {
+    BlockSize  = EIGEN_PLAIN_ENUM_MAX(mr,nr)
+  };
+  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace)
+  {
+    gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
+    Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
+
+    // let's process the block per panel of actual_mc x BlockSize,
+    // again, each is split into three parts, etc.
+    for (Index j=0; j<size; j+=BlockSize)
+    {
+      Index actualBlockSize = std::min<Index>(BlockSize,size - j);
+      const RhsScalar* actual_b = blockB+j*depth;
+
+      if(UpLo==Upper)
+        gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
+                    -1, -1, 0, 0, workspace);
+
+      // selfadjoint micro block
+      {
+        Index i = j;
+        buffer.setZero();
+        // 1 - apply the kernel on the temporary buffer
+        gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
+                    -1, -1, 0, 0, workspace);
+        // 2 - triangular accumulation
+        for(Index j1=0; j1<actualBlockSize; ++j1)
+        {
+          ResScalar* r = res + (j+j1)*resStride + i;
+          for(Index i1=UpLo==Lower ? j1 : 0;
+              UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
+            r[i1] += buffer(i1,j1);
+        }
+      }
+
+      if(UpLo==Lower)
+      {
+        Index i = j+actualBlockSize;
+        gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
+                    -1, -1, 0, 0, workspace);
+      }
+    }
+  }
+};
+
+} // end namespace internal
+
+// high level API
+
+template<typename MatrixType, typename ProductType, int UpLo, bool IsOuterProduct>
+struct general_product_to_triangular_selector;
+
+
+template<typename MatrixType, typename ProductType, int UpLo>
+struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
+{
+  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
+  {
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::Index Index;
+    
+    typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
+    typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
+    typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+    
+    typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
+    typedef typename internal::remove_all<ActualRhs>::type _ActualRhs;
+    typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
+
+    enum {
+      StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+      UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1,
+      UseRhsDirectly = _ActualRhs::InnerStrideAtCompileTime==1
+    };
+    
+    internal::gemv_static_vector_if<Scalar,Lhs::SizeAtCompileTime,Lhs::MaxSizeAtCompileTime,!UseLhsDirectly> static_lhs;
+    ei_declare_aligned_stack_constructed_variable(Scalar, actualLhsPtr, actualLhs.size(),
+      (UseLhsDirectly ? const_cast<Scalar*>(actualLhs.data()) : static_lhs.data()));
+    if(!UseLhsDirectly) Map<typename _ActualLhs::PlainObject>(actualLhsPtr, actualLhs.size()) = actualLhs;
+    
+    internal::gemv_static_vector_if<Scalar,Rhs::SizeAtCompileTime,Rhs::MaxSizeAtCompileTime,!UseRhsDirectly> static_rhs;
+    ei_declare_aligned_stack_constructed_variable(Scalar, actualRhsPtr, actualRhs.size(),
+      (UseRhsDirectly ? const_cast<Scalar*>(actualRhs.data()) : static_rhs.data()));
+    if(!UseRhsDirectly) Map<typename _ActualRhs::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+    
+    
+    selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
+                              LhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
+                              RhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex>
+          ::run(actualLhs.size(), mat.data(), mat.outerStride(), actualLhsPtr, actualRhsPtr, actualAlpha);
+  }
+};
+
+template<typename MatrixType, typename ProductType, int UpLo>
+struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
+{
+  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
+  {
+    typedef typename MatrixType::Index Index;
+    
+    typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
+    typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
+    typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+    
+    typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
+    typedef typename internal::remove_all<ActualRhs>::type _ActualRhs;
+    typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+    typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
+
+    internal::general_matrix_matrix_triangular_product<Index,
+      typename Lhs::Scalar, _ActualLhs::Flags&RowMajorBit ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+      typename Rhs::Scalar, _ActualRhs::Flags&RowMajorBit ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+      MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
+      ::run(mat.cols(), actualLhs.cols(),
+            &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(),
+            mat.data(), mat.outerStride(), actualAlpha);
+  }
+};
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename ProductDerived, typename _Lhs, typename _Rhs>
+TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
+{
+  general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha);
+  
+  return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
diff --git a/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h b/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
new file mode 100644
index 0000000..3deed06
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
@@ -0,0 +1,146 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *   Level 3 BLAS SYRK/HERK implementation.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int  UpLo>
+struct general_matrix_matrix_rankupdate :
+       general_matrix_matrix_triangular_product<
+         Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {};
+
+
+// try to go to BLAS specialization
+#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \
+template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
+                          int RhsStorageOrder, bool ConjugateRhs, int  UpLo> \
+struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
+               Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
+                          const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \
+  { \
+    if (lhs==rhs) { \
+      general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
+      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
+    } else { \
+      general_matrix_matrix_triangular_product<Index, \
+        Scalar, LhsStorageOrder, ConjugateLhs, \
+        Scalar, RhsStorageOrder, ConjugateRhs, \
+        ColMajor, UpLo, BuiltIn> \
+      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
+    } \
+  } \
+};
+
+EIGEN_MKL_RANKUPDATE_SPECIALIZE(double)
+//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex)
+EIGEN_MKL_RANKUPDATE_SPECIALIZE(float)
+//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex)
+
+// SYRK for float/double
+#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \
+template <typename Index, int AStorageOrder, bool ConjugateA, int  UpLo> \
+struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
+  enum { \
+    IsLower = (UpLo&Lower) == Lower, \
+    LowUp = IsLower ? Lower : Upper, \
+    conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \
+  }; \
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
+                          const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
+  { \
+  /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
+\
+   MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
+   char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \
+   MKLTYPE alpha_, beta_; \
+\
+/* Set alpha_ & beta_ */ \
+   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+   MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \
+  } \
+};
+
+// HERK for complex data
+#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \
+template <typename Index, int AStorageOrder, bool ConjugateA, int  UpLo> \
+struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
+  enum { \
+    IsLower = (UpLo&Lower) == Lower, \
+    LowUp = IsLower ? Lower : Upper, \
+    conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \
+  }; \
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
+                          const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
+  { \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
+\
+   MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
+   char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \
+   RTYPE alpha_, beta_; \
+   const EIGTYPE* a_ptr; \
+\
+/* Set alpha_ & beta_ */ \
+/*   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); */\
+/*   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1));*/ \
+   alpha_ = alpha.real(); \
+   beta_ = 1.0; \
+/* Copy with conjugation in some cases*/ \
+   MatrixType a; \
+   if (conjA) { \
+     Map<const MatrixType, 0, OuterStride<> > mapA(lhs,n,k,OuterStride<>(lhsStride)); \
+     a = mapA.conjugate(); \
+     lda = a.outerStride(); \
+     a_ptr = a.data(); \
+   } else a_ptr=lhs; \
+   MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (MKLTYPE*)a_ptr, &lda, &beta_, (MKLTYPE*)res, &ldc); \
+  } \
+};
+
+
+EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk)
+EIGEN_MKL_RANKUPDATE_R(float,  float,  ssyrk)
+
+//EIGEN_MKL_RANKUPDATE_C(dcomplex, MKL_Complex16, double, zherk)
+//EIGEN_MKL_RANKUPDATE_C(scomplex, MKL_Complex8,  double, cherk)
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
diff --git a/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h b/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
new file mode 100644
index 0000000..060af32
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
@@ -0,0 +1,118 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *   General matrix-matrix product functionality based on ?GEMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**********************************************************************
+* This file implements general matrix-matrix multiplication using BLAS
+* gemm function via partial specialization of
+* general_matrix_matrix_product::run(..) method for float, double,
+* std::complex<float> and std::complex<double> types
+**********************************************************************/
+
+// gemm specialization
+
+#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, MKLTYPE, MKLPREFIX) \
+template< \
+  typename Index, \
+  int LhsStorageOrder, bool ConjugateLhs, \
+  int RhsStorageOrder, bool ConjugateRhs> \
+struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+static void run(Index rows, Index cols, Index depth, \
+  const EIGTYPE* _lhs, Index lhsStride, \
+  const EIGTYPE* _rhs, Index rhsStride, \
+  EIGTYPE* res, Index resStride, \
+  EIGTYPE alpha, \
+  level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
+  GemmParallelInfo<Index>* /*info = 0*/) \
+{ \
+  using std::conj; \
+\
+  char transa, transb; \
+  MKL_INT m, n, k, lda, ldb, ldc; \
+  const EIGTYPE *a, *b; \
+  MKLTYPE alpha_, beta_; \
+  MatrixX##EIGPREFIX a_tmp, b_tmp; \
+  EIGTYPE myone(1);\
+\
+/* Set transpose options */ \
+  transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
+  transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set m, n, k */ \
+  m = (MKL_INT)rows;  \
+  n = (MKL_INT)cols;  \
+  k = (MKL_INT)depth; \
+\
+/* Set alpha_ & beta_ */ \
+  assign_scalar_eig2mkl(alpha_, alpha); \
+  assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+  lda = (MKL_INT)lhsStride; \
+  ldb = (MKL_INT)rhsStride; \
+  ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+  if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
+    Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
+    a_tmp = lhs.conjugate(); \
+    a = a_tmp.data(); \
+    lda = a_tmp.outerStride(); \
+  } else a = _lhs; \
+\
+  if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
+    Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
+    b_tmp = rhs.conjugate(); \
+    b = b_tmp.data(); \
+    ldb = b_tmp.outerStride(); \
+  } else b = _rhs; \
+\
+  MKLPREFIX##gemm(&transa, &transb, &m, &n, &k, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+}};
+
+GEMM_SPECIALIZATION(double,   d,  double,        d)
+GEMM_SPECIALIZATION(float,    f,  float,         s)
+GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, z)
+GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8,  c)
+
+} // end namespase internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
diff --git a/include/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h b/include/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h
new file mode 100644
index 0000000..0938770
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -0,0 +1,566 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/* Optimized col-major matrix * vector product:
+ * This algorithm processes 4 columns at onces that allows to both reduce
+ * the number of load/stores of the result by a factor 4 and to reduce
+ * the instruction dependency. Moreover, we know that all bands have the
+ * same alignment pattern.
+ *
+ * Mixing type logic: C += alpha * A * B
+ *  |  A  |  B  |alpha| comments
+ *  |real |cplx |cplx | no vectorization
+ *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
+ *  |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
+ *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
+ */
+template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
+{
+typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+enum {
+  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
+              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
+  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
+};
+
+typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
+typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
+typedef typename packet_traits<ResScalar>::type  _ResPacket;
+
+typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+EIGEN_DONT_INLINE static void run(
+  Index rows, Index cols,
+  const LhsScalar* lhs, Index lhsStride,
+  const RhsScalar* rhs, Index rhsIncr,
+  ResScalar* res, Index resIncr, RhsScalar alpha);
+};
+
+template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
+  Index rows, Index cols,
+  const LhsScalar* lhs, Index lhsStride,
+  const RhsScalar* rhs, Index rhsIncr,
+  ResScalar* res, Index resIncr, RhsScalar alpha)
+{
+  EIGEN_UNUSED_VARIABLE(resIncr)
+  eigen_internal_assert(resIncr==1);
+  #ifdef _EIGEN_ACCUMULATE_PACKETS
+  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
+  #endif
+  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
+    pstore(&res[j], \
+      padd(pload<ResPacket>(&res[j]), \
+        padd( \
+          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),    ptmp0), \
+                  pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]),   ptmp1)), \
+          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),    ptmp2), \
+                  pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]),   ptmp3)) )))
+
+  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
+  if(ConjugateRhs)
+    alpha = numext::conj(alpha);
+
+  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
+  const Index columnsAtOnce = 4;
+  const Index peels = 2;
+  const Index LhsPacketAlignedMask = LhsPacketSize-1;
+  const Index ResPacketAlignedMask = ResPacketSize-1;
+//  const Index PeelAlignedMask = ResPacketSize*peels-1;
+  const Index size = rows;
+  
+  // How many coeffs of the result do we have to skip to be aligned.
+  // Here we assume data are at least aligned on the base scalar type.
+  Index alignedStart = internal::first_aligned(res,size);
+  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
+  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
+
+  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
+  Index alignmentPattern = alignmentStep==0 ? AllAligned
+                       : alignmentStep==(LhsPacketSize/2) ? EvenAligned
+                       : FirstAligned;
+
+  // we cannot assume the first element is aligned because of sub-matrices
+  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
+
+  // find how many columns do we have to skip to be aligned with the result (if possible)
+  Index skipColumns = 0;
+  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
+  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
+  {
+    alignedSize = 0;
+    alignedStart = 0;
+  }
+  else if (LhsPacketSize>1)
+  {
+    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
+
+    while (skipColumns<LhsPacketSize &&
+          alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
+      ++skipColumns;
+    if (skipColumns==LhsPacketSize)
+    {
+      // nothing can be aligned, no need to skip any column
+      alignmentPattern = NoneAligned;
+      skipColumns = 0;
+    }
+    else
+    {
+      skipColumns = (std::min)(skipColumns,cols);
+      // note that the skiped columns are processed later.
+    }
+
+    eigen_internal_assert(  (alignmentPattern==NoneAligned)
+                      || (skipColumns + columnsAtOnce >= cols)
+                      || LhsPacketSize > size
+                      || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
+  }
+  else if(Vectorizable)
+  {
+    alignedStart = 0;
+    alignedSize = size;
+    alignmentPattern = AllAligned;
+  }
+
+  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
+  Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+
+  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
+  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
+  {
+    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
+              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
+              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
+              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
+
+    // this helps a lot generating better binary code
+    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
+                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
+
+    if (Vectorizable)
+    {
+      /* explicit vectorization */
+      // process initial unaligned coeffs
+      for (Index j=0; j<alignedStart; ++j)
+      {
+        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
+        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
+        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
+        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
+      }
+
+      if (alignedSize>alignedStart)
+      {
+        switch(alignmentPattern)
+        {
+          case AllAligned:
+            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
+            break;
+          case EvenAligned:
+            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
+            break;
+          case FirstAligned:
+          {
+            Index j = alignedStart;
+            if(peels>1)
+            {
+              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
+              ResPacket T0, T1;
+
+              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
+              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
+              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
+
+              for (; j<peeledSize; j+=peels*ResPacketSize)
+              {
+                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
+                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
+                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);
+
+                A00 = pload<LhsPacket>(&lhs0[j]);
+                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
+                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
+                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
+
+                T0  = pcj.pmadd(A01, ptmp1, T0);
+                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
+                T0  = pcj.pmadd(A02, ptmp2, T0);
+                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
+                T0  = pcj.pmadd(A03, ptmp3, T0);
+                pstore(&res[j],T0);
+                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
+                T1  = pcj.pmadd(A11, ptmp1, T1);
+                T1  = pcj.pmadd(A12, ptmp2, T1);
+                T1  = pcj.pmadd(A13, ptmp3, T1);
+                pstore(&res[j+ResPacketSize],T1);
+              }
+            }
+            for (; j<alignedSize; j+=ResPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
+            break;
+          }
+          default:
+            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
+            break;
+        }
+      }
+    } // end explicit vectorization
+
+    /* process remaining coeffs (or all if there is no explicit vectorization) */
+    for (Index j=alignedSize; j<size; ++j)
+    {
+      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
+      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
+      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
+      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
+    }
+  }
+
+  // process remaining first and last columns (at most columnsAtOnce-1)
+  Index end = cols;
+  Index start = columnBound;
+  do
+  {
+    for (Index k=start; k<end; ++k)
+    {
+      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
+      const LhsScalar* lhs0 = lhs + k*lhsStride;
+
+      if (Vectorizable)
+      {
+        /* explicit vectorization */
+        // process first unaligned result's coeffs
+        for (Index j=0; j<alignedStart; ++j)
+          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
+        // process aligned result's coeffs
+        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
+          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
+            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
+        else
+          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
+            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
+      }
+
+      // process remaining scalars (or all if no explicit vectorization)
+      for (Index i=alignedSize; i<size; ++i)
+        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
+    }
+    if (skipColumns)
+    {
+      start = 0;
+      end = skipColumns;
+      skipColumns = 0;
+    }
+    else
+      break;
+  } while(Vectorizable);
+  #undef _EIGEN_ACCUMULATE_PACKETS
+}
+
+/* Optimized row-major matrix * vector product:
+ * This algorithm processes 4 rows at onces that allows to both reduce
+ * the number of load/stores of the result by a factor 4 and to reduce
+ * the instruction dependency. Moreover, we know that all bands have the
+ * same alignment pattern.
+ *
+ * Mixing type logic:
+ *  - alpha is always a complex (or converted to a complex)
+ *  - no vectorization
+ */
+template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
+{
+typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+enum {
+  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
+              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
+  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
+};
+
+typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
+typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
+typedef typename packet_traits<ResScalar>::type  _ResPacket;
+
+typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  
+EIGEN_DONT_INLINE static void run(
+  Index rows, Index cols,
+  const LhsScalar* lhs, Index lhsStride,
+  const RhsScalar* rhs, Index rhsIncr,
+  ResScalar* res, Index resIncr,
+  ResScalar alpha);
+};
+
+template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
+  Index rows, Index cols,
+  const LhsScalar* lhs, Index lhsStride,
+  const RhsScalar* rhs, Index rhsIncr,
+  ResScalar* res, Index resIncr,
+  ResScalar alpha)
+{
+  EIGEN_UNUSED_VARIABLE(rhsIncr);
+  eigen_internal_assert(rhsIncr==1);
+  #ifdef _EIGEN_ACCUMULATE_PACKETS
+  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
+  #endif
+
+  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
+    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
+    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
+    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
+    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
+    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
+
+  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
+
+  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
+  const Index rowsAtOnce = 4;
+  const Index peels = 2;
+  const Index RhsPacketAlignedMask = RhsPacketSize-1;
+  const Index LhsPacketAlignedMask = LhsPacketSize-1;
+//   const Index PeelAlignedMask = RhsPacketSize*peels-1;
+  const Index depth = cols;
+
+  // How many coeffs of the result do we have to skip to be aligned.
+  // Here we assume data are at least aligned on the base scalar type
+  // if that's not the case then vectorization is discarded, see below.
+  Index alignedStart = internal::first_aligned(rhs, depth);
+  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
+  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
+
+  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
+  Index alignmentPattern = alignmentStep==0 ? AllAligned
+                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
+                         : FirstAligned;
+
+  // we cannot assume the first element is aligned because of sub-matrices
+  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
+
+  // find how many rows do we have to skip to be aligned with rhs (if possible)
+  Index skipRows = 0;
+  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
+  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
+  {
+    alignedSize = 0;
+    alignedStart = 0;
+  }
+  else if (LhsPacketSize>1)
+  {
+    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);
+
+    while (skipRows<LhsPacketSize &&
+           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
+      ++skipRows;
+    if (skipRows==LhsPacketSize)
+    {
+      // nothing can be aligned, no need to skip any column
+      alignmentPattern = NoneAligned;
+      skipRows = 0;
+    }
+    else
+    {
+      skipRows = (std::min)(skipRows,Index(rows));
+      // note that the skiped columns are processed later.
+    }
+    eigen_internal_assert(  alignmentPattern==NoneAligned
+                      || LhsPacketSize==1
+                      || (skipRows + rowsAtOnce >= rows)
+                      || LhsPacketSize > depth
+                      || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
+  }
+  else if(Vectorizable)
+  {
+    alignedStart = 0;
+    alignedSize = depth;
+    alignmentPattern = AllAligned;
+  }
+
+  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
+  Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+
+  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
+  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
+  {
+    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
+
+    // this helps the compiler generating good binary code
+    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
+                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
+
+    if (Vectorizable)
+    {
+      /* explicit vectorization */
+      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
+                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
+
+      // process initial unaligned coeffs
+      // FIXME this loop get vectorized by the compiler !
+      for (Index j=0; j<alignedStart; ++j)
+      {
+        RhsScalar b = rhs[j];
+        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
+        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
+      }
+
+      if (alignedSize>alignedStart)
+      {
+        switch(alignmentPattern)
+        {
+          case AllAligned:
+            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
+            break;
+          case EvenAligned:
+            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
+            break;
+          case FirstAligned:
+          {
+            Index j = alignedStart;
+            if (peels>1)
+            {
+              /* Here we proccess 4 rows with with two peeled iterations to hide
+               * the overhead of unaligned loads. Moreover unaligned loads are handled
+               * using special shift/move operations between the two aligned packets
+               * overlaping the desired unaligned packet. This is *much* more efficient
+               * than basic unaligned loads.
+               */
+              LhsPacket A01, A02, A03, A11, A12, A13;
+              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
+              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
+              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
+
+              for (; j<peeledSize; j+=peels*RhsPacketSize)
+              {
+                RhsPacket b = pload<RhsPacket>(&rhs[j]);
+                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
+                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
+                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);
+
+                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
+                ptmp1 = pcj.pmadd(A01, b, ptmp1);
+                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
+                ptmp2 = pcj.pmadd(A02, b, ptmp2);
+                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
+                ptmp3 = pcj.pmadd(A03, b, ptmp3);
+                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
+
+                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
+                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
+                ptmp1 = pcj.pmadd(A11, b, ptmp1);
+                ptmp2 = pcj.pmadd(A12, b, ptmp2);
+                ptmp3 = pcj.pmadd(A13, b, ptmp3);
+              }
+            }
+            for (; j<alignedSize; j+=RhsPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
+            break;
+          }
+          default:
+            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
+            break;
+        }
+        tmp0 += predux(ptmp0);
+        tmp1 += predux(ptmp1);
+        tmp2 += predux(ptmp2);
+        tmp3 += predux(ptmp3);
+      }
+    } // end explicit vectorization
+
+    // process remaining coeffs (or all if no explicit vectorization)
+    // FIXME this loop get vectorized by the compiler !
+    for (Index j=alignedSize; j<depth; ++j)
+    {
+      RhsScalar b = rhs[j];
+      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
+      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
+    }
+    res[i*resIncr]            += alpha*tmp0;
+    res[(i+offset1)*resIncr]  += alpha*tmp1;
+    res[(i+2)*resIncr]        += alpha*tmp2;
+    res[(i+offset3)*resIncr]  += alpha*tmp3;
+  }
+
+  // process remaining first and last rows (at most columnsAtOnce-1)
+  Index end = rows;
+  Index start = rowBound;
+  do
+  {
+    for (Index i=start; i<end; ++i)
+    {
+      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
+      const LhsScalar* lhs0 = lhs + i*lhsStride;
+      // process first unaligned result's coeffs
+      // FIXME this loop get vectorized by the compiler !
+      for (Index j=0; j<alignedStart; ++j)
+        tmp0 += cj.pmul(lhs0[j], rhs[j]);
+
+      if (alignedSize>alignedStart)
+      {
+        // process aligned rhs coeffs
+        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
+          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
+            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
+        else
+          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
+            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
+        tmp0 += predux(ptmp0);
+      }
+
+      // process remaining scalars
+      // FIXME this loop get vectorized by the compiler !
+      for (Index j=alignedSize; j<depth; ++j)
+        tmp0 += cj.pmul(lhs0[j], rhs[j]);
+      res[i*resIncr] += alpha*tmp0;
+    }
+    if (skipRows)
+    {
+      start = 0;
+      end = skipRows;
+      skipRows = 0;
+    }
+    else
+      break;
+  } while(Vectorizable);
+
+  #undef _EIGEN_ACCUMULATE_PACKETS
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
diff --git a/include/eigen3/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/include/eigen3/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
new file mode 100644
index 0000000..1cb9fe6
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
@@ -0,0 +1,131 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *   General matrix-vector product functionality based on ?GEMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**********************************************************************
+* This file implements general matrix-vector multiplication using BLAS
+* gemv function via partial specialization of
+* general_matrix_vector_product::run(..) method for float, double,
+* std::complex<float> and std::complex<double> types
+**********************************************************************/
+
+// gemv specialization
+
+template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
+struct general_matrix_vector_product_gemv :
+  general_matrix_vector_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,ConjugateRhs,BuiltIn> {};
+
+#define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \
+template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
+static void run( \
+  Index rows, Index cols, \
+  const Scalar* lhs, Index lhsStride, \
+  const Scalar* rhs, Index rhsIncr, \
+  Scalar* res, Index resIncr, Scalar alpha) \
+{ \
+  if (ConjugateLhs) { \
+    general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,BuiltIn>::run( \
+      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+  } else { \
+    general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
+      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+  } \
+} \
+}; \
+template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
+static void run( \
+  Index rows, Index cols, \
+  const Scalar* lhs, Index lhsStride, \
+  const Scalar* rhs, Index rhsIncr, \
+  Scalar* res, Index resIncr, Scalar alpha) \
+{ \
+    general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
+      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+} \
+}; \
+
+EIGEN_MKL_GEMV_SPECIALIZE(double)
+EIGEN_MKL_GEMV_SPECIALIZE(float)
+EIGEN_MKL_GEMV_SPECIALIZE(dcomplex)
+EIGEN_MKL_GEMV_SPECIALIZE(scomplex)
+
+#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLPREFIX) \
+template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
+{ \
+typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> GEMVVector;\
+\
+static void run( \
+  Index rows, Index cols, \
+  const EIGTYPE* lhs, Index lhsStride, \
+  const EIGTYPE* rhs, Index rhsIncr, \
+  EIGTYPE* res, Index resIncr, EIGTYPE alpha) \
+{ \
+  MKL_INT m=rows, n=cols, lda=lhsStride, incx=rhsIncr, incy=resIncr; \
+  MKLTYPE alpha_, beta_; \
+  const EIGTYPE *x_ptr, myone(1); \
+  char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \
+  if (LhsStorageOrder==RowMajor) { \
+    m=cols; \
+    n=rows; \
+  }\
+  assign_scalar_eig2mkl(alpha_, alpha); \
+  assign_scalar_eig2mkl(beta_, myone); \
+  GEMVVector x_tmp; \
+  if (ConjugateRhs) { \
+    Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \
+    x_tmp=map_x.conjugate(); \
+    x_ptr=x_tmp.data(); \
+    incx=1; \
+  } else x_ptr=rhs; \
+  MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
+}\
+};
+
+EIGEN_MKL_GEMV_SPECIALIZATION(double,   double,        d)
+EIGEN_MKL_GEMV_SPECIALIZATION(float,    float,         s)
+EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, z)
+EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, MKL_Complex8,  c)
+
+} // end namespase internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
diff --git a/include/eigen3/Eigen/src/Core/products/Parallelizer.h b/include/eigen3/Eigen/src/Core/products/Parallelizer.h
new file mode 100644
index 0000000..5c3e9b7
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/Parallelizer.h
@@ -0,0 +1,159 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARALLELIZER_H
+#define EIGEN_PARALLELIZER_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal */
+inline void manage_multi_threading(Action action, int* v)
+{
+  static EIGEN_UNUSED int m_maxThreads = -1;
+
+  if(action==SetAction)
+  {
+    eigen_internal_assert(v!=0);
+    m_maxThreads = *v;
+  }
+  else if(action==GetAction)
+  {
+    eigen_internal_assert(v!=0);
+    #ifdef EIGEN_HAS_OPENMP
+    if(m_maxThreads>0)
+      *v = m_maxThreads;
+    else
+      *v = omp_get_max_threads();
+    #else
+    *v = 1;
+    #endif
+  }
+  else
+  {
+    eigen_internal_assert(false);
+  }
+}
+
+}
+
+/** Must be call first when calling Eigen from multiple threads */
+inline void initParallel()
+{
+  int nbt;
+  internal::manage_multi_threading(GetAction, &nbt);
+  std::ptrdiff_t l1, l2;
+  internal::manage_caching_sizes(GetAction, &l1, &l2);
+}
+
+/** \returns the max number of threads reserved for Eigen
+  * \sa setNbThreads */
+inline int nbThreads()
+{
+  int ret;
+  internal::manage_multi_threading(GetAction, &ret);
+  return ret;
+}
+
+/** Sets the max number of threads reserved for Eigen
+  * \sa nbThreads */
+inline void setNbThreads(int v)
+{
+  internal::manage_multi_threading(SetAction, &v);
+}
+
+namespace internal {
+
+template<typename Index> struct GemmParallelInfo
+{
+  GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {}
+
+  int volatile sync;
+  int volatile users;
+
+  Index rhs_start;
+  Index rhs_length;
+};
+
+template<bool Condition, typename Functor, typename Index>
+void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
+{
+  // TODO when EIGEN_USE_BLAS is defined,
+  // we should still enable OMP for other scalar types
+#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS)
+  // FIXME the transpose variable is only needed to properly split
+  // the matrix product when multithreading is enabled. This is a temporary
+  // fix to support row-major destination matrices. This whole
+  // parallelizer mechanism has to be redisigned anyway.
+  EIGEN_UNUSED_VARIABLE(transpose);
+  func(0,rows, 0,cols);
+#else
+
+  // Dynamically check whether we should enable or disable OpenMP.
+  // The conditions are:
+  // - the max number of threads we can create is greater than 1
+  // - we are not already in a parallel code
+  // - the sizes are large enough
+
+  // 1- are we already in a parallel session?
+  // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
+  if((!Condition) || (omp_get_num_threads()>1))
+    return func(0,rows, 0,cols);
+
+  Index size = transpose ? cols : rows;
+
+  // 2- compute the maximal number of threads from the size of the product:
+  // FIXME this has to be fine tuned
+  Index max_threads = std::max<Index>(1,size / 32);
+
+  // 3 - compute the number of threads we are going to use
+  Index threads = std::min<Index>(nbThreads(), max_threads);
+
+  if(threads==1)
+    return func(0,rows, 0,cols);
+
+  Eigen::initParallel();
+  func.initParallelSession();
+
+  if(transpose)
+    std::swap(rows,cols);
+
+  Index blockCols = (cols / threads) & ~Index(0x3);
+  Index blockRows = (rows / threads) & ~Index(0x7);
+  
+  GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];
+
+  #pragma omp parallel for schedule(static,1) num_threads(threads)
+  for(Index i=0; i<threads; ++i)
+  {
+    Index r0 = i*blockRows;
+    Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
+
+    Index c0 = i*blockCols;
+    Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
+
+    info[i].rhs_start = c0;
+    info[i].rhs_length = actualBlockCols;
+
+    if(transpose)
+      func(0, cols, r0, actualBlockRows, info);
+    else
+      func(r0, actualBlockRows, 0,cols, info);
+  }
+
+  delete[] info;
+#endif
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARALLELIZER_H
diff --git a/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
new file mode 100644
index 0000000..99cf9e0
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -0,0 +1,436 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H
+
+namespace Eigen { 
+
+namespace internal {
+
+// pack a selfadjoint block diagonal for use with the gebp_kernel
+template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder>
+struct symm_pack_lhs
+{
+  template<int BlockRows> inline
+  void pack(Scalar* blockA, const const_blas_data_mapper<Scalar,Index,StorageOrder>& lhs, Index cols, Index i, Index& count)
+  {
+    // normal copy
+    for(Index k=0; k<i; k++)
+      for(Index w=0; w<BlockRows; w++)
+        blockA[count++] = lhs(i+w,k);           // normal
+    // symmetric copy
+    Index h = 0;
+    for(Index k=i; k<i+BlockRows; k++)
+    {
+      for(Index w=0; w<h; w++)
+        blockA[count++] = numext::conj(lhs(k, i+w)); // transposed
+
+      blockA[count++] = numext::real(lhs(k,k));   // real (diagonal)
+
+      for(Index w=h+1; w<BlockRows; w++)
+        blockA[count++] = lhs(i+w, k);          // normal
+      ++h;
+    }
+    // transposed copy
+    for(Index k=i+BlockRows; k<cols; k++)
+      for(Index w=0; w<BlockRows; w++)
+        blockA[count++] = numext::conj(lhs(k, i+w)); // transposed
+  }
+  void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
+  {
+    const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
+    Index count = 0;
+    Index peeled_mc = (rows/Pack1)*Pack1;
+    for(Index i=0; i<peeled_mc; i+=Pack1)
+    {
+      pack<Pack1>(blockA, lhs, cols, i, count);
+    }
+
+    if(rows-peeled_mc>=Pack2)
+    {
+      pack<Pack2>(blockA, lhs, cols, peeled_mc, count);
+      peeled_mc += Pack2;
+    }
+
+    // do the same with mr==1
+    for(Index i=peeled_mc; i<rows; i++)
+    {
+      for(Index k=0; k<i; k++)
+        blockA[count++] = lhs(i, k);              // normal
+
+      blockA[count++] = numext::real(lhs(i, i));       // real (diagonal)
+
+      for(Index k=i+1; k<cols; k++)
+        blockA[count++] = numext::conj(lhs(k, i));     // transposed
+    }
+  }
+};
+
+template<typename Scalar, typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs
+{
+  enum { PacketSize = packet_traits<Scalar>::size };
+  void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+  {
+    Index end_k = k2 + rows;
+    Index count = 0;
+    const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
+    Index packet_cols = (cols/nr)*nr;
+
+    // first part: normal case
+    for(Index j2=0; j2<k2; j2+=nr)
+    {
+      for(Index k=k2; k<end_k; k++)
+      {
+        blockB[count+0] = rhs(k,j2+0);
+        blockB[count+1] = rhs(k,j2+1);
+        if (nr==4)
+        {
+          blockB[count+2] = rhs(k,j2+2);
+          blockB[count+3] = rhs(k,j2+3);
+        }
+        count += nr;
+      }
+    }
+
+    // second part: diagonal block
+    for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr)
+    {
+      // again we can split vertically in three different parts (transpose, symmetric, normal)
+      // transpose
+      for(Index k=k2; k<j2; k++)
+      {
+        blockB[count+0] = numext::conj(rhs(j2+0,k));
+        blockB[count+1] = numext::conj(rhs(j2+1,k));
+        if (nr==4)
+        {
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+        }
+        count += nr;
+      }
+      // symmetric
+      Index h = 0;
+      for(Index k=j2; k<j2+nr; k++)
+      {
+        // normal
+        for (Index w=0 ; w<h; ++w)
+          blockB[count+w] = rhs(k,j2+w);
+
+        blockB[count+h] = numext::real(rhs(k,k));
+
+        // transpose
+        for (Index w=h+1 ; w<nr; ++w)
+          blockB[count+w] = numext::conj(rhs(j2+w,k));
+        count += nr;
+        ++h;
+      }
+      // normal
+      for(Index k=j2+nr; k<end_k; k++)
+      {
+        blockB[count+0] = rhs(k,j2+0);
+        blockB[count+1] = rhs(k,j2+1);
+        if (nr==4)
+        {
+          blockB[count+2] = rhs(k,j2+2);
+          blockB[count+3] = rhs(k,j2+3);
+        }
+        count += nr;
+      }
+    }
+
+    // third part: transposed
+    for(Index j2=k2+rows; j2<packet_cols; j2+=nr)
+    {
+      for(Index k=k2; k<end_k; k++)
+      {
+        blockB[count+0] = numext::conj(rhs(j2+0,k));
+        blockB[count+1] = numext::conj(rhs(j2+1,k));
+        if (nr==4)
+        {
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+        }
+        count += nr;
+      }
+    }
+
+    // copy the remaining columns one at a time (=> the same with nr==1)
+    for(Index j2=packet_cols; j2<cols; ++j2)
+    {
+      // transpose
+      Index half = (std::min)(end_k,j2);
+      for(Index k=k2; k<half; k++)
+      {
+        blockB[count] = numext::conj(rhs(j2,k));
+        count += 1;
+      }
+
+      if(half==j2 && half<k2+rows)
+      {
+        blockB[count] = numext::real(rhs(j2,j2));
+        count += 1;
+      }
+      else
+        half--;
+
+      // normal
+      for(Index k=half+1; k<k2+rows; k++)
+      {
+        blockB[count] = rhs(k,j2);
+        count += 1;
+      }
+    }
+  }
+};
+
+/* Optimized selfadjoint matrix * matrix (_SYMM) product built on top of
+ * the general matrix matrix product.
+ */
+template <typename Scalar, typename Index,
+          int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
+          int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
+          int ResStorageOrder>
+struct product_selfadjoint_matrix;
+
+template <typename Scalar, typename Index,
+          int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
+          int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor>
+{
+
+  static EIGEN_STRONG_INLINE void run(
+    Index rows, Index cols,
+    const Scalar* lhs, Index lhsStride,
+    const Scalar* rhs, Index rhsStride,
+    Scalar* res,       Index resStride,
+    const Scalar& alpha)
+  {
+    product_selfadjoint_matrix<Scalar, Index,
+      EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
+      RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs),
+      EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
+      LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
+      ColMajor>
+      ::run(cols, rows,  rhs, rhsStride,  lhs, lhsStride,  res, resStride,  alpha);
+  }
+};
+
+template <typename Scalar, typename Index,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>
+{
+
+  static EIGEN_DONT_INLINE void run(
+    Index rows, Index cols,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* res,        Index resStride,
+    const Scalar& alpha);
+};
+
+template <typename Scalar, typename Index,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs>
+EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>::run(
+    Index rows, Index cols,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* res,        Index resStride,
+    const Scalar& alpha)
+  {
+    Index size = rows;
+
+    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
+    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+
+    typedef gebp_traits<Scalar,Scalar> Traits;
+
+    Index kc = size;  // cache block size along the K direction
+    Index mc = rows;  // cache block size along the M direction
+    Index nc = cols;  // cache block size along the N direction
+    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
+    // kc must smaller than mc
+    kc = (std::min)(kc,mc);
+
+    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+    std::size_t sizeB = sizeW + kc*cols;
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
+    ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
+    Scalar* blockB = allocatedBlockB + sizeW;
+
+    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
+
+    for(Index k2=0; k2<size; k2+=kc)
+    {
+      const Index actual_kc = (std::min)(k2+kc,size)-k2;
+
+      // we have selected one row panel of rhs and one column panel of lhs
+      // pack rhs's panel into a sequential chunk of memory
+      // and expand each coeff to a constant packet for further reuse
+      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
+
+      // the select lhs's panel has to be split in three different parts:
+      //  1 - the transposed panel above the diagonal block => transposed packed copy
+      //  2 - the diagonal block => special packed copy
+      //  3 - the panel below the diagonal block => generic packed copy
+      for(Index i2=0; i2<k2; i2+=mc)
+      {
+        const Index actual_mc = (std::min)(i2+mc,k2)-i2;
+        // transposed packed copy
+        pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
+
+        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+      }
+      // the block diagonal
+      {
+        const Index actual_mc = (std::min)(k2+kc,size)-k2;
+        // symmetric packed copy
+        pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
+
+        gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+      }
+
+      for(Index i2=k2+kc; i2<size; i2+=mc)
+      {
+        const Index actual_mc = (std::min)(i2+mc,size)-i2;
+        gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
+          (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+
+        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+      }
+    }
+  }
+
+// matrix * selfadjoint product
+template <typename Scalar, typename Index,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>
+{
+
+  static EIGEN_DONT_INLINE void run(
+    Index rows, Index cols,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* res,        Index resStride,
+    const Scalar& alpha);
+};
+
+template <typename Scalar, typename Index,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs>
+EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>::run(
+    Index rows, Index cols,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* res,        Index resStride,
+    const Scalar& alpha)
+  {
+    Index size = cols;
+
+    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
+
+    typedef gebp_traits<Scalar,Scalar> Traits;
+
+    Index kc = size; // cache block size along the K direction
+    Index mc = rows;  // cache block size along the M direction
+    Index nc = cols;  // cache block size along the N direction
+    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
+    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+    std::size_t sizeB = sizeW + kc*cols;
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
+    ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
+    Scalar* blockB = allocatedBlockB + sizeW;
+
+    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+
+    for(Index k2=0; k2<size; k2+=kc)
+    {
+      const Index actual_kc = (std::min)(k2+kc,size)-k2;
+
+      pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2);
+
+      // => GEPP
+      for(Index i2=0; i2<rows; i2+=mc)
+      {
+        const Index actual_mc = (std::min)(i2+mc,rows)-i2;
+        pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+
+        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+      }
+    }
+  }
+
+} // end namespace internal
+
+/***************************************************************************
+* Wrapper to product_selfadjoint_matrix
+***************************************************************************/
+
+namespace internal {
+template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
+struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
+  : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
+{};
+}
+
+template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
+struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
+  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
+{
+  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+
+  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+  enum {
+    LhsIsUpper = (LhsMode&(Upper|Lower))==Upper,
+    LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint,
+    RhsIsUpper = (RhsMode&(Upper|Lower))==Upper,
+    RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint
+  };
+
+  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  {
+    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
+                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+
+    internal::product_selfadjoint_matrix<Scalar, Index,
+      EIGEN_LOGICAL_XOR(LhsIsUpper,
+                        internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
+      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
+      EIGEN_LOGICAL_XOR(RhsIsUpper,
+                        internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
+      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
+      internal::traits<Dest>::Flags&RowMajorBit  ? RowMajor : ColMajor>
+      ::run(
+        lhs.rows(), rhs.cols(),                 // sizes
+        &lhs.coeffRef(0,0),    lhs.outerStride(),  // lhs info
+        &rhs.coeffRef(0,0),    rhs.outerStride(),  // rhs info
+        &dst.coeffRef(0,0), dst.outerStride(),  // result info
+        actualAlpha                             // alpha
+      );
+  }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
diff --git a/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h b/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
new file mode 100644
index 0000000..dfa687f
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
@@ -0,0 +1,295 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *   Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
+
+namespace Eigen { 
+
+namespace internal {
+
+
+/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
+
+#define EIGEN_MKL_SYMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
+{\
+\
+  static void run( \
+    Index rows, Index cols, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resStride, \
+    EIGTYPE alpha) \
+  { \
+    char side='L', uplo='L'; \
+    MKL_INT m, n, lda, ldb, ldc; \
+    const EIGTYPE *a, *b; \
+    MKLTYPE alpha_, beta_; \
+    MatrixX##EIGPREFIX b_tmp; \
+    EIGTYPE myone(1);\
+\
+/* Set transpose options */ \
+/* Set m, n, k */ \
+    m = (MKL_INT)rows;  \
+    n = (MKL_INT)cols;  \
+\
+/* Set alpha_ & beta_ */ \
+    assign_scalar_eig2mkl(alpha_, alpha); \
+    assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+    lda = (MKL_INT)lhsStride; \
+    ldb = (MKL_INT)rhsStride; \
+    ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+    if (LhsStorageOrder==RowMajor) uplo='U'; \
+    a = _lhs; \
+\
+    if (RhsStorageOrder==RowMajor) { \
+      Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+      b_tmp = rhs.adjoint(); \
+      b = b_tmp.data(); \
+      ldb = b_tmp.outerStride(); \
+    } else b = _rhs; \
+\
+    MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+\
+  } \
+};
+
+
+#define EIGEN_MKL_HEMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
+{\
+  static void run( \
+    Index rows, Index cols, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resStride, \
+    EIGTYPE alpha) \
+  { \
+    char side='L', uplo='L'; \
+    MKL_INT m, n, lda, ldb, ldc; \
+    const EIGTYPE *a, *b; \
+    MKLTYPE alpha_, beta_; \
+    MatrixX##EIGPREFIX b_tmp; \
+    Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \
+    EIGTYPE myone(1); \
+\
+/* Set transpose options */ \
+/* Set m, n, k */ \
+    m = (MKL_INT)rows; \
+    n = (MKL_INT)cols; \
+\
+/* Set alpha_ & beta_ */ \
+    assign_scalar_eig2mkl(alpha_, alpha); \
+    assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+    lda = (MKL_INT)lhsStride; \
+    ldb = (MKL_INT)rhsStride; \
+    ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+    if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \
+      Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \
+      a_tmp = lhs.conjugate(); \
+      a = a_tmp.data(); \
+      lda = a_tmp.outerStride(); \
+    } else a = _lhs; \
+    if (LhsStorageOrder==RowMajor) uplo='U'; \
+\
+    if (RhsStorageOrder==ColMajor && (!ConjugateRhs)) { \
+       b = _rhs; } \
+    else { \
+      if (RhsStorageOrder==ColMajor && ConjugateRhs) { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,m,n,OuterStride<>(rhsStride)); \
+        b_tmp = rhs.conjugate(); \
+      } else \
+      if (ConjugateRhs) { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+        b_tmp = rhs.adjoint(); \
+      } else { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+        b_tmp = rhs.transpose(); \
+      } \
+      b = b_tmp.data(); \
+      ldb = b_tmp.outerStride(); \
+    } \
+\
+    MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+\
+  } \
+};
+
+EIGEN_MKL_SYMM_L(double, double, d, d)
+EIGEN_MKL_SYMM_L(float, float, f, s)
+EIGEN_MKL_HEMM_L(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_HEMM_L(scomplex, MKL_Complex8, cf, c)
+
+
+/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
+
+#define EIGEN_MKL_SYMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
+{\
+\
+  static void run( \
+    Index rows, Index cols, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resStride, \
+    EIGTYPE alpha) \
+  { \
+    char side='R', uplo='L'; \
+    MKL_INT m, n, lda, ldb, ldc; \
+    const EIGTYPE *a, *b; \
+    MKLTYPE alpha_, beta_; \
+    MatrixX##EIGPREFIX b_tmp; \
+    EIGTYPE myone(1);\
+\
+/* Set m, n, k */ \
+    m = (MKL_INT)rows;  \
+    n = (MKL_INT)cols;  \
+\
+/* Set alpha_ & beta_ */ \
+    assign_scalar_eig2mkl(alpha_, alpha); \
+    assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+    lda = (MKL_INT)rhsStride; \
+    ldb = (MKL_INT)lhsStride; \
+    ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+    if (RhsStorageOrder==RowMajor) uplo='U'; \
+    a = _rhs; \
+\
+    if (LhsStorageOrder==RowMajor) { \
+      Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \
+      b_tmp = lhs.adjoint(); \
+      b = b_tmp.data(); \
+      ldb = b_tmp.outerStride(); \
+    } else b = _lhs; \
+\
+    MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+\
+  } \
+};
+
+
+#define EIGEN_MKL_HEMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
+{\
+  static void run( \
+    Index rows, Index cols, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resStride, \
+    EIGTYPE alpha) \
+  { \
+    char side='R', uplo='L'; \
+    MKL_INT m, n, lda, ldb, ldc; \
+    const EIGTYPE *a, *b; \
+    MKLTYPE alpha_, beta_; \
+    MatrixX##EIGPREFIX b_tmp; \
+    Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \
+    EIGTYPE myone(1); \
+\
+/* Set m, n, k */ \
+    m = (MKL_INT)rows; \
+    n = (MKL_INT)cols; \
+\
+/* Set alpha_ & beta_ */ \
+    assign_scalar_eig2mkl(alpha_, alpha); \
+    assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+    lda = (MKL_INT)rhsStride; \
+    ldb = (MKL_INT)lhsStride; \
+    ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+    if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \
+      Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \
+      a_tmp = rhs.conjugate(); \
+      a = a_tmp.data(); \
+      lda = a_tmp.outerStride(); \
+    } else a = _rhs; \
+    if (RhsStorageOrder==RowMajor) uplo='U'; \
+\
+    if (LhsStorageOrder==ColMajor && (!ConjugateLhs)) { \
+       b = _lhs; } \
+    else { \
+      if (LhsStorageOrder==ColMajor && ConjugateLhs) { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,n,OuterStride<>(lhsStride)); \
+        b_tmp = lhs.conjugate(); \
+      } else \
+      if (ConjugateLhs) { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+        b_tmp = lhs.adjoint(); \
+      } else { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+        b_tmp = lhs.transpose(); \
+      } \
+      b = b_tmp.data(); \
+      ldb = b_tmp.outerStride(); \
+    } \
+\
+    MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+  } \
+};
+
+EIGEN_MKL_SYMM_R(double, double, d, d)
+EIGEN_MKL_SYMM_R(float, float, f, s)
+EIGEN_MKL_HEMM_R(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_HEMM_R(scomplex, MKL_Complex8, cf, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
diff --git a/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h b/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h
new file mode 100644
index 0000000..f698f67
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -0,0 +1,281 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/* Optimized selfadjoint matrix * vector product:
+ * This algorithm processes 2 columns at onces that allows to both reduce
+ * the number of load/stores of the result by a factor 2 and to reduce
+ * the instruction dependency.
+ */
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version=Specialized>
+struct selfadjoint_matrix_vector_product;
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
+struct selfadjoint_matrix_vector_product
+
+{
+static EIGEN_DONT_INLINE void run(
+  Index size,
+  const Scalar*  lhs, Index lhsStride,
+  const Scalar* _rhs, Index rhsIncr,
+  Scalar* res,
+  Scalar alpha);
+};
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
+  Index size,
+  const Scalar*  lhs, Index lhsStride,
+  const Scalar* _rhs, Index rhsIncr,
+  Scalar* res,
+  Scalar alpha)
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+  const Index PacketSize = sizeof(Packet)/sizeof(Scalar);
+
+  enum {
+    IsRowMajor = StorageOrder==RowMajor ? 1 : 0,
+    IsLower = UpLo == Lower ? 1 : 0,
+    FirstTriangular = IsRowMajor == IsLower
+  };
+
+  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> cj0;
+  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
+  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex, ConjugateRhs> cjd;
+
+  conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> pcj0;
+  conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1;
+
+  Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
+
+  // FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed.
+  // if the rhs is not sequentially stored in memory we copy it to a temporary buffer,
+  // this is because we need to extract packets
+  ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ? const_cast<Scalar*>(_rhs) : 0);  
+  if (rhsIncr!=1)
+  {
+    const Scalar* it = _rhs;
+    for (Index i=0; i<size; ++i, it+=rhsIncr)
+      rhs[i] = *it;
+  }
+
+  Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
+  if (FirstTriangular)
+    bound = size - bound;
+
+  for (Index j=FirstTriangular ? bound : 0;
+       j<(FirstTriangular ? size : bound);j+=2)
+  {
+    const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+    const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
+
+    Scalar t0 = cjAlpha * rhs[j];
+    Packet ptmp0 = pset1<Packet>(t0);
+    Scalar t1 = cjAlpha * rhs[j+1];
+    Packet ptmp1 = pset1<Packet>(t1);
+
+    Scalar t2(0);
+    Packet ptmp2 = pset1<Packet>(t2);
+    Scalar t3(0);
+    Packet ptmp3 = pset1<Packet>(t3);
+
+    size_t starti = FirstTriangular ? 0 : j+2;
+    size_t endi   = FirstTriangular ? j : size;
+    size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti);
+    size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
+
+    // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
+    res[j]   += cjd.pmul(numext::real(A0[j]), t0);
+    res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1);
+    if(FirstTriangular)
+    {
+      res[j]   += cj0.pmul(A1[j],   t1);
+      t3       += cj1.pmul(A1[j],   rhs[j]);
+    }
+    else
+    {
+      res[j+1] += cj0.pmul(A0[j+1],t0);
+      t2 += cj1.pmul(A0[j+1], rhs[j+1]);
+    }
+
+    for (size_t i=starti; i<alignedStart; ++i)
+    {
+      res[i] += t0 * A0[i] + t1 * A1[i];
+      t2 += numext::conj(A0[i]) * rhs[i];
+      t3 += numext::conj(A1[i]) * rhs[i];
+    }
+    // Yes this an optimization for gcc 4.3 and 4.4 (=> huge speed up)
+    // gcc 4.2 does this optimization automatically.
+    const Scalar* EIGEN_RESTRICT a0It  = A0  + alignedStart;
+    const Scalar* EIGEN_RESTRICT a1It  = A1  + alignedStart;
+    const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
+          Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
+    for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
+    {
+      Packet A0i = ploadu<Packet>(a0It);  a0It  += PacketSize;
+      Packet A1i = ploadu<Packet>(a1It);  a1It  += PacketSize;
+      Packet Bi  = ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases
+      Packet Xi  = pload <Packet>(resIt);
+
+      Xi    = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi));
+      ptmp2 = pcj1.pmadd(A0i,  Bi, ptmp2);
+      ptmp3 = pcj1.pmadd(A1i,  Bi, ptmp3);
+      pstore(resIt,Xi); resIt += PacketSize;
+    }
+    for (size_t i=alignedEnd; i<endi; i++)
+    {
+      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
+      t2 += cj1.pmul(A0[i], rhs[i]);
+      t3 += cj1.pmul(A1[i], rhs[i]);
+    }
+
+    res[j]   += alpha * (t2 + predux(ptmp2));
+    res[j+1] += alpha * (t3 + predux(ptmp3));
+  }
+  for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
+  {
+    const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+
+    Scalar t1 = cjAlpha * rhs[j];
+    Scalar t2(0);
+    // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
+    res[j] += cjd.pmul(numext::real(A0[j]), t1);
+    for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++)
+    {
+      res[i] += cj0.pmul(A0[i], t1);
+      t2 += cj1.pmul(A0[i], rhs[i]);
+    }
+    res[j] += alpha * t2;
+  }
+}
+
+} // end namespace internal 
+
+/***************************************************************************
+* Wrapper to product_selfadjoint_vector
+***************************************************************************/
+
+namespace internal {
+template<typename Lhs, int LhsMode, typename Rhs>
+struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
+  : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
+{};
+}
+
+template<typename Lhs, int LhsMode, typename Rhs>
+struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
+  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
+{
+  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+
+  enum {
+    LhsUpLo = LhsMode&(Upper|Lower)
+  };
+
+  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+  template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+  {
+    typedef typename Dest::Scalar ResScalar;
+    typedef typename Base::RhsScalar RhsScalar;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    
+    eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols());
+
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
+                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+
+    enum {
+      EvalToDest = (Dest::InnerStrideAtCompileTime==1),
+      UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1)
+    };
+    
+    internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest;
+    internal::gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!UseRhs> static_rhs;
+
+    ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+                                                  EvalToDest ? dest.data() : static_dest.data());
+                                                  
+    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,rhs.size(),
+        UseRhs ? const_cast<RhsScalar*>(rhs.data()) : static_rhs.data());
+    
+    if(!EvalToDest)
+    {
+      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      int size = dest.size();
+      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      #endif
+      MappedDest(actualDestPtr, dest.size()) = dest;
+    }
+      
+    if(!UseRhs)
+    {
+      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      int size = rhs.size();
+      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      #endif
+      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
+    }
+      
+      
+    internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
+      (
+        lhs.rows(),                             // size
+        &lhs.coeffRef(0,0),  lhs.outerStride(), // lhs info
+        actualRhsPtr, 1,                        // rhs info
+        actualDestPtr,                          // result info
+        actualAlpha                             // scale factor
+      );
+    
+    if(!EvalToDest)
+      dest = MappedDest(actualDestPtr, dest.size());
+  }
+};
+
+namespace internal {
+template<typename Lhs, typename Rhs, int RhsMode>
+struct traits<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> >
+  : traits<ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs> >
+{};
+}
+
+template<typename Lhs, typename Rhs, int RhsMode>
+struct SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>
+  : public ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs >
+{
+  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+
+  enum {
+    RhsUpLo = RhsMode&(Upper|Lower)
+  };
+
+  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+  template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+  {
+    // let's simply transpose the product
+    Transpose<Dest> destT(dest);
+    SelfadjointProductMatrix<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
+                             Transpose<const Lhs>, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha);
+  }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
diff --git a/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
new file mode 100644
index 0000000..86684b6
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
@@ -0,0 +1,114 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *   Selfadjoint matrix-vector product functionality based on ?SYMV/HEMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**********************************************************************
+* This file implements selfadjoint matrix-vector multiplication using BLAS
+**********************************************************************/
+
+// symv/hemv specialization
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>
+struct selfadjoint_matrix_vector_product_symv :
+  selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn> {};
+
+#define EIGEN_MKL_SYMV_SPECIALIZE(Scalar) \
+template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
+struct selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Specialized> { \
+static void run( \
+  Index size, const Scalar*  lhs, Index lhsStride, \
+  const Scalar* _rhs, Index rhsIncr, Scalar* res, Scalar alpha) { \
+    enum {\
+      IsColMajor = StorageOrder==ColMajor \
+    }; \
+    if (IsColMajor == ConjugateLhs) {\
+      selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn>::run( \
+        size, lhs, lhsStride, _rhs, rhsIncr, res, alpha);  \
+    } else {\
+      selfadjoint_matrix_vector_product_symv<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs>::run( \
+        size, lhs, lhsStride, _rhs, rhsIncr, res, alpha);  \
+    }\
+  } \
+}; \
+
+EIGEN_MKL_SYMV_SPECIALIZE(double)
+EIGEN_MKL_SYMV_SPECIALIZE(float)
+EIGEN_MKL_SYMV_SPECIALIZE(dcomplex)
+EIGEN_MKL_SYMV_SPECIALIZE(scomplex)
+
+#define EIGEN_MKL_SYMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLFUNC) \
+template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
+struct selfadjoint_matrix_vector_product_symv<EIGTYPE,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs> \
+{ \
+typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> SYMVVector;\
+\
+static void run( \
+Index size, const EIGTYPE*  lhs, Index lhsStride, \
+const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* res, EIGTYPE alpha) \
+{ \
+  enum {\
+    IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \
+    IsLower = UpLo == Lower ? 1 : 0 \
+  }; \
+  MKL_INT n=size, lda=lhsStride, incx=rhsIncr, incy=1; \
+  MKLTYPE alpha_, beta_; \
+  const EIGTYPE *x_ptr, myone(1); \
+  char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U'); \
+  assign_scalar_eig2mkl(alpha_, alpha); \
+  assign_scalar_eig2mkl(beta_, myone); \
+  SYMVVector x_tmp; \
+  if (ConjugateRhs) { \
+    Map<const SYMVVector, 0, InnerStride<> > map_x(_rhs,size,1,InnerStride<>(incx)); \
+    x_tmp=map_x.conjugate(); \
+    x_ptr=x_tmp.data(); \
+    incx=1; \
+  } else x_ptr=_rhs; \
+  MKLFUNC(&uplo, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
+}\
+};
+
+EIGEN_MKL_SYMV_SPECIALIZATION(double,   double,        dsymv)
+EIGEN_MKL_SYMV_SPECIALIZATION(float,    float,         ssymv)
+EIGEN_MKL_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
+EIGEN_MKL_SYMV_SPECIALIZATION(scomplex, MKL_Complex8,  chemv)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
diff --git a/include/eigen3/Eigen/src/Core/products/SelfadjointProduct.h b/include/eigen3/Eigen/src/Core/products/SelfadjointProduct.h
new file mode 100644
index 0000000..6ca4ae6
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/SelfadjointProduct.h
@@ -0,0 +1,123 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_PRODUCT_H
+#define EIGEN_SELFADJOINT_PRODUCT_H
+
+/**********************************************************************
+* This file implements a self adjoint product: C += A A^T updating only
+* half of the selfadjoint matrix C.
+* It corresponds to the level 3 SYRK and level 2 SYR Blas routines.
+**********************************************************************/
+
+namespace Eigen { 
+
+
+template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo,ConjLhs,ConjRhs>
+{
+  static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha)
+  {
+    internal::conj_if<ConjRhs> cj;
+    typedef Map<const Matrix<Scalar,Dynamic,1> > OtherMap;
+    typedef typename internal::conditional<ConjLhs,typename OtherMap::ConjugateReturnType,const OtherMap&>::type ConjLhsType;
+    for (Index i=0; i<size; ++i)
+    {
+      Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+(UpLo==Lower ? i : 0), (UpLo==Lower ? size-i : (i+1)))
+          += (alpha * cj(vecY[i])) * ConjLhsType(OtherMap(vecX+(UpLo==Lower ? i : 0),UpLo==Lower ? size-i : (i+1)));
+    }
+  }
+};
+
+template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update<Scalar,Index,RowMajor,UpLo,ConjLhs,ConjRhs>
+{
+  static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha)
+  {
+    selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo==Lower?Upper:Lower,ConjRhs,ConjLhs>::run(size,mat,stride,vecY,vecX,alpha);
+  }
+};
+
+template<typename MatrixType, typename OtherType, int UpLo, bool OtherIsVector = OtherType::IsVectorAtCompileTime>
+struct selfadjoint_product_selector;
+
+template<typename MatrixType, typename OtherType, int UpLo>
+struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,true>
+{
+  static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
+  {
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::Index Index;
+    typedef internal::blas_traits<OtherType> OtherBlasTraits;
+    typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
+    typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
+    typename internal::add_const_on_value_type<ActualOtherType>::type actualOther = OtherBlasTraits::extract(other.derived());
+
+    Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
+
+    enum {
+      StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+      UseOtherDirectly = _ActualOtherType::InnerStrideAtCompileTime==1
+    };
+    internal::gemv_static_vector_if<Scalar,OtherType::SizeAtCompileTime,OtherType::MaxSizeAtCompileTime,!UseOtherDirectly> static_other;
+
+    ei_declare_aligned_stack_constructed_variable(Scalar, actualOtherPtr, other.size(),
+      (UseOtherDirectly ? const_cast<Scalar*>(actualOther.data()) : static_other.data()));
+      
+    if(!UseOtherDirectly)
+      Map<typename _ActualOtherType::PlainObject>(actualOtherPtr, actualOther.size()) = actualOther;
+    
+    selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
+                              OtherBlasTraits::NeedToConjugate  && NumTraits<Scalar>::IsComplex,
+                            (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex>
+          ::run(other.size(), mat.data(), mat.outerStride(), actualOtherPtr, actualOtherPtr, actualAlpha);
+  }
+};
+
+template<typename MatrixType, typename OtherType, int UpLo>
+struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
+{
+  static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
+  {
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::Index Index;
+    typedef internal::blas_traits<OtherType> OtherBlasTraits;
+    typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
+    typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
+    typename internal::add_const_on_value_type<ActualOtherType>::type actualOther = OtherBlasTraits::extract(other.derived());
+
+    Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
+
+    enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
+
+    internal::general_matrix_matrix_triangular_product<Index,
+      Scalar, _ActualOtherType::Flags&RowMajorBit ? RowMajor : ColMajor,   OtherBlasTraits::NeedToConjugate  && NumTraits<Scalar>::IsComplex,
+      Scalar, _ActualOtherType::Flags&RowMajorBit ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
+      MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
+      ::run(mat.cols(), actualOther.cols(),
+            &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(),
+            mat.data(), mat.outerStride(), actualAlpha);
+  }
+};
+
+// high level API
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename DerivedU>
+SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha)
+{
+  selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
+
+  return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_PRODUCT_H
diff --git a/include/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h b/include/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h
new file mode 100644
index 0000000..8594a97
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -0,0 +1,93 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINTRANK2UPTADE_H
+#define EIGEN_SELFADJOINTRANK2UPTADE_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/* Optimized selfadjoint matrix += alpha * uv' + conj(alpha)*vu'
+ * It corresponds to the Level2 syr2 BLAS routine
+ */
+
+template<typename Scalar, typename Index, typename UType, typename VType, int UpLo>
+struct selfadjoint_rank2_update_selector;
+
+template<typename Scalar, typename Index, typename UType, typename VType>
+struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
+{
+  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+  {
+    const Index size = u.size();
+    for (Index i=0; i<size; ++i)
+    {
+      Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+i, size-i) +=
+                        (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.tail(size-i)
+                      + (alpha * numext::conj(v.coeff(i))) * u.tail(size-i);
+    }
+  }
+};
+
+template<typename Scalar, typename Index, typename UType, typename VType>
+struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Upper>
+{
+  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+  {
+    const Index size = u.size();
+    for (Index i=0; i<size; ++i)
+      Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i, i+1) +=
+                        (numext::conj(alpha)  * numext::conj(u.coeff(i))) * v.head(i+1)
+                      + (alpha * numext::conj(v.coeff(i))) * u.head(i+1);
+  }
+};
+
+template<bool Cond, typename T> struct conj_expr_if
+  : conditional<!Cond, const T&,
+      CwiseUnaryOp<scalar_conjugate_op<typename traits<T>::Scalar>,T> > {};
+
+} // end namespace internal
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename DerivedU, typename DerivedV>
+SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha)
+{
+  typedef internal::blas_traits<DerivedU> UBlasTraits;
+  typedef typename UBlasTraits::DirectLinearAccessType ActualUType;
+  typedef typename internal::remove_all<ActualUType>::type _ActualUType;
+  typename internal::add_const_on_value_type<ActualUType>::type actualU = UBlasTraits::extract(u.derived());
+
+  typedef internal::blas_traits<DerivedV> VBlasTraits;
+  typedef typename VBlasTraits::DirectLinearAccessType ActualVType;
+  typedef typename internal::remove_all<ActualVType>::type _ActualVType;
+  typename internal::add_const_on_value_type<ActualVType>::type actualV = VBlasTraits::extract(v.derived());
+
+  // If MatrixType is row major, then we use the routine for lower triangular in the upper triangular case and
+  // vice versa, and take the complex conjugate of all coefficients and vector entries.
+
+  enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
+  Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived())
+                             * numext::conj(VBlasTraits::extractScalarFactor(v.derived()));
+  if (IsRowMajor)
+    actualAlpha = numext::conj(actualAlpha);
+
+  internal::selfadjoint_rank2_update_selector<Scalar, Index,
+    typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type,
+    typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type,
+    (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)>
+    ::run(_expression().const_cast_derived().data(),_expression().outerStride(),actualU,actualV,actualAlpha);
+
+  return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINTRANK2UPTADE_H
diff --git a/include/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h b/include/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h
new file mode 100644
index 0000000..8110507
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -0,0 +1,427 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H
+#define EIGEN_TRIANGULAR_MATRIX_MATRIX_H
+
+namespace Eigen { 
+
+namespace internal {
+
+// template<typename Scalar, int mr, int StorageOrder, bool Conjugate, int Mode>
+// struct gemm_pack_lhs_triangular
+// {
+//   Matrix<Scalar,mr,mr,
+//   void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows)
+//   {
+//     conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+//     const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
+//     int count = 0;
+//     const int peeled_mc = (rows/mr)*mr;
+//     for(int i=0; i<peeled_mc; i+=mr)
+//     {
+//       for(int k=0; k<depth; k++)
+//         for(int w=0; w<mr; w++)
+//           blockA[count++] = cj(lhs(i+w, k));
+//     }
+//     for(int i=peeled_mc; i<rows; i++)
+//     {
+//       for(int k=0; k<depth; k++)
+//         blockA[count++] = cj(lhs(i, k));
+//     }
+//   }
+// };
+
+/* Optimized triangular matrix * matrix (_TRMM++) product built on top of
+ * the general matrix matrix product.
+ */
+template <typename Scalar, typename Index,
+          int Mode, bool LhsIsTriangular,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResStorageOrder, int Version = Specialized>
+struct product_triangular_matrix_matrix;
+
+template <typename Scalar, typename Index,
+          int Mode, bool LhsIsTriangular,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
+                                           LhsStorageOrder,ConjugateLhs,
+                                           RhsStorageOrder,ConjugateRhs,RowMajor,Version>
+{
+  static EIGEN_STRONG_INLINE void run(
+    Index rows, Index cols, Index depth,
+    const Scalar* lhs, Index lhsStride,
+    const Scalar* rhs, Index rhsStride,
+    Scalar* res,       Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+  {
+    product_triangular_matrix_matrix<Scalar, Index,
+      (Mode&(UnitDiag|ZeroDiag)) | ((Mode&Upper) ? Lower : Upper),
+      (!LhsIsTriangular),
+      RhsStorageOrder==RowMajor ? ColMajor : RowMajor,
+      ConjugateRhs,
+      LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
+      ConjugateLhs,
+      ColMajor>
+      ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
+  }
+};
+
+// implements col-major += alpha * op(triangular) * op(general)
+template <typename Scalar, typename Index, int Mode,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
+                                           LhsStorageOrder,ConjugateLhs,
+                                           RhsStorageOrder,ConjugateRhs,ColMajor,Version>
+{
+  
+  typedef gebp_traits<Scalar,Scalar> Traits;
+  enum {
+    SmallPanelWidth   = 2 * EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+    IsLower = (Mode&Lower) == Lower,
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
+  };
+
+  static EIGEN_DONT_INLINE void run(
+    Index _rows, Index _cols, Index _depth,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* res,        Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+};
+
+template <typename Scalar, typename Index, int Mode,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
+                                                        LhsStorageOrder,ConjugateLhs,
+                                                        RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
+    Index _rows, Index _cols, Index _depth,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* res,        Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+  {
+    // strip zeros
+    Index diagSize  = (std::min)(_rows,_depth);
+    Index rows      = IsLower ? _rows : diagSize;
+    Index depth     = IsLower ? diagSize : _depth;
+    Index cols      = _cols;
+    
+    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
+    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*cols;
+    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
+
+    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
+    triangularBuffer.setZero();
+    if((Mode&ZeroDiag)==ZeroDiag)
+      triangularBuffer.diagonal().setZero();
+    else
+      triangularBuffer.diagonal().setOnes();
+
+    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+
+    for(Index k2=IsLower ? depth : 0;
+        IsLower ? k2>0 : k2<depth;
+        IsLower ? k2-=kc : k2+=kc)
+    {
+      Index actual_kc = (std::min)(IsLower ? k2 : depth-k2, kc);
+      Index actual_k2 = IsLower ? k2-actual_kc : k2;
+
+      // align blocks with the end of the triangular part for trapezoidal lhs
+      if((!IsLower)&&(k2<rows)&&(k2+actual_kc>rows))
+      {
+        actual_kc = rows-k2;
+        k2 = k2+actual_kc-kc;
+      }
+
+      pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols);
+
+      // the selected lhs's panel has to be split in three different parts:
+      //  1 - the part which is zero => skip it
+      //  2 - the diagonal block => special kernel
+      //  3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP
+
+      // the block diagonal, if any:
+      if(IsLower || actual_k2<rows)
+      {
+        // for each small vertical panels of lhs
+        for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
+        {
+          Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
+          Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1;
+          Index startBlock   = actual_k2+k1;
+          Index blockBOffset = k1;
+
+          // => GEBP with the micro triangular block
+          // The trick is to pack this micro block while filling the opposite triangular part with zeros.
+          // To this end we do an extra triangular copy to a small temporary buffer
+          for (Index k=0;k<actualPanelWidth;++k)
+          {
+            if (SetDiag)
+              triangularBuffer.coeffRef(k,k) = lhs(startBlock+k,startBlock+k);
+            for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
+              triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
+          }
+          pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
+
+          gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
+                      actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+
+          // GEBP with remaining micro panel
+          if (lengthTarget>0)
+          {
+            Index startTarget  = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
+
+            pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
+
+            gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
+                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+          }
+        }
+      }
+      // the part below (lower case) or above (upper case) the diagonal => GEPP
+      {
+        Index start = IsLower ? k2 : 0;
+        Index end   = IsLower ? rows : (std::min)(actual_k2,rows);
+        for(Index i2=start; i2<end; i2+=mc)
+        {
+          const Index actual_mc = (std::min)(i2+mc,end)-i2;
+          gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
+            (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
+
+          gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
+        }
+      }
+    }
+  }
+
+// implements col-major += alpha * op(general) * op(triangular)
+template <typename Scalar, typename Index, int Mode,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
+                                        LhsStorageOrder,ConjugateLhs,
+                                        RhsStorageOrder,ConjugateRhs,ColMajor,Version>
+{
+  typedef gebp_traits<Scalar,Scalar> Traits;
+  enum {
+    SmallPanelWidth   = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+    IsLower = (Mode&Lower) == Lower,
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
+  };
+
+  static EIGEN_DONT_INLINE void run(
+    Index _rows, Index _cols, Index _depth,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* res,        Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+};
+
+template <typename Scalar, typename Index, int Mode,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
+                                                        LhsStorageOrder,ConjugateLhs,
+                                                        RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
+    Index _rows, Index _cols, Index _depth,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* res,        Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+  {
+    // strip zeros
+    Index diagSize  = (std::min)(_cols,_depth);
+    Index rows      = _rows;
+    Index depth     = IsLower ? _depth : diagSize;
+    Index cols      = IsLower ? diagSize : _cols;
+    
+    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
+    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*cols;
+    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
+
+    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
+    triangularBuffer.setZero();
+    if((Mode&ZeroDiag)==ZeroDiag)
+      triangularBuffer.diagonal().setZero();
+    else
+      triangularBuffer.diagonal().setOnes();
+
+    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
+
+    for(Index k2=IsLower ? 0 : depth;
+        IsLower ? k2<depth  : k2>0;
+        IsLower ? k2+=kc   : k2-=kc)
+    {
+      Index actual_kc = (std::min)(IsLower ? depth-k2 : k2, kc);
+      Index actual_k2 = IsLower ? k2 : k2-actual_kc;
+
+      // align blocks with the end of the triangular part for trapezoidal rhs
+      if(IsLower && (k2<cols) && (actual_k2+actual_kc>cols))
+      {
+        actual_kc = cols-k2;
+        k2 = actual_k2 + actual_kc - kc;
+      }
+
+      // remaining size
+      Index rs = IsLower ? (std::min)(cols,actual_k2) : cols - k2;
+      // size of the triangular part
+      Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
+
+      Scalar* geb = blockB+ts*ts;
+
+      pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);
+
+      // pack the triangular part of the rhs padding the unrolled blocks with zeros
+      if(ts>0)
+      {
+        for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+        {
+          Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+          Index actual_j2 = actual_k2 + j2;
+          Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+          Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
+          // general part
+          pack_rhs_panel(blockB+j2*actual_kc,
+                         &rhs(actual_k2+panelOffset, actual_j2), rhsStride,
+                         panelLength, actualPanelWidth,
+                         actual_kc, panelOffset);
+
+          // append the triangular part via a temporary buffer
+          for (Index j=0;j<actualPanelWidth;++j)
+          {
+            if (SetDiag)
+              triangularBuffer.coeffRef(j,j) = rhs(actual_j2+j,actual_j2+j);
+            for (Index k=IsLower ? j+1 : 0; IsLower ? k<actualPanelWidth : k<j; ++k)
+              triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j);
+          }
+
+          pack_rhs_panel(blockB+j2*actual_kc,
+                         triangularBuffer.data(), triangularBuffer.outerStride(),
+                         actualPanelWidth, actualPanelWidth,
+                         actual_kc, j2);
+        }
+      }
+
+      for (Index i2=0; i2<rows; i2+=mc)
+      {
+        const Index actual_mc = (std::min)(mc,rows-i2);
+        pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
+
+        // triangular kernel
+        if(ts>0)
+        {
+          for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+          {
+            Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+            Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
+            Index blockOffset = IsLower ? j2 : 0;
+
+            gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
+                        blockA, blockB+j2*actual_kc,
+                        actual_mc, panelLength, actualPanelWidth,
+                        alpha,
+                        actual_kc, actual_kc,  // strides
+                        blockOffset, blockOffset,// offsets
+                        blockW); // workspace
+          }
+        }
+        gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
+                    blockA, geb, actual_mc, actual_kc, rs,
+                    alpha,
+                    -1, -1, 0, 0, blockW);
+      }
+    }
+  }
+
+/***************************************************************************
+* Wrapper to product_triangular_matrix_matrix
+***************************************************************************/
+
+template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> >
+  : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs> >
+{};
+
+} // end namespace internal
+
+template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
+  : public ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs >
+{
+  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
+
+  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  {
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
+                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+
+    typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
+              Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
+
+    enum { IsLower = (Mode&Lower) == Lower };
+    Index stripedRows  = ((!LhsIsTriangular) || (IsLower))  ? lhs.rows() : (std::min)(lhs.rows(),lhs.cols());
+    Index stripedCols  = ((LhsIsTriangular)  || (!IsLower)) ? rhs.cols() : (std::min)(rhs.cols(),rhs.rows());
+    Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
+                                         : ((IsLower)  ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
+
+    BlockingType blocking(stripedRows, stripedCols, stripedDepth);
+
+    internal::product_triangular_matrix_matrix<Scalar, Index,
+      Mode, LhsIsTriangular,
+      (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+      (internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+      (internal::traits<Dest          >::Flags&RowMajorBit) ? RowMajor : ColMajor>
+      ::run(
+        stripedRows, stripedCols, stripedDepth,   // sizes
+        &lhs.coeffRef(0,0),    lhs.outerStride(), // lhs info
+        &rhs.coeffRef(0,0),    rhs.outerStride(), // rhs info
+        &dst.coeffRef(0,0), dst.outerStride(),    // result info
+        actualAlpha, blocking
+      );
+  }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H
diff --git a/include/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/include/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
new file mode 100644
index 0000000..ba41a1c
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
@@ -0,0 +1,309 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *   Triangular matrix * matrix product functionality based on ?TRMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
+#define EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
+
+namespace Eigen { 
+
+namespace internal {
+
+
+template <typename Scalar, typename Index,
+          int Mode, bool LhsIsTriangular,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResStorageOrder>
+struct product_triangular_matrix_matrix_trmm :
+       product_triangular_matrix_matrix<Scalar,Index,Mode,
+          LhsIsTriangular,LhsStorageOrder,ConjugateLhs,
+          RhsStorageOrder, ConjugateRhs, ResStorageOrder, BuiltIn> {};
+
+
+// try to go to BLAS specialization
+#define EIGEN_MKL_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \
+template <typename Index, int Mode, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \
+           LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,Specialized> { \
+  static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\
+    const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
+      product_triangular_matrix_matrix_trmm<Scalar,Index,Mode, \
+        LhsIsTriangular,LhsStorageOrder,ConjugateLhs, \
+        RhsStorageOrder, ConjugateRhs, ColMajor>::run( \
+        _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
+  } \
+};
+
+EIGEN_MKL_TRMM_SPECIALIZE(double, true)
+EIGEN_MKL_TRMM_SPECIALIZE(double, false)
+EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, true)
+EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, false)
+EIGEN_MKL_TRMM_SPECIALIZE(float, true)
+EIGEN_MKL_TRMM_SPECIALIZE(float, false)
+EIGEN_MKL_TRMM_SPECIALIZE(scomplex, true)
+EIGEN_MKL_TRMM_SPECIALIZE(scomplex, false)
+
+// implements col-major += alpha * op(triangular) * op(general)
+#define EIGEN_MKL_TRMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, int Mode, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
+         LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    LowUp = IsLower ? Lower : Upper, \
+    conjA = ((LhsStorageOrder==ColMajor) && ConjugateLhs) ? 1 : 0 \
+  }; \
+\
+  static void run( \
+    Index _rows, Index _cols, Index _depth, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resStride, \
+    EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \
+  { \
+   Index diagSize  = (std::min)(_rows,_depth); \
+   Index rows      = IsLower ? _rows : diagSize; \
+   Index depth     = IsLower ? diagSize : _depth; \
+   Index cols      = _cols; \
+\
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
+\
+/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \
+   if (rows != depth) { \
+\
+     int nthr = mkl_domain_get_max_threads(MKL_BLAS); \
+\
+     if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
+     /* Most likely no benefit to call TRMM or GEMM from MKL*/ \
+       product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \
+       LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
+           _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
+     /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \
+     } else { \
+     /* Make sense to call GEMM */ \
+       Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+       MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
+       MKL_INT aStride = aa_tmp.outerStride(); \
+       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
+       general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
+       rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \
+\
+     /*std::cout << "TRMM_L: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \
+     } \
+     return; \
+   } \
+   char side = 'L', transa, uplo, diag = 'N'; \
+   EIGTYPE *b; \
+   const EIGTYPE *a; \
+   MKL_INT m, n, lda, ldb; \
+   MKLTYPE alpha_; \
+\
+/* Set alpha_*/ \
+   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+\
+/* Set m, n */ \
+   m = (MKL_INT)diagSize; \
+   n = (MKL_INT)cols; \
+\
+/* Set trans */ \
+   transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set b, ldb */ \
+   Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols,OuterStride<>(rhsStride)); \
+   MatrixX##EIGPREFIX b_tmp; \
+\
+   if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \
+   b = b_tmp.data(); \
+   ldb = b_tmp.outerStride(); \
+\
+/* Set uplo */ \
+   uplo = IsLower ? 'L' : 'U'; \
+   if (LhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+   Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+   MatrixLhs a_tmp; \
+\
+   if ((conjA!=0) || (SetDiag==0)) { \
+     if (conjA) a_tmp = lhs.conjugate(); else a_tmp = lhs; \
+     if (IsZeroDiag) \
+       a_tmp.diagonal().setZero(); \
+     else if (IsUnitDiag) \
+       a_tmp.diagonal().setOnes();\
+     a = a_tmp.data(); \
+     lda = a_tmp.outerStride(); \
+   } else { \
+     a = _lhs; \
+     lda = lhsStride; \
+   } \
+   /*std::cout << "TRMM_L: A is square! Go to MKL TRMM implementation! \n";*/ \
+/* call ?trmm*/ \
+   MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \
+\
+/* Add op(a_triangular)*b into res*/ \
+   Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
+   res_tmp=res_tmp+b_tmp; \
+  } \
+};
+
+EIGEN_MKL_TRMM_L(double, double, d, d)
+EIGEN_MKL_TRMM_L(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_TRMM_L(float, float, f, s)
+EIGEN_MKL_TRMM_L(scomplex, MKL_Complex8, cf, c)
+
+// implements col-major += alpha * op(general) * op(triangular)
+#define EIGEN_MKL_TRMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, int Mode, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
+         LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    LowUp = IsLower ? Lower : Upper, \
+    conjA = ((RhsStorageOrder==ColMajor) && ConjugateRhs) ? 1 : 0 \
+  }; \
+\
+  static void run( \
+    Index _rows, Index _cols, Index _depth, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resStride, \
+    EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \
+  { \
+   Index diagSize  = (std::min)(_cols,_depth); \
+   Index rows      = _rows; \
+   Index depth     = IsLower ? _depth : diagSize; \
+   Index cols      = IsLower ? diagSize : _cols; \
+\
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
+\
+/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \
+   if (cols != depth) { \
+\
+     int nthr = mkl_domain_get_max_threads(MKL_BLAS); \
+\
+     if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
+     /* Most likely no benefit to call TRMM or GEMM from MKL*/ \
+       product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \
+       LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
+           _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
+       /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \
+     } else { \
+     /* Make sense to call GEMM */ \
+       Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \
+       MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
+       MKL_INT aStride = aa_tmp.outerStride(); \
+       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
+       general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
+       rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \
+\
+     /*std::cout << "TRMM_R: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \
+     } \
+     return; \
+   } \
+   char side = 'R', transa, uplo, diag = 'N'; \
+   EIGTYPE *b; \
+   const EIGTYPE *a; \
+   MKL_INT m, n, lda, ldb; \
+   MKLTYPE alpha_; \
+\
+/* Set alpha_*/ \
+   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+\
+/* Set m, n */ \
+   m = (MKL_INT)rows; \
+   n = (MKL_INT)diagSize; \
+\
+/* Set trans */ \
+   transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set b, ldb */ \
+   Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+   MatrixX##EIGPREFIX b_tmp; \
+\
+   if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \
+   b = b_tmp.data(); \
+   ldb = b_tmp.outerStride(); \
+\
+/* Set uplo */ \
+   uplo = IsLower ? 'L' : 'U'; \
+   if (RhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+   Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols, OuterStride<>(rhsStride)); \
+   MatrixRhs a_tmp; \
+\
+   if ((conjA!=0) || (SetDiag==0)) { \
+     if (conjA) a_tmp = rhs.conjugate(); else a_tmp = rhs; \
+     if (IsZeroDiag) \
+       a_tmp.diagonal().setZero(); \
+     else if (IsUnitDiag) \
+       a_tmp.diagonal().setOnes();\
+     a = a_tmp.data(); \
+     lda = a_tmp.outerStride(); \
+   } else { \
+     a = _rhs; \
+     lda = rhsStride; \
+   } \
+   /*std::cout << "TRMM_R: A is square! Go to MKL TRMM implementation! \n";*/ \
+/* call ?trmm*/ \
+   MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \
+\
+/* Add op(a_triangular)*b into res*/ \
+   Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
+   res_tmp=res_tmp+b_tmp; \
+  } \
+};
+
+EIGEN_MKL_TRMM_R(double, double, d, d)
+EIGEN_MKL_TRMM_R(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_TRMM_R(float, float, f, s)
+EIGEN_MKL_TRMM_R(scomplex, MKL_Complex8, cf, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
diff --git a/include/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h b/include/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h
new file mode 100644
index 0000000..6117d5a
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -0,0 +1,348 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULARMATRIXVECTOR_H
+#define EIGEN_TRIANGULARMATRIXVECTOR_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder, int Version=Specialized>
+struct triangular_matrix_vector_product;
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
+struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
+{
+  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  enum {
+    IsLower = ((Mode&Lower)==Lower),
+    HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
+    HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
+  };
+  static EIGEN_DONT_INLINE  void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+                                     const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+};
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
+EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
+  ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+        const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
+  {
+    static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+    Index size = (std::min)(_rows,_cols);
+    Index rows = IsLower ? _rows : (std::min)(_rows,_cols);
+    Index cols = IsLower ? (std::min)(_rows,_cols) : _cols;
+
+    typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
+    const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
+    typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
+    
+    typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap;
+    const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr));
+    typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
+
+    typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
+    ResMap res(_res,rows);
+
+    for (Index pi=0; pi<size; pi+=PanelWidth)
+    {
+      Index actualPanelWidth = (std::min)(PanelWidth, size-pi);
+      for (Index k=0; k<actualPanelWidth; ++k)
+      {
+        Index i = pi + k;
+        Index s = IsLower ? ((HasUnitDiag||HasZeroDiag) ? i+1 : i ) : pi;
+        Index r = IsLower ? actualPanelWidth-k : k+1;
+        if ((!(HasUnitDiag||HasZeroDiag)) || (--r)>0)
+          res.segment(s,r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s,r);
+        if (HasUnitDiag)
+          res.coeffRef(i) += alpha * cjRhs.coeff(i);
+      }
+      Index r = IsLower ? rows - pi - actualPanelWidth : pi;
+      if (r>0)
+      {
+        Index s = IsLower ? pi+actualPanelWidth : 0;
+        general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
+            r, actualPanelWidth,
+            &lhs.coeffRef(s,pi), lhsStride,
+            &rhs.coeffRef(pi), rhsIncr,
+            &res.coeffRef(s), resIncr, alpha);
+      }
+    }
+    if((!IsLower) && cols>size)
+    {
+      general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs>::run(
+          rows, cols-size,
+          &lhs.coeffRef(0,size), lhsStride,
+          &rhs.coeffRef(size), rhsIncr,
+          _res, resIncr, alpha);
+    }
+  }
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
+struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
+{
+  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  enum {
+    IsLower = ((Mode&Lower)==Lower),
+    HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
+    HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
+  };
+  static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+                                    const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+};
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
+EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
+  ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+        const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
+  {
+    static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+    Index diagSize = (std::min)(_rows,_cols);
+    Index rows = IsLower ? _rows : diagSize;
+    Index cols = IsLower ? diagSize : _cols;
+
+    typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
+    const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
+    typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
+
+    typedef Map<const Matrix<RhsScalar,Dynamic,1> > RhsMap;
+    const RhsMap rhs(_rhs,cols);
+    typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
+
+    typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap;
+    ResMap res(_res,rows,InnerStride<>(resIncr));
+    
+    for (Index pi=0; pi<diagSize; pi+=PanelWidth)
+    {
+      Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi);
+      for (Index k=0; k<actualPanelWidth; ++k)
+      {
+        Index i = pi + k;
+        Index s = IsLower ? pi  : ((HasUnitDiag||HasZeroDiag) ? i+1 : i);
+        Index r = IsLower ? k+1 : actualPanelWidth-k;
+        if ((!(HasUnitDiag||HasZeroDiag)) || (--r)>0)
+          res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s,r).cwiseProduct(cjRhs.segment(s,r).transpose())).sum();
+        if (HasUnitDiag)
+          res.coeffRef(i) += alpha * cjRhs.coeff(i);
+      }
+      Index r = IsLower ? pi : cols - pi - actualPanelWidth;
+      if (r>0)
+      {
+        Index s = IsLower ? 0 : pi + actualPanelWidth;
+        general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
+            actualPanelWidth, r,
+            &lhs.coeffRef(pi,s), lhsStride,
+            &rhs.coeffRef(s), rhsIncr,
+            &res.coeffRef(pi), resIncr, alpha);
+      }
+    }
+    if(IsLower && rows>diagSize)
+    {
+      general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs>::run(
+            rows-diagSize, cols,
+            &lhs.coeffRef(diagSize,0), lhsStride,
+            &rhs.coeffRef(0), rhsIncr,
+            &res.coeffRef(diagSize), resIncr, alpha);
+    }
+  }
+
+/***************************************************************************
+* Wrapper to product_triangular_vector
+***************************************************************************/
+
+template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true> >
+ : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true>, Lhs, Rhs> >
+{};
+
+template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false> >
+ : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false>, Lhs, Rhs> >
+{};
+
+
+template<int StorageOrder>
+struct trmv_selector;
+
+} // end namespace internal
+
+template<int Mode, typename Lhs, typename Rhs>
+struct TriangularProduct<Mode,true,Lhs,false,Rhs,true>
+  : public ProductBase<TriangularProduct<Mode,true,Lhs,false,Rhs,true>, Lhs, Rhs >
+{
+  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
+
+  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  {
+    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+  
+    internal::trmv_selector<(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dst, alpha);
+  }
+};
+
+template<int Mode, typename Lhs, typename Rhs>
+struct TriangularProduct<Mode,false,Lhs,true,Rhs,false>
+  : public ProductBase<TriangularProduct<Mode,false,Lhs,true,Rhs,false>, Lhs, Rhs >
+{
+  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
+
+  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  {
+    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+
+    typedef TriangularProduct<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),true,Transpose<const Rhs>,false,Transpose<const Lhs>,true> TriangularProductTranspose;
+    Transpose<Dest> dstT(dst);
+    internal::trmv_selector<(int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>::run(
+      TriangularProductTranspose(m_rhs.transpose(),m_lhs.transpose()), dstT, alpha);
+  }
+};
+
+namespace internal {
+
+// TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same.
+  
+template<> struct trmv_selector<ColMajor>
+{
+  template<int Mode, typename Lhs, typename Rhs, typename Dest>
+  static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
+  {
+    typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
+    typedef typename ProductType::Index Index;
+    typedef typename ProductType::LhsScalar   LhsScalar;
+    typedef typename ProductType::RhsScalar   RhsScalar;
+    typedef typename ProductType::Scalar      ResScalar;
+    typedef typename ProductType::RealScalar  RealScalar;
+    typedef typename ProductType::ActualLhsType ActualLhsType;
+    typedef typename ProductType::ActualRhsType ActualRhsType;
+    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
+    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+
+    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+    typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
+                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+
+    enum {
+      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
+      // on, the other hand it is good for the cache to pack the vector anyways...
+      EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
+      ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
+      MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal
+    };
+
+    gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
+
+    bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+    bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+    
+    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
+
+    ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+                                                  evalToDest ? dest.data() : static_dest.data());
+
+    if(!evalToDest)
+    {
+      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      Index size = dest.size();
+      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      #endif
+      if(!alphaIsCompatible)
+      {
+        MappedDest(actualDestPtr, dest.size()).setZero();
+        compatibleAlpha = RhsScalar(1);
+      }
+      else
+        MappedDest(actualDestPtr, dest.size()) = dest;
+    }
+    
+    internal::triangular_matrix_vector_product
+      <Index,Mode,
+       LhsScalar, LhsBlasTraits::NeedToConjugate,
+       RhsScalar, RhsBlasTraits::NeedToConjugate,
+       ColMajor>
+      ::run(actualLhs.rows(),actualLhs.cols(),
+            actualLhs.data(),actualLhs.outerStride(),
+            actualRhs.data(),actualRhs.innerStride(),
+            actualDestPtr,1,compatibleAlpha);
+
+    if (!evalToDest)
+    {
+      if(!alphaIsCompatible)
+        dest += actualAlpha * MappedDest(actualDestPtr, dest.size());
+      else
+        dest = MappedDest(actualDestPtr, dest.size());
+    }
+  }
+};
+
+template<> struct trmv_selector<RowMajor>
+{
+  template<int Mode, typename Lhs, typename Rhs, typename Dest>
+  static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
+  {
+    typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
+    typedef typename ProductType::LhsScalar LhsScalar;
+    typedef typename ProductType::RhsScalar RhsScalar;
+    typedef typename ProductType::Scalar    ResScalar;
+    typedef typename ProductType::Index Index;
+    typedef typename ProductType::ActualLhsType ActualLhsType;
+    typedef typename ProductType::ActualRhsType ActualRhsType;
+    typedef typename ProductType::_ActualRhsType _ActualRhsType;
+    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
+    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+
+    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
+                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+
+    enum {
+      DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
+    };
+
+    gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+
+    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
+        DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
+
+    if(!DirectlyUseRhs)
+    {
+      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      int size = actualRhs.size();
+      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      #endif
+      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+    }
+    
+    internal::triangular_matrix_vector_product
+      <Index,Mode,
+       LhsScalar, LhsBlasTraits::NeedToConjugate,
+       RhsScalar, RhsBlasTraits::NeedToConjugate,
+       RowMajor>
+      ::run(actualLhs.rows(),actualLhs.cols(),
+            actualLhs.data(),actualLhs.outerStride(),
+            actualRhsPtr,1,
+            dest.data(),dest.innerStride(),
+            actualAlpha);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULARMATRIXVECTOR_H
diff --git a/include/eigen3/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/include/eigen3/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
new file mode 100644
index 0000000..09f110d
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
@@ -0,0 +1,247 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *   Triangular matrix-vector product functionality based on ?TRMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
+#define EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**********************************************************************
+* This file implements triangular matrix-vector multiplication using BLAS
+**********************************************************************/
+
+// trmv/hemv specialization
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder>
+struct triangular_matrix_vector_product_trmv :
+  triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,StorageOrder,BuiltIn> {};
+
+#define EIGEN_MKL_TRMV_SPECIALIZE(Scalar) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \
+ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
+                                     const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
+      triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor>::run( \
+        _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+  } \
+}; \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor,Specialized> { \
+ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
+                                     const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
+      triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor>::run( \
+        _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+  } \
+};
+
+EIGEN_MKL_TRMV_SPECIALIZE(double)
+EIGEN_MKL_TRMV_SPECIALIZE(float)
+EIGEN_MKL_TRMV_SPECIALIZE(dcomplex)
+EIGEN_MKL_TRMV_SPECIALIZE(scomplex)
+
+// implements col-major: res += alpha * op(triangular) * vector
+#define EIGEN_MKL_TRMV_CM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    LowUp = IsLower ? Lower : Upper \
+  }; \
+ static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
+                 const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
+ { \
+   if (ConjLhs || IsZeroDiag) { \
+     triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor,BuiltIn>::run( \
+       _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+     return; \
+   }\
+   Index size = (std::min)(_rows,_cols); \
+   Index rows = IsLower ? _rows : size; \
+   Index cols = IsLower ? size : _cols; \
+\
+   typedef VectorX##EIGPREFIX VectorRhs; \
+   EIGTYPE *x, *y;\
+\
+/* Set x*/ \
+   Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+   VectorRhs x_tmp; \
+   if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+   x = x_tmp.data(); \
+\
+/* Square part handling */\
+\
+   char trans, uplo, diag; \
+   MKL_INT m, n, lda, incx, incy; \
+   EIGTYPE const *a; \
+   MKLTYPE alpha_, beta_; \
+   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+\
+/* Set m, n */ \
+   n = (MKL_INT)size; \
+   lda = lhsStride; \
+   incx = 1; \
+   incy = resIncr; \
+\
+/* Set uplo, trans and diag*/ \
+   trans = 'N'; \
+   uplo = IsLower ? 'L' : 'U'; \
+   diag = IsUnitDiag ? 'U' : 'N'; \
+\
+/* call ?TRMV*/ \
+   MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \
+\
+/* Add op(a_tr)rhs into res*/ \
+   MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
+   if (size<(std::max)(rows,cols)) { \
+     typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
+     if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+     x = x_tmp.data(); \
+     if (size<rows) { \
+       y = _res + size*resIncr; \
+       a = _lhs + size; \
+       m = rows-size; \
+       n = size; \
+     } \
+     else { \
+       x += size; \
+       y = _res; \
+       a = _lhs + size*lda; \
+       m = size; \
+       n = cols-size; \
+     } \
+     MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \
+   } \
+  } \
+};
+
+EIGEN_MKL_TRMV_CM(double, double, d, d)
+EIGEN_MKL_TRMV_CM(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_TRMV_CM(float, float, f, s)
+EIGEN_MKL_TRMV_CM(scomplex, MKL_Complex8, cf, c)
+
+// implements row-major: res += alpha * op(triangular) * vector
+#define EIGEN_MKL_TRMV_RM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    LowUp = IsLower ? Lower : Upper \
+  }; \
+ static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
+                 const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
+ { \
+   if (IsZeroDiag) { \
+     triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor,BuiltIn>::run( \
+       _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+     return; \
+   }\
+   Index size = (std::min)(_rows,_cols); \
+   Index rows = IsLower ? _rows : size; \
+   Index cols = IsLower ? size : _cols; \
+\
+   typedef VectorX##EIGPREFIX VectorRhs; \
+   EIGTYPE *x, *y;\
+\
+/* Set x*/ \
+   Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+   VectorRhs x_tmp; \
+   if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+   x = x_tmp.data(); \
+\
+/* Square part handling */\
+\
+   char trans, uplo, diag; \
+   MKL_INT m, n, lda, incx, incy; \
+   EIGTYPE const *a; \
+   MKLTYPE alpha_, beta_; \
+   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+\
+/* Set m, n */ \
+   n = (MKL_INT)size; \
+   lda = lhsStride; \
+   incx = 1; \
+   incy = resIncr; \
+\
+/* Set uplo, trans and diag*/ \
+   trans = ConjLhs ? 'C' : 'T'; \
+   uplo = IsLower ? 'U' : 'L'; \
+   diag = IsUnitDiag ? 'U' : 'N'; \
+\
+/* call ?TRMV*/ \
+   MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \
+\
+/* Add op(a_tr)rhs into res*/ \
+   MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
+   if (size<(std::max)(rows,cols)) { \
+     typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
+     if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+     x = x_tmp.data(); \
+     if (size<rows) { \
+       y = _res + size*resIncr; \
+       a = _lhs + size*lda; \
+       m = rows-size; \
+       n = size; \
+     } \
+     else { \
+       x += size; \
+       y = _res; \
+       a = _lhs + size; \
+       m = size; \
+       n = cols-size; \
+     } \
+     MKLPREFIX##gemv(&trans, &n, &m, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \
+   } \
+  } \
+};
+
+EIGEN_MKL_TRMV_RM(double, double, d, d)
+EIGEN_MKL_TRMV_RM(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_TRMV_RM(float, float, f, s)
+EIGEN_MKL_TRMV_RM(scomplex, MKL_Complex8, cf, c)
+
+} // end namespase internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
diff --git a/include/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h b/include/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h
new file mode 100644
index 0000000..f103eae
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -0,0 +1,329 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+
+namespace Eigen { 
+
+namespace internal {
+
+// if the rhs is row major, let's transpose the product
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder>
+struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor>
+{
+  static void run(
+    Index size, Index cols,
+    const Scalar*  tri, Index triStride,
+    Scalar* _other, Index otherStride,
+    level3_blocking<Scalar,Scalar>& blocking)
+  {
+    triangular_solve_matrix<
+      Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft,
+      (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper),
+      NumTraits<Scalar>::IsComplex && Conjugate,
+      TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor>
+      ::run(size, cols, tri, triStride, _other, otherStride, blocking);
+  }
+};
+
+/* Optimized triangular solver with multiple right hand side and the triangular matrix on the left
+ */
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
+struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>
+{
+  static EIGEN_DONT_INLINE void run(
+    Index size, Index otherSize,
+    const Scalar* _tri, Index triStride,
+    Scalar* _other, Index otherStride,
+    level3_blocking<Scalar,Scalar>& blocking);
+};
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
+    Index size, Index otherSize,
+    const Scalar* _tri, Index triStride,
+    Scalar* _other, Index otherStride,
+    level3_blocking<Scalar,Scalar>& blocking)
+  {
+    Index cols = otherSize;
+    const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
+    blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
+
+    typedef gebp_traits<Scalar,Scalar> Traits;
+    enum {
+      SmallPanelWidth   = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+      IsLower = (Mode&Lower) == Lower
+    };
+
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(size,blocking.mc());  // cache block size along the M direction
+
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*cols;
+    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
+
+    conj_if<Conjugate> conj;
+    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
+
+    // the goal here is to subdivise the Rhs panels such that we keep some cache
+    // coherence when accessing the rhs elements
+    std::ptrdiff_t l1, l2;
+    manage_caching_sizes(GetAction, &l1, &l2);
+    Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
+    subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
+
+    for(Index k2=IsLower ? 0 : size;
+        IsLower ? k2<size : k2>0;
+        IsLower ? k2+=kc : k2-=kc)
+    {
+      const Index actual_kc = (std::min)(IsLower ? size-k2 : k2, kc);
+
+      // We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel,
+      // and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into
+      // A11 (the triangular part) and A21 the remaining rectangular part.
+      // Then the high level algorithm is:
+      //  - B = R1                    => general block copy (done during the next step)
+      //  - R1 = A11^-1 B             => tricky part
+      //  - update B from the new R1  => actually this has to be performed continuously during the above step
+      //  - R2 -= A21 * B             => GEPP
+
+      // The tricky part: compute R1 = A11^-1 B while updating B from R1
+      // The idea is to split A11 into multiple small vertical panels.
+      // Each panel can be split into a small triangular part T1k which is processed without optimization,
+      // and the remaining small part T2k which is processed using gebp with appropriate block strides
+      for(Index j2=0; j2<cols; j2+=subcols)
+      {
+        Index actual_cols = (std::min)(cols-j2,subcols);
+        // for each small vertical panels [T1k^T, T2k^T]^T of lhs
+        for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
+        {
+          Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
+          // tr solve
+          for (Index k=0; k<actualPanelWidth; ++k)
+          {
+            // TODO write a small kernel handling this (can be shared with trsv)
+            Index i  = IsLower ? k2+k1+k : k2-k1-k-1;
+            Index s  = IsLower ? k2+k1 : i+1;
+            Index rs = actualPanelWidth - k - 1; // remaining size
+
+            Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
+            for (Index j=j2; j<j2+actual_cols; ++j)
+            {
+              if (TriStorageOrder==RowMajor)
+              {
+                Scalar b(0);
+                const Scalar* l = &tri(i,s);
+                Scalar* r = &other(s,j);
+                for (Index i3=0; i3<k; ++i3)
+                  b += conj(l[i3]) * r[i3];
+
+                other(i,j) = (other(i,j) - b)*a;
+              }
+              else
+              {
+                Index s = IsLower ? i+1 : i-rs;
+                Scalar b = (other(i,j) *= a);
+                Scalar* r = &other(s,j);
+                const Scalar* l = &tri(s,i);
+                for (Index i3=0;i3<rs;++i3)
+                  r[i3] -= b * conj(l[i3]);
+              }
+            }
+          }
+
+          Index lengthTarget = actual_kc-k1-actualPanelWidth;
+          Index startBlock   = IsLower ? k2+k1 : k2-k1-actualPanelWidth;
+          Index blockBOffset = IsLower ? k1 : lengthTarget;
+
+          // update the respective rows of B from other
+          pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset);
+
+          // GEBP
+          if (lengthTarget>0)
+          {
+            Index startTarget  = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
+
+            pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
+
+            gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
+                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+          }
+        }
+      }
+      
+      // R2 -= A21 * B => GEPP
+      {
+        Index start = IsLower ? k2+kc : 0;
+        Index end   = IsLower ? size : k2-kc;
+        for(Index i2=start; i2<end; i2+=mc)
+        {
+          const Index actual_mc = (std::min)(mc,end-i2);
+          if (actual_mc>0)
+          {
+            pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
+
+            gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0, blockW);
+          }
+        }
+      }
+    }
+  }
+
+/* Optimized triangular solver with multiple left hand sides and the trinagular matrix on the right
+ */
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
+struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
+{
+  static EIGEN_DONT_INLINE void run(
+    Index size, Index otherSize,
+    const Scalar* _tri, Index triStride,
+    Scalar* _other, Index otherStride,
+    level3_blocking<Scalar,Scalar>& blocking);
+};
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
+    Index size, Index otherSize,
+    const Scalar* _tri, Index triStride,
+    Scalar* _other, Index otherStride,
+    level3_blocking<Scalar,Scalar>& blocking)
+  {
+    Index rows = otherSize;
+    const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
+    blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
+
+    typedef gebp_traits<Scalar,Scalar> Traits;
+    enum {
+      RhsStorageOrder   = TriStorageOrder,
+      SmallPanelWidth   = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+      IsLower = (Mode&Lower) == Lower
+    };
+
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*size;
+    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
+
+    conj_if<Conjugate> conj;
+    gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
+    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
+    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
+
+    for(Index k2=IsLower ? size : 0;
+        IsLower ? k2>0 : k2<size;
+        IsLower ? k2-=kc : k2+=kc)
+    {
+      const Index actual_kc = (std::min)(IsLower ? k2 : size-k2, kc);
+      Index actual_k2 = IsLower ? k2-actual_kc : k2 ;
+
+      Index startPanel = IsLower ? 0 : k2+actual_kc;
+      Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
+      Scalar* geb = blockB+actual_kc*actual_kc;
+
+      if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs);
+
+      // triangular packing (we only pack the panels off the diagonal,
+      // neglecting the blocks overlapping the diagonal
+      {
+        for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+        {
+          Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+          Index actual_j2 = actual_k2 + j2;
+          Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+          Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
+
+          if (panelLength>0)
+          pack_rhs_panel(blockB+j2*actual_kc,
+                         &rhs(actual_k2+panelOffset, actual_j2), triStride,
+                         panelLength, actualPanelWidth,
+                         actual_kc, panelOffset);
+        }
+      }
+
+      for(Index i2=0; i2<rows; i2+=mc)
+      {
+        const Index actual_mc = (std::min)(mc,rows-i2);
+
+        // triangular solver kernel
+        {
+          // for each small block of the diagonal (=> vertical panels of rhs)
+          for (Index j2 = IsLower
+                      ? (actual_kc - ((actual_kc%SmallPanelWidth) ? Index(actual_kc%SmallPanelWidth)
+                                                                  : Index(SmallPanelWidth)))
+                      : 0;
+               IsLower ? j2>=0 : j2<actual_kc;
+               IsLower ? j2-=SmallPanelWidth : j2+=SmallPanelWidth)
+          {
+            Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+            Index absolute_j2 = actual_k2 + j2;
+            Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+            Index panelLength = IsLower ? actual_kc - j2 - actualPanelWidth : j2;
+
+            // GEBP
+            if(panelLength>0)
+            {
+              gebp_kernel(&lhs(i2,absolute_j2), otherStride,
+                          blockA, blockB+j2*actual_kc,
+                          actual_mc, panelLength, actualPanelWidth,
+                          Scalar(-1),
+                          actual_kc, actual_kc, // strides
+                          panelOffset, panelOffset, // offsets
+                          blockW);  // workspace
+            }
+
+            // unblocked triangular solve
+            for (Index k=0; k<actualPanelWidth; ++k)
+            {
+              Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k;
+
+              Scalar* r = &lhs(i2,j);
+              for (Index k3=0; k3<k; ++k3)
+              {
+                Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j));
+                Scalar* a = &lhs(i2,IsLower ? j+1+k3 : absolute_j2+k3);
+                for (Index i=0; i<actual_mc; ++i)
+                  r[i] -= a[i] * b;
+              }
+              Scalar b = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(rhs(j,j));
+              for (Index i=0; i<actual_mc; ++i)
+                r[i] *= b;
+            }
+
+            // pack the just computed part of lhs to A
+            pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride,
+                           actualPanelWidth, actual_mc,
+                           actual_kc, j2);
+          }
+        }
+
+        if (rs>0)
+          gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
+                      actual_mc, actual_kc, rs, Scalar(-1),
+                      -1, -1, 0, 0, blockW);
+      }
+    }
+  }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H
diff --git a/include/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h b/include/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
new file mode 100644
index 0000000..6a0bb83
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
@@ -0,0 +1,155 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *   Triangular matrix * matrix product functionality based on ?TRMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+// implements LeftSide op(triangular)^-1 * general
+#define EIGEN_MKL_TRSM_L(EIGTYPE, MKLTYPE, MKLPREFIX) \
+template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
+{ \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \
+  }; \
+  static void run( \
+      Index size, Index otherSize, \
+      const EIGTYPE* _tri, Index triStride, \
+      EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+  { \
+   MKL_INT m = size, n = otherSize, lda, ldb; \
+   char side = 'L', uplo, diag='N', transa; \
+   /* Set alpha_ */ \
+   MKLTYPE alpha; \
+   EIGTYPE myone(1); \
+   assign_scalar_eig2mkl(alpha, myone); \
+   ldb = otherStride;\
+\
+   const EIGTYPE *a; \
+/* Set trans */ \
+   transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \
+/* Set uplo */ \
+   uplo = IsLower ? 'L' : 'U'; \
+   if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \
+   Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \
+   MatrixTri a_tmp; \
+\
+   if (conjA) { \
+     a_tmp = tri.conjugate(); \
+     a = a_tmp.data(); \
+     lda = a_tmp.outerStride(); \
+   } else { \
+     a = _tri; \
+     lda = triStride; \
+   } \
+   if (IsUnitDiag) diag='U'; \
+/* call ?trsm*/ \
+   MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \
+ } \
+};
+
+EIGEN_MKL_TRSM_L(double, double, d)
+EIGEN_MKL_TRSM_L(dcomplex, MKL_Complex16, z)
+EIGEN_MKL_TRSM_L(float, float, s)
+EIGEN_MKL_TRSM_L(scomplex, MKL_Complex8, c)
+
+
+// implements RightSide general * op(triangular)^-1
+#define EIGEN_MKL_TRSM_R(EIGTYPE, MKLTYPE, MKLPREFIX) \
+template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
+{ \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \
+  }; \
+  static void run( \
+      Index size, Index otherSize, \
+      const EIGTYPE* _tri, Index triStride, \
+      EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+  { \
+   MKL_INT m = otherSize, n = size, lda, ldb; \
+   char side = 'R', uplo, diag='N', transa; \
+   /* Set alpha_ */ \
+   MKLTYPE alpha; \
+   EIGTYPE myone(1); \
+   assign_scalar_eig2mkl(alpha, myone); \
+   ldb = otherStride;\
+\
+   const EIGTYPE *a; \
+/* Set trans */ \
+   transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \
+/* Set uplo */ \
+   uplo = IsLower ? 'L' : 'U'; \
+   if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \
+   Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \
+   MatrixTri a_tmp; \
+\
+   if (conjA) { \
+     a_tmp = tri.conjugate(); \
+     a = a_tmp.data(); \
+     lda = a_tmp.outerStride(); \
+   } else { \
+     a = _tri; \
+     lda = triStride; \
+   } \
+   if (IsUnitDiag) diag='U'; \
+/* call ?trsm*/ \
+   MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \
+   /*std::cout << "TRMS_L specialization!\n";*/ \
+ } \
+};
+
+EIGEN_MKL_TRSM_R(double, double, d)
+EIGEN_MKL_TRSM_R(dcomplex, MKL_Complex16, z)
+EIGEN_MKL_TRSM_R(float, float, s)
+EIGEN_MKL_TRSM_R(scomplex, MKL_Complex8, c)
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
diff --git a/include/eigen3/Eigen/src/Core/products/TriangularSolverVector.h b/include/eigen3/Eigen/src/Core/products/TriangularSolverVector.h
new file mode 100644
index 0000000..ce4d100
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/products/TriangularSolverVector.h
@@ -0,0 +1,139 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H
+#define EIGEN_TRIANGULAR_SOLVER_VECTOR_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate, int StorageOrder>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Conjugate, StorageOrder>
+{
+  static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+  {
+    triangular_solve_vector<LhsScalar,RhsScalar,Index,OnTheLeft,
+        ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
+        Conjugate,StorageOrder==RowMajor?ColMajor:RowMajor
+      >::run(size, _lhs, lhsStride, rhs);
+  }
+};
+    
+// forward and backward substitution, row-major, rhs is a vector
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor>
+{
+  enum {
+    IsLower = ((Mode&Lower)==Lower)
+  };
+  static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+  {
+    typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
+    const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+    typename internal::conditional<
+                          Conjugate,
+                          const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
+                          const LhsMap&>
+                        ::type cjLhs(lhs);
+    static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+    for(Index pi=IsLower ? 0 : size;
+        IsLower ? pi<size : pi>0;
+        IsLower ? pi+=PanelWidth : pi-=PanelWidth)
+    {
+      Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth);
+
+      Index r = IsLower ? pi : size - pi; // remaining size
+      if (r > 0)
+      {
+        // let's directly call the low level product function because:
+        // 1 - it is faster to compile
+        // 2 - it is slighlty faster at runtime
+        Index startRow = IsLower ? pi : pi-actualPanelWidth;
+        Index startCol = IsLower ? 0 : pi;
+
+        general_matrix_vector_product<Index,LhsScalar,RowMajor,Conjugate,RhsScalar,false>::run(
+          actualPanelWidth, r,
+          &lhs.coeffRef(startRow,startCol), lhsStride,
+          rhs + startCol, 1,
+          rhs + startRow, 1,
+          RhsScalar(-1));
+      }
+
+      for(Index k=0; k<actualPanelWidth; ++k)
+      {
+        Index i = IsLower ? pi+k : pi-k-1;
+        Index s = IsLower ? pi   : i+1;
+        if (k>0)
+          rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
+        
+        if(!(Mode & UnitDiag))
+          rhs[i] /= cjLhs(i,i);
+      }
+    }
+  }
+};
+
+// forward and backward substitution, column-major, rhs is a vector
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, ColMajor>
+{
+  enum {
+    IsLower = ((Mode&Lower)==Lower)
+  };
+  static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+  {
+    typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
+    const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+    typename internal::conditional<Conjugate,
+                                   const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
+                                   const LhsMap&
+                                  >::type cjLhs(lhs);
+    static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+
+    for(Index pi=IsLower ? 0 : size;
+        IsLower ? pi<size : pi>0;
+        IsLower ? pi+=PanelWidth : pi-=PanelWidth)
+    {
+      Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth);
+      Index startBlock = IsLower ? pi : pi-actualPanelWidth;
+      Index endBlock = IsLower ? pi + actualPanelWidth : 0;
+
+      for(Index k=0; k<actualPanelWidth; ++k)
+      {
+        Index i = IsLower ? pi+k : pi-k-1;
+        if(!(Mode & UnitDiag))
+          rhs[i] /= cjLhs.coeff(i,i);
+
+        Index r = actualPanelWidth - k - 1; // remaining size
+        Index s = IsLower ? i+1 : i-r;
+        if (r>0)
+          Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+      }
+      Index r = IsLower ? size - endBlock : startBlock; // remaining size
+      if (r > 0)
+      {
+        // let's directly call the low level product function because:
+        // 1 - it is faster to compile
+        // 2 - it is slighlty faster at runtime
+        general_matrix_vector_product<Index,LhsScalar,ColMajor,Conjugate,RhsScalar,false>::run(
+            r, actualPanelWidth,
+            &lhs.coeffRef(endBlock,startBlock), lhsStride,
+            rhs+startBlock, 1,
+            rhs+endBlock, 1, RhsScalar(-1));
+      }
+    }
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_VECTOR_H
diff --git a/include/eigen3/Eigen/src/Core/util/BlasUtil.h b/include/eigen3/Eigen/src/Core/util/BlasUtil.h
new file mode 100644
index 0000000..a28f16f
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/util/BlasUtil.h
@@ -0,0 +1,264 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BLASUTIL_H
+#define EIGEN_BLASUTIL_H
+
+// This file contains many lightweight helper classes used to
+// implement and control fast level 2 and level 3 BLAS-like routines.
+
+namespace Eigen {
+
+namespace internal {
+
+// forward declarations
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
+struct gebp_kernel;
+
+template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
+struct gemm_pack_rhs;
+
+template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
+struct gemm_pack_lhs;
+
+template<
+  typename Index,
+  typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+  int ResStorageOrder>
+struct general_matrix_matrix_product;
+
+template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version=Specialized>
+struct general_matrix_vector_product;
+
+
+template<bool Conjugate> struct conj_if;
+
+template<> struct conj_if<true> {
+  template<typename T>
+  inline T operator()(const T& x) { return numext::conj(x); }
+  template<typename T>
+  inline T pconj(const T& x) { return internal::pconj(x); }
+};
+
+template<> struct conj_if<false> {
+  template<typename T>
+  inline const T& operator()(const T& x) { return x; }
+  template<typename T>
+  inline const T& pconj(const T& x) { return x; }
+};
+
+template<typename Scalar> struct conj_helper<Scalar,Scalar,false,false>
+{
+  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); }
+  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); }
+};
+
+template<typename RealScalar> struct conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, false,true>
+{
+  typedef std::complex<RealScalar> Scalar;
+  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const
+  { return c + pmul(x,y); }
+
+  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const
+  { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::imag(x)*numext::real(y) - numext::real(x)*numext::imag(y)); }
+};
+
+template<typename RealScalar> struct conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, true,false>
+{
+  typedef std::complex<RealScalar> Scalar;
+  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const
+  { return c + pmul(x,y); }
+
+  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const
+  { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); }
+};
+
+template<typename RealScalar> struct conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, true,true>
+{
+  typedef std::complex<RealScalar> Scalar;
+  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const
+  { return c + pmul(x,y); }
+
+  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const
+  { return Scalar(numext::real(x)*numext::real(y) - numext::imag(x)*numext::imag(y), - numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); }
+};
+
+template<typename RealScalar,bool Conj> struct conj_helper<std::complex<RealScalar>, RealScalar, Conj,false>
+{
+  typedef std::complex<RealScalar> Scalar;
+  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const
+  { return padd(c, pmul(x,y)); }
+  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const
+  { return conj_if<Conj>()(x)*y; }
+};
+
+template<typename RealScalar,bool Conj> struct conj_helper<RealScalar, std::complex<RealScalar>, false,Conj>
+{
+  typedef std::complex<RealScalar> Scalar;
+  EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const
+  { return padd(c, pmul(x,y)); }
+  EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const
+  { return x*conj_if<Conj>()(y); }
+};
+
+template<typename From,typename To> struct get_factor {
+  static EIGEN_STRONG_INLINE To run(const From& x) { return x; }
+};
+
+template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::Real> {
+  static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) { return numext::real(x); }
+};
+
+// Lightweight helper class to access matrix coefficients.
+// Yes, this is somehow redundant with Map<>, but this version is much much lighter,
+// and so I hope better compilation performance (time and code quality).
+template<typename Scalar, typename Index, int StorageOrder>
+class blas_data_mapper
+{
+  public:
+    blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j)
+    { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
+  protected:
+    Scalar* EIGEN_RESTRICT m_data;
+    Index m_stride;
+};
+
+// lightweight helper class to access matrix coefficients (const version)
+template<typename Scalar, typename Index, int StorageOrder>
+class const_blas_data_mapper
+{
+  public:
+    const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const
+    { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
+  protected:
+    const Scalar* EIGEN_RESTRICT m_data;
+    Index m_stride;
+};
+
+
+/* Helper class to analyze the factors of a Product expression.
+ * In particular it allows to pop out operator-, scalar multiples,
+ * and conjugate */
+template<typename XprType> struct blas_traits
+{
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef const XprType& ExtractType;
+  typedef XprType _ExtractType;
+  enum {
+    IsComplex = NumTraits<Scalar>::IsComplex,
+    IsTransposed = false,
+    NeedToConjugate = false,
+    HasUsableDirectAccess = (    (int(XprType::Flags)&DirectAccessBit)
+                              && (   bool(XprType::IsVectorAtCompileTime)
+                                  || int(inner_stride_at_compile_time<XprType>::ret) == 1)
+                             ) ?  1 : 0
+  };
+  typedef typename conditional<bool(HasUsableDirectAccess),
+    ExtractType,
+    typename _ExtractType::PlainObject
+    >::type DirectLinearAccessType;
+  static inline ExtractType extract(const XprType& x) { return x; }
+  static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
+};
+
+// pop conjugate
+template<typename Scalar, typename NestedXpr>
+struct blas_traits<CwiseUnaryOp<scalar_conjugate_op<Scalar>, NestedXpr> >
+ : blas_traits<NestedXpr>
+{
+  typedef blas_traits<NestedXpr> Base;
+  typedef CwiseUnaryOp<scalar_conjugate_op<Scalar>, NestedXpr> XprType;
+  typedef typename Base::ExtractType ExtractType;
+
+  enum {
+    IsComplex = NumTraits<Scalar>::IsComplex,
+    NeedToConjugate = Base::NeedToConjugate ? 0 : IsComplex
+  };
+  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  static inline Scalar extractScalarFactor(const XprType& x) { return conj(Base::extractScalarFactor(x.nestedExpression())); }
+};
+
+// pop scalar multiple
+template<typename Scalar, typename NestedXpr>
+struct blas_traits<CwiseUnaryOp<scalar_multiple_op<Scalar>, NestedXpr> >
+ : blas_traits<NestedXpr>
+{
+  typedef blas_traits<NestedXpr> Base;
+  typedef CwiseUnaryOp<scalar_multiple_op<Scalar>, NestedXpr> XprType;
+  typedef typename Base::ExtractType ExtractType;
+  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  static inline Scalar extractScalarFactor(const XprType& x)
+  { return x.functor().m_other * Base::extractScalarFactor(x.nestedExpression()); }
+};
+
+// pop opposite
+template<typename Scalar, typename NestedXpr>
+struct blas_traits<CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> >
+ : blas_traits<NestedXpr>
+{
+  typedef blas_traits<NestedXpr> Base;
+  typedef CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> XprType;
+  typedef typename Base::ExtractType ExtractType;
+  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  static inline Scalar extractScalarFactor(const XprType& x)
+  { return - Base::extractScalarFactor(x.nestedExpression()); }
+};
+
+// pop/push transpose
+template<typename NestedXpr>
+struct blas_traits<Transpose<NestedXpr> >
+ : blas_traits<NestedXpr>
+{
+  typedef typename NestedXpr::Scalar Scalar;
+  typedef blas_traits<NestedXpr> Base;
+  typedef Transpose<NestedXpr> XprType;
+  typedef Transpose<const typename Base::_ExtractType>  ExtractType; // const to get rid of a compile error; anyway blas traits are only used on the RHS
+  typedef Transpose<const typename Base::_ExtractType> _ExtractType;
+  typedef typename conditional<bool(Base::HasUsableDirectAccess),
+    ExtractType,
+    typename ExtractType::PlainObject
+    >::type DirectLinearAccessType;
+  enum {
+    IsTransposed = Base::IsTransposed ? 0 : 1
+  };
+  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); }
+};
+
+template<typename T>
+struct blas_traits<const T>
+     : blas_traits<T>
+{};
+
+template<typename T, bool HasUsableDirectAccess=blas_traits<T>::HasUsableDirectAccess>
+struct extract_data_selector {
+  static const typename T::Scalar* run(const T& m)
+  {
+    return blas_traits<T>::extract(m).data();
+  }
+};
+
+template<typename T>
+struct extract_data_selector<T,false> {
+  static typename T::Scalar* run(const T&) { return 0; }
+};
+
+template<typename T> const typename T::Scalar* extract_data(const T& m)
+{
+  return extract_data_selector<T>::run(m);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BLASUTIL_H
diff --git a/include/eigen3/Eigen/src/Core/util/Constants.h b/include/eigen3/Eigen/src/Core/util/Constants.h
new file mode 100644
index 0000000..14b9624
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/util/Constants.h
@@ -0,0 +1,438 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONSTANTS_H
+#define EIGEN_CONSTANTS_H
+
+namespace Eigen {
+
+/** This value means that a positive quantity (e.g., a size) is not known at compile-time, and that instead the value is
+  * stored in some runtime variable.
+  *
+  * Changing the value of Dynamic breaks the ABI, as Dynamic is often used as a template parameter for Matrix.
+  */
+const int Dynamic = -1;
+
+/** This value means that a signed quantity (e.g., a signed index) is not known at compile-time, and that instead its value
+  * has to be specified at runtime.
+  */
+const int DynamicIndex = 0xffffff;
+
+/** This value means +Infinity; it is currently used only as the p parameter to MatrixBase::lpNorm<int>().
+  * The value Infinity there means the L-infinity norm.
+  */
+const int Infinity = -1;
+
+/** \defgroup flags Flags
+  * \ingroup Core_Module
+  *
+  * These are the possible bits which can be OR'ed to constitute the flags of a matrix or
+  * expression.
+  *
+  * It is important to note that these flags are a purely compile-time notion. They are a compile-time property of
+  * an expression type, implemented as enum's. They are not stored in memory at runtime, and they do not incur any
+  * runtime overhead.
+  *
+  * \sa MatrixBase::Flags
+  */
+
+/** \ingroup flags
+  *
+  * for a matrix, this means that the storage order is row-major.
+  * If this bit is not set, the storage order is column-major.
+  * For an expression, this determines the storage order of
+  * the matrix created by evaluation of that expression. 
+  * \sa \ref TopicStorageOrders */
+const unsigned int RowMajorBit = 0x1;
+
+/** \ingroup flags
+  *
+  * means the expression should be evaluated by the calling expression */
+const unsigned int EvalBeforeNestingBit = 0x2;
+
+/** \ingroup flags
+  *
+  * means the expression should be evaluated before any assignment */
+const unsigned int EvalBeforeAssigningBit = 0x4;
+
+/** \ingroup flags
+  *
+  * Short version: means the expression might be vectorized
+  *
+  * Long version: means that the coefficients can be handled by packets
+  * and start at a memory location whose alignment meets the requirements
+  * of the present CPU architecture for optimized packet access. In the fixed-size
+  * case, there is the additional condition that it be possible to access all the
+  * coefficients by packets (this implies the requirement that the size be a multiple of 16 bytes,
+  * and that any nontrivial strides don't break the alignment). In the dynamic-size case,
+  * there is no such condition on the total size and strides, so it might not be possible to access
+  * all coeffs by packets.
+  *
+  * \note This bit can be set regardless of whether vectorization is actually enabled.
+  *       To check for actual vectorizability, see \a ActualPacketAccessBit.
+  */
+const unsigned int PacketAccessBit = 0x8;
+
+#ifdef EIGEN_VECTORIZE
+/** \ingroup flags
+  *
+  * If vectorization is enabled (EIGEN_VECTORIZE is defined) this constant
+  * is set to the value \a PacketAccessBit.
+  *
+  * If vectorization is not enabled (EIGEN_VECTORIZE is not defined) this constant
+  * is set to the value 0.
+  */
+const unsigned int ActualPacketAccessBit = PacketAccessBit;
+#else
+const unsigned int ActualPacketAccessBit = 0x0;
+#endif
+
+/** \ingroup flags
+  *
+  * Short version: means the expression can be seen as 1D vector.
+  *
+  * Long version: means that one can access the coefficients
+  * of this expression by coeff(int), and coeffRef(int) in the case of a lvalue expression. These
+  * index-based access methods are guaranteed
+  * to not have to do any runtime computation of a (row, col)-pair from the index, so that it
+  * is guaranteed that whenever it is available, index-based access is at least as fast as
+  * (row,col)-based access. Expressions for which that isn't possible don't have the LinearAccessBit.
+  *
+  * If both PacketAccessBit and LinearAccessBit are set, then the
+  * packets of this expression can be accessed by packet(int), and writePacket(int) in the case of a
+  * lvalue expression.
+  *
+  * Typically, all vector expressions have the LinearAccessBit, but there is one exception:
+  * Product expressions don't have it, because it would be troublesome for vectorization, even when the
+  * Product is a vector expression. Thus, vector Product expressions allow index-based coefficient access but
+  * not index-based packet access, so they don't have the LinearAccessBit.
+  */
+const unsigned int LinearAccessBit = 0x10;
+
+/** \ingroup flags
+  *
+  * Means the expression has a coeffRef() method, i.e. is writable as its individual coefficients are directly addressable.
+  * This rules out read-only expressions.
+  *
+  * Note that DirectAccessBit and LvalueBit are mutually orthogonal, as there are examples of expression having one but note
+  * the other:
+  *   \li writable expressions that don't have a very simple memory layout as a strided array, have LvalueBit but not DirectAccessBit
+  *   \li Map-to-const expressions, for example Map<const Matrix>, have DirectAccessBit but not LvalueBit
+  *
+  * Expressions having LvalueBit also have their coeff() method returning a const reference instead of returning a new value.
+  */
+const unsigned int LvalueBit = 0x20;
+
+/** \ingroup flags
+  *
+  * Means that the underlying array of coefficients can be directly accessed as a plain strided array. The memory layout
+  * of the array of coefficients must be exactly the natural one suggested by rows(), cols(),
+  * outerStride(), innerStride(), and the RowMajorBit. This rules out expressions such as Diagonal, whose coefficients,
+  * though referencable, do not have such a regular memory layout.
+  *
+  * See the comment on LvalueBit for an explanation of how LvalueBit and DirectAccessBit are mutually orthogonal.
+  */
+const unsigned int DirectAccessBit = 0x40;
+
+/** \ingroup flags
+  *
+  * means the first coefficient packet is guaranteed to be aligned */
+const unsigned int AlignedBit = 0x80;
+
+const unsigned int NestByRefBit = 0x100;
+
+// list of flags that are inherited by default
+const unsigned int HereditaryBits = RowMajorBit
+                                  | EvalBeforeNestingBit
+                                  | EvalBeforeAssigningBit;
+
+/** \defgroup enums Enumerations
+  * \ingroup Core_Module
+  *
+  * Various enumerations used in %Eigen. Many of these are used as template parameters.
+  */
+
+/** \ingroup enums
+  * Enum containing possible values for the \p Mode parameter of 
+  * MatrixBase::selfadjointView() and MatrixBase::triangularView(). */
+enum {
+  /** View matrix as a lower triangular matrix. */
+  Lower=0x1,                      
+  /** View matrix as an upper triangular matrix. */
+  Upper=0x2,                      
+  /** %Matrix has ones on the diagonal; to be used in combination with #Lower or #Upper. */
+  UnitDiag=0x4, 
+  /** %Matrix has zeros on the diagonal; to be used in combination with #Lower or #Upper. */
+  ZeroDiag=0x8,
+  /** View matrix as a lower triangular matrix with ones on the diagonal. */
+  UnitLower=UnitDiag|Lower, 
+  /** View matrix as an upper triangular matrix with ones on the diagonal. */
+  UnitUpper=UnitDiag|Upper,
+  /** View matrix as a lower triangular matrix with zeros on the diagonal. */
+  StrictlyLower=ZeroDiag|Lower, 
+  /** View matrix as an upper triangular matrix with zeros on the diagonal. */
+  StrictlyUpper=ZeroDiag|Upper,
+  /** Used in BandMatrix and SelfAdjointView to indicate that the matrix is self-adjoint. */
+  SelfAdjoint=0x10,
+  /** Used to support symmetric, non-selfadjoint, complex matrices. */
+  Symmetric=0x20
+};
+
+/** \ingroup enums
+  * Enum for indicating whether an object is aligned or not. */
+enum { 
+  /** Object is not correctly aligned for vectorization. */
+  Unaligned=0, 
+  /** Object is aligned for vectorization. */
+  Aligned=1 
+};
+
+/** \ingroup enums
+ * Enum used by DenseBase::corner() in Eigen2 compatibility mode. */
+// FIXME after the corner() API change, this was not needed anymore, except by AlignedBox
+// TODO: find out what to do with that. Adapt the AlignedBox API ?
+enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
+
+/** \ingroup enums
+  * Enum containing possible values for the \p Direction parameter of
+  * Reverse, PartialReduxExpr and VectorwiseOp. */
+enum DirectionType { 
+  /** For Reverse, all columns are reversed; 
+    * for PartialReduxExpr and VectorwiseOp, act on columns. */
+  Vertical, 
+  /** For Reverse, all rows are reversed; 
+    * for PartialReduxExpr and VectorwiseOp, act on rows. */
+  Horizontal, 
+  /** For Reverse, both rows and columns are reversed; 
+    * not used for PartialReduxExpr and VectorwiseOp. */
+  BothDirections 
+};
+
+/** \internal \ingroup enums
+  * Enum to specify how to traverse the entries of a matrix. */
+enum {
+  /** \internal Default traversal, no vectorization, no index-based access */
+  DefaultTraversal,
+  /** \internal No vectorization, use index-based access to have only one for loop instead of 2 nested loops */
+  LinearTraversal,
+  /** \internal Equivalent to a slice vectorization for fixed-size matrices having good alignment
+    * and good size */
+  InnerVectorizedTraversal,
+  /** \internal Vectorization path using a single loop plus scalar loops for the
+    * unaligned boundaries */
+  LinearVectorizedTraversal,
+  /** \internal Generic vectorization path using one vectorized loop per row/column with some
+    * scalar loops to handle the unaligned boundaries */
+  SliceVectorizedTraversal,
+  /** \internal Special case to properly handle incompatible scalar types or other defecting cases*/
+  InvalidTraversal,
+  /** \internal Evaluate all entries at once */
+  AllAtOnceTraversal
+};
+
+/** \internal \ingroup enums
+  * Enum to specify whether to unroll loops when traversing over the entries of a matrix. */
+enum {
+  /** \internal Do not unroll loops. */
+  NoUnrolling,
+  /** \internal Unroll only the inner loop, but not the outer loop. */
+  InnerUnrolling,
+  /** \internal Unroll both the inner and the outer loop. If there is only one loop, 
+    * because linear traversal is used, then unroll that loop. */
+  CompleteUnrolling
+};
+
+/** \internal \ingroup enums
+  * Enum to specify whether to use the default (built-in) implementation or the specialization. */
+enum {
+  Specialized,
+  BuiltIn
+};
+
+/** \ingroup enums
+  * Enum containing possible values for the \p _Options template parameter of
+  * Matrix, Array and BandMatrix. */
+enum {
+  /** Storage order is column major (see \ref TopicStorageOrders). */
+  ColMajor = 0,
+  /** Storage order is row major (see \ref TopicStorageOrders). */
+  RowMajor = 0x1,  // it is only a coincidence that this is equal to RowMajorBit -- don't rely on that
+  /** Align the matrix itself if it is vectorizable fixed-size */
+  AutoAlign = 0,
+  /** Don't require alignment for the matrix itself (the array of coefficients, if dynamically allocated, may still be requested to be aligned) */ // FIXME --- clarify the situation
+  DontAlign = 0x2
+};
+
+/** \ingroup enums
+  * Enum for specifying whether to apply or solve on the left or right. */
+enum {
+  /** Apply transformation on the left. */
+  OnTheLeft = 1,  
+  /** Apply transformation on the right. */
+  OnTheRight = 2  
+};
+
+/* the following used to be written as:
+ *
+ *   struct NoChange_t {};
+ *   namespace {
+ *     EIGEN_UNUSED NoChange_t NoChange;
+ *   }
+ *
+ * on the ground that it feels dangerous to disambiguate overloaded functions on enum/integer types.  
+ * However, this leads to "variable declared but never referenced" warnings on Intel Composer XE,
+ * and we do not know how to get rid of them (bug 450).
+ */
+
+enum NoChange_t   { NoChange };
+enum Sequential_t { Sequential };
+enum Default_t    { Default };
+
+/** \internal \ingroup enums
+  * Used in AmbiVector. */
+enum {
+  IsDense         = 0,
+  IsSparse
+};
+
+/** \ingroup enums
+  * Used as template parameter in DenseCoeffBase and MapBase to indicate 
+  * which accessors should be provided. */
+enum AccessorLevels {
+  /** Read-only access via a member function. */
+  ReadOnlyAccessors, 
+  /** Read/write access via member functions. */
+  WriteAccessors, 
+  /** Direct read-only access to the coefficients. */
+  DirectAccessors, 
+  /** Direct read/write access to the coefficients. */
+  DirectWriteAccessors
+};
+
+/** \ingroup enums
+  * Enum with options to give to various decompositions. */
+enum DecompositionOptions {
+  /** \internal Not used (meant for LDLT?). */
+  Pivoting            = 0x01, 
+  /** \internal Not used (meant for LDLT?). */
+  NoPivoting          = 0x02, 
+  /** Used in JacobiSVD to indicate that the square matrix U is to be computed. */
+  ComputeFullU        = 0x04,
+  /** Used in JacobiSVD to indicate that the thin matrix U is to be computed. */
+  ComputeThinU        = 0x08,
+  /** Used in JacobiSVD to indicate that the square matrix V is to be computed. */
+  ComputeFullV        = 0x10,
+  /** Used in JacobiSVD to indicate that the thin matrix V is to be computed. */
+  ComputeThinV        = 0x20,
+  /** Used in SelfAdjointEigenSolver and GeneralizedSelfAdjointEigenSolver to specify
+    * that only the eigenvalues are to be computed and not the eigenvectors. */
+  EigenvaluesOnly     = 0x40,
+  /** Used in SelfAdjointEigenSolver and GeneralizedSelfAdjointEigenSolver to specify
+    * that both the eigenvalues and the eigenvectors are to be computed. */
+  ComputeEigenvectors = 0x80,
+  /** \internal */
+  EigVecMask = EigenvaluesOnly | ComputeEigenvectors,
+  /** Used in GeneralizedSelfAdjointEigenSolver to indicate that it should
+    * solve the generalized eigenproblem \f$ Ax = \lambda B x \f$. */
+  Ax_lBx              = 0x100,
+  /** Used in GeneralizedSelfAdjointEigenSolver to indicate that it should
+    * solve the generalized eigenproblem \f$ ABx = \lambda x \f$. */
+  ABx_lx              = 0x200,
+  /** Used in GeneralizedSelfAdjointEigenSolver to indicate that it should
+    * solve the generalized eigenproblem \f$ BAx = \lambda x \f$. */
+  BAx_lx              = 0x400,
+  /** \internal */
+  GenEigMask = Ax_lBx | ABx_lx | BAx_lx
+};
+
+/** \ingroup enums
+  * Possible values for the \p QRPreconditioner template parameter of JacobiSVD. */
+enum QRPreconditioners {
+  /** Do not specify what is to be done if the SVD of a non-square matrix is asked for. */
+  NoQRPreconditioner,
+  /** Use a QR decomposition without pivoting as the first step. */
+  HouseholderQRPreconditioner,
+  /** Use a QR decomposition with column pivoting as the first step. */
+  ColPivHouseholderQRPreconditioner,
+  /** Use a QR decomposition with full pivoting as the first step. */
+  FullPivHouseholderQRPreconditioner
+};
+
+#ifdef Success
+#error The preprocessor symbol 'Success' is defined, possibly by the X11 header file X.h
+#endif
+
+/** \ingroup enums
+  * Enum for reporting the status of a computation. */
+enum ComputationInfo {
+  /** Computation was successful. */
+  Success = 0,        
+  /** The provided data did not satisfy the prerequisites. */
+  NumericalIssue = 1, 
+  /** Iterative procedure did not converge. */
+  NoConvergence = 2,
+  /** The inputs are invalid, or the algorithm has been improperly called.
+    * When assertions are enabled, such errors trigger an assert. */
+  InvalidInput = 3
+};
+
+/** \ingroup enums
+  * Enum used to specify how a particular transformation is stored in a matrix.
+  * \sa Transform, Hyperplane::transform(). */
+enum TransformTraits {
+  /** Transformation is an isometry. */
+  Isometry      = 0x1,
+  /** Transformation is an affine transformation stored as a (Dim+1)^2 matrix whose last row is 
+    * assumed to be [0 ... 0 1]. */
+  Affine        = 0x2,
+  /** Transformation is an affine transformation stored as a (Dim) x (Dim+1) matrix. */
+  AffineCompact = 0x10 | Affine,
+  /** Transformation is a general projective transformation stored as a (Dim+1)^2 matrix. */
+  Projective    = 0x20
+};
+
+/** \internal \ingroup enums
+  * Enum used to choose between implementation depending on the computer architecture. */
+namespace Architecture
+{
+  enum Type {
+    Generic = 0x0,
+    SSE = 0x1,
+    AltiVec = 0x2,
+#if defined EIGEN_VECTORIZE_SSE
+    Target = SSE
+#elif defined EIGEN_VECTORIZE_ALTIVEC
+    Target = AltiVec
+#else
+    Target = Generic
+#endif
+  };
+}
+
+/** \internal \ingroup enums
+  * Enum used as template parameter in GeneralProduct. */
+enum { CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
+
+/** \internal \ingroup enums
+  * Enum used in experimental parallel implementation. */
+enum Action {GetAction, SetAction};
+
+/** The type used to identify a dense storage. */
+struct Dense {};
+
+/** The type used to identify a matrix expression */
+struct MatrixXpr {};
+
+/** The type used to identify an array expression */
+struct ArrayXpr {};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CONSTANTS_H
diff --git a/include/eigen3/Eigen/src/Core/util/DisableStupidWarnings.h b/include/eigen3/Eigen/src/Core/util/DisableStupidWarnings.h
new file mode 100644
index 0000000..6a0bf06
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -0,0 +1,40 @@
+#ifndef EIGEN_WARNINGS_DISABLED
+#define EIGEN_WARNINGS_DISABLED
+
+#ifdef _MSC_VER
+  // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
+  // 4101 - unreferenced local variable
+  // 4127 - conditional expression is constant
+  // 4181 - qualifier applied to reference type ignored
+  // 4211 - nonstandard extension used : redefined extern to static
+  // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
+  // 4273 - QtAlignedMalloc, inconsistent DLL linkage
+  // 4324 - structure was padded due to declspec(align())
+  // 4512 - assignment operator could not be generated
+  // 4522 - 'class' : multiple assignment operators specified
+  // 4700 - uninitialized local variable 'xyz' used
+  // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
+  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+    #pragma warning( push )
+  #endif
+  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4512 4522 4700 4717 )
+#elif defined __INTEL_COMPILER
+  // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
+  //        ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. inside of class body
+  //        typedef that may be a reference type.
+  // 279  - controlling expression is constant
+  //        ICC 12 generates this warning on assert(constant_expression_depending_on_template_params) and frankly this is a legitimate use case.
+  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+    #pragma warning push
+  #endif
+  #pragma warning disable 2196 279
+#elif defined __clang__
+  // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
+  //     this is really a stupid warning as it warns on compile-time expressions involving enums
+  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+    #pragma clang diagnostic push
+  #endif
+  #pragma clang diagnostic ignored "-Wconstant-logical-operand"
+#endif
+
+#endif // not EIGEN_WARNINGS_DISABLED
diff --git a/include/eigen3/Eigen/src/Core/util/ForwardDeclarations.h b/include/eigen3/Eigen/src/Core/util/ForwardDeclarations.h
new file mode 100644
index 0000000..d6a8145
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/util/ForwardDeclarations.h
@@ -0,0 +1,299 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007-2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FORWARDDECLARATIONS_H
+#define EIGEN_FORWARDDECLARATIONS_H
+
+namespace Eigen {
+namespace internal {
+
+template<typename T> struct traits;
+
+// here we say once and for all that traits<const T> == traits<T>
+// When constness must affect traits, it has to be constness on template parameters on which T itself depends.
+// For example, traits<Map<const T> > != traits<Map<T> >, but
+//              traits<const Map<T> > == traits<Map<T> >
+template<typename T> struct traits<const T> : traits<T> {};
+
+template<typename Derived> struct has_direct_access
+{
+  enum { ret = (traits<Derived>::Flags & DirectAccessBit) ? 1 : 0 };
+};
+
+template<typename Derived> struct accessors_level
+{
+  enum { has_direct_access = (traits<Derived>::Flags & DirectAccessBit) ? 1 : 0,
+         has_write_access = (traits<Derived>::Flags & LvalueBit) ? 1 : 0,
+         value = has_direct_access ? (has_write_access ? DirectWriteAccessors : DirectAccessors)
+                                   : (has_write_access ? WriteAccessors       : ReadOnlyAccessors)
+  };
+};
+
+} // end namespace internal
+
+template<typename T> struct NumTraits;
+
+template<typename Derived> struct EigenBase;
+template<typename Derived> class DenseBase;
+template<typename Derived> class PlainObjectBase;
+
+
+template<typename Derived,
+         int Level = internal::accessors_level<Derived>::value >
+class DenseCoeffsBase;
+
+template<typename _Scalar, int _Rows, int _Cols,
+         int _Options = AutoAlign |
+#if defined(__GNUC__) && __GNUC__==3 && __GNUC_MINOR__==4
+    // workaround a bug in at least gcc 3.4.6
+    // the innermost ?: ternary operator is misparsed. We write it slightly
+    // differently and this makes gcc 3.4.6 happy, but it's ugly.
+    // The error would only show up with EIGEN_DEFAULT_TO_ROW_MAJOR is defined
+    // (when EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION is RowMajor)
+                          ( (_Rows==1 && _Cols!=1) ? RowMajor
+                          : !(_Cols==1 && _Rows!=1) ?  EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION
+                          : ColMajor ),
+#else
+                          ( (_Rows==1 && _Cols!=1) ? RowMajor
+                          : (_Cols==1 && _Rows!=1) ? ColMajor
+                          : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ),
+#endif
+         int _MaxRows = _Rows,
+         int _MaxCols = _Cols
+> class Matrix;
+
+template<typename Derived> class MatrixBase;
+template<typename Derived> class ArrayBase;
+
+template<typename ExpressionType, unsigned int Added, unsigned int Removed> class Flagged;
+template<typename ExpressionType, template <typename> class StorageBase > class NoAlias;
+template<typename ExpressionType> class NestByValue;
+template<typename ExpressionType> class ForceAlignedAccess;
+template<typename ExpressionType> class SwapWrapper;
+
+template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool InnerPanel = false> class Block;
+
+template<typename MatrixType, int Size=Dynamic> class VectorBlock;
+template<typename MatrixType> class Transpose;
+template<typename MatrixType> class Conjugate;
+template<typename NullaryOp, typename MatrixType>         class CwiseNullaryOp;
+template<typename UnaryOp,   typename MatrixType>         class CwiseUnaryOp;
+template<typename ViewOp,    typename MatrixType>         class CwiseUnaryView;
+template<typename BinaryOp,  typename Lhs, typename Rhs>  class CwiseBinaryOp;
+template<typename BinOp,     typename Lhs, typename Rhs>  class SelfCwiseBinaryOp;
+template<typename Derived,   typename Lhs, typename Rhs>  class ProductBase;
+template<typename Lhs, typename Rhs, int Mode>            class GeneralProduct;
+template<typename Lhs, typename Rhs, int NestingFlags>    class CoeffBasedProduct;
+
+template<typename Derived> class DiagonalBase;
+template<typename _DiagonalVectorType> class DiagonalWrapper;
+template<typename _Scalar, int SizeAtCompileTime, int MaxSizeAtCompileTime=SizeAtCompileTime> class DiagonalMatrix;
+template<typename MatrixType, typename DiagonalType, int ProductOrder> class DiagonalProduct;
+template<typename MatrixType, int Index = 0> class Diagonal;
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime = SizeAtCompileTime, typename IndexType=int> class PermutationMatrix;
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime = SizeAtCompileTime, typename IndexType=int> class Transpositions;
+template<typename Derived> class PermutationBase;
+template<typename Derived> class TranspositionsBase;
+template<typename _IndicesType> class PermutationWrapper;
+template<typename _IndicesType> class TranspositionsWrapper;
+
+template<typename Derived,
+         int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors
+> class MapBase;
+template<int InnerStrideAtCompileTime, int OuterStrideAtCompileTime> class Stride;
+template<typename MatrixType, int MapOptions=Unaligned, typename StrideType = Stride<0,0> > class Map;
+
+template<typename Derived> class TriangularBase;
+template<typename MatrixType, unsigned int Mode> class TriangularView;
+template<typename MatrixType, unsigned int Mode> class SelfAdjointView;
+template<typename MatrixType> class SparseView;
+template<typename ExpressionType> class WithFormat;
+template<typename MatrixType> struct CommaInitializer;
+template<typename Derived> class ReturnByValue;
+template<typename ExpressionType> class ArrayWrapper;
+template<typename ExpressionType> class MatrixWrapper;
+
+namespace internal {
+template<typename DecompositionType, typename Rhs> struct solve_retval_base;
+template<typename DecompositionType, typename Rhs> struct solve_retval;
+template<typename DecompositionType> struct kernel_retval_base;
+template<typename DecompositionType> struct kernel_retval;
+template<typename DecompositionType> struct image_retval_base;
+template<typename DecompositionType> struct image_retval;
+} // end namespace internal
+
+namespace internal {
+template<typename _Scalar, int Rows=Dynamic, int Cols=Dynamic, int Supers=Dynamic, int Subs=Dynamic, int Options=0> class BandMatrix;
+}
+
+namespace internal {
+template<typename Lhs, typename Rhs> struct product_type;
+}
+
+template<typename Lhs, typename Rhs,
+         int ProductType = internal::product_type<Lhs,Rhs>::value>
+struct ProductReturnType;
+
+// this is a workaround for sun CC
+template<typename Lhs, typename Rhs> struct LazyProductReturnType;
+
+namespace internal {
+
+// Provides scalar/packet-wise product and product with accumulation
+// with optional conjugation of the arguments.
+template<typename LhsScalar, typename RhsScalar, bool ConjLhs=false, bool ConjRhs=false> struct conj_helper;
+
+template<typename Scalar> struct scalar_sum_op;
+template<typename Scalar> struct scalar_difference_op;
+template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op;
+template<typename Scalar> struct scalar_opposite_op;
+template<typename Scalar> struct scalar_conjugate_op;
+template<typename Scalar> struct scalar_real_op;
+template<typename Scalar> struct scalar_imag_op;
+template<typename Scalar> struct scalar_abs_op;
+template<typename Scalar> struct scalar_abs2_op;
+template<typename Scalar> struct scalar_sqrt_op;
+template<typename Scalar> struct scalar_exp_op;
+template<typename Scalar> struct scalar_log_op;
+template<typename Scalar> struct scalar_cos_op;
+template<typename Scalar> struct scalar_sin_op;
+template<typename Scalar> struct scalar_acos_op;
+template<typename Scalar> struct scalar_asin_op;
+template<typename Scalar> struct scalar_tan_op;
+template<typename Scalar> struct scalar_pow_op;
+template<typename Scalar> struct scalar_inverse_op;
+template<typename Scalar> struct scalar_square_op;
+template<typename Scalar> struct scalar_cube_op;
+template<typename Scalar, typename NewType> struct scalar_cast_op;
+template<typename Scalar> struct scalar_multiple_op;
+template<typename Scalar> struct scalar_quotient1_op;
+template<typename Scalar> struct scalar_min_op;
+template<typename Scalar> struct scalar_max_op;
+template<typename Scalar> struct scalar_random_op;
+template<typename Scalar> struct scalar_add_op;
+template<typename Scalar> struct scalar_constant_op;
+template<typename Scalar> struct scalar_identity_op;
+
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
+template<typename LhsScalar,typename RhsScalar> struct scalar_multiple2_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;
+
+} // end namespace internal
+
+struct IOFormat;
+
+// Array module
+template<typename _Scalar, int _Rows, int _Cols,
+         int _Options = AutoAlign |
+#if defined(__GNUC__) && __GNUC__==3 && __GNUC_MINOR__==4
+    // workaround a bug in at least gcc 3.4.6
+    // the innermost ?: ternary operator is misparsed. We write it slightly
+    // differently and this makes gcc 3.4.6 happy, but it's ugly.
+    // The error would only show up with EIGEN_DEFAULT_TO_ROW_MAJOR is defined
+    // (when EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION is RowMajor)
+                          ( (_Rows==1 && _Cols!=1) ? RowMajor
+                          : !(_Cols==1 && _Rows!=1) ?  EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION
+                          : ColMajor ),
+#else
+                          ( (_Rows==1 && _Cols!=1) ? RowMajor
+                          : (_Cols==1 && _Rows!=1) ? ColMajor
+                          : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ),
+#endif
+         int _MaxRows = _Rows, int _MaxCols = _Cols> class Array;
+template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType> class Select;
+template<typename MatrixType, typename BinaryOp, int Direction> class PartialReduxExpr;
+template<typename ExpressionType, int Direction> class VectorwiseOp;
+template<typename MatrixType,int RowFactor,int ColFactor> class Replicate;
+template<typename MatrixType, int Direction = BothDirections> class Reverse;
+
+template<typename MatrixType> class FullPivLU;
+template<typename MatrixType> class PartialPivLU;
+namespace internal {
+template<typename MatrixType> struct inverse_impl;
+}
+template<typename MatrixType> class HouseholderQR;
+template<typename MatrixType> class ColPivHouseholderQR;
+template<typename MatrixType> class FullPivHouseholderQR;
+template<typename MatrixType, int QRPreconditioner = ColPivHouseholderQRPreconditioner> class JacobiSVD;
+template<typename MatrixType, int UpLo = Lower> class LLT;
+template<typename MatrixType, int UpLo = Lower> class LDLT;
+template<typename VectorsType, typename CoeffsType, int Side=OnTheLeft> class HouseholderSequence;
+template<typename Scalar>     class JacobiRotation;
+
+// Geometry module:
+template<typename Derived, int _Dim> class RotationBase;
+template<typename Lhs, typename Rhs> class Cross;
+template<typename Derived> class QuaternionBase;
+template<typename Scalar> class Rotation2D;
+template<typename Scalar> class AngleAxis;
+template<typename Scalar,int Dim> class Translation;
+
+#ifdef EIGEN2_SUPPORT
+template<typename Derived, int _Dim> class eigen2_RotationBase;
+template<typename Lhs, typename Rhs> class eigen2_Cross;
+template<typename Scalar> class eigen2_Quaternion;
+template<typename Scalar> class eigen2_Rotation2D;
+template<typename Scalar> class eigen2_AngleAxis;
+template<typename Scalar,int Dim> class eigen2_Transform;
+template <typename _Scalar, int _AmbientDim> class eigen2_ParametrizedLine;
+template <typename _Scalar, int _AmbientDim> class eigen2_Hyperplane;
+template<typename Scalar,int Dim> class eigen2_Translation;
+template<typename Scalar,int Dim> class eigen2_Scaling;
+#endif
+
+#if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
+template<typename Scalar> class Quaternion;
+template<typename Scalar,int Dim> class Transform;
+template <typename _Scalar, int _AmbientDim> class ParametrizedLine;
+template <typename _Scalar, int _AmbientDim> class Hyperplane;
+template<typename Scalar,int Dim> class Scaling;
+#endif
+
+#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
+template<typename Scalar, int Options = AutoAlign> class Quaternion;
+template<typename Scalar,int Dim,int Mode,int _Options=AutoAlign> class Transform;
+template <typename _Scalar, int _AmbientDim, int Options=AutoAlign> class ParametrizedLine;
+template <typename _Scalar, int _AmbientDim, int Options=AutoAlign> class Hyperplane;
+template<typename Scalar> class UniformScaling;
+template<typename MatrixType,int Direction> class Homogeneous;
+#endif
+
+// MatrixFunctions module
+template<typename Derived> struct MatrixExponentialReturnValue;
+template<typename Derived> class MatrixFunctionReturnValue;
+template<typename Derived> class MatrixSquareRootReturnValue;
+template<typename Derived> class MatrixLogarithmReturnValue;
+template<typename Derived> class MatrixPowerReturnValue;
+template<typename Derived, typename Lhs, typename Rhs> class MatrixPowerProduct;
+
+namespace internal {
+template <typename Scalar>
+struct stem_function
+{
+  typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
+  typedef ComplexScalar type(ComplexScalar, int);
+};
+}
+
+
+#ifdef EIGEN2_SUPPORT
+template<typename ExpressionType> class Cwise;
+template<typename MatrixType> class Minor;
+template<typename MatrixType> class LU;
+template<typename MatrixType> class QR;
+template<typename MatrixType> class SVD;
+namespace internal {
+template<typename MatrixType, unsigned int Mode> struct eigen2_part_return_type;
+}
+#endif
+
+} // end namespace Eigen
+
+#endif // EIGEN_FORWARDDECLARATIONS_H
diff --git a/include/eigen3/Eigen/src/Core/util/MKL_support.h b/include/eigen3/Eigen/src/Core/util/MKL_support.h
new file mode 100644
index 0000000..8acca9c
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/util/MKL_support.h
@@ -0,0 +1,126 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *   Include file with common MKL declarations
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_MKL_SUPPORT_H
+#define EIGEN_MKL_SUPPORT_H
+
+#ifdef EIGEN_USE_MKL_ALL
+  #ifndef EIGEN_USE_BLAS
+    #define EIGEN_USE_BLAS
+  #endif
+  #ifndef EIGEN_USE_LAPACKE
+    #define EIGEN_USE_LAPACKE
+  #endif
+  #ifndef EIGEN_USE_MKL_VML
+    #define EIGEN_USE_MKL_VML
+  #endif
+#endif
+
+#ifdef EIGEN_USE_LAPACKE_STRICT
+  #define EIGEN_USE_LAPACKE
+#endif
+
+#if defined(EIGEN_USE_BLAS) || defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_MKL_VML)
+  #define EIGEN_USE_MKL
+#endif
+
+#if defined EIGEN_USE_MKL
+#   include <mkl.h> 
+/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
+#   ifndef INTEL_MKL_VERSION
+#       undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */
+#   elif INTEL_MKL_VERSION < 100305    /* the intel-mkl-103-release-notes say this was when the lapacke.h interface was added*/
+#       undef EIGEN_USE_MKL
+#   endif
+#   ifndef EIGEN_USE_MKL
+    /*If the MKL version is too old, undef everything*/
+#       undef   EIGEN_USE_MKL_ALL
+#       undef   EIGEN_USE_BLAS
+#       undef   EIGEN_USE_LAPACKE
+#       undef   EIGEN_USE_MKL_VML
+#       undef   EIGEN_USE_LAPACKE_STRICT
+#       undef   EIGEN_USE_LAPACKE
+#   endif
+#endif
+
+#if defined EIGEN_USE_MKL
+#include <mkl_lapacke.h>
+#define EIGEN_MKL_VML_THRESHOLD 128
+
+namespace Eigen {
+
+typedef std::complex<double> dcomplex;
+typedef std::complex<float>  scomplex;
+
+namespace internal {
+
+template<typename MKLType, typename EigenType>
+static inline void assign_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) {
+  mklScalar=eigenScalar;
+}
+
+template<typename MKLType, typename EigenType>
+static inline void assign_conj_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) {
+  mklScalar=eigenScalar;
+}
+
+template <>
+inline void assign_scalar_eig2mkl<MKL_Complex16,dcomplex>(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) {
+  mklScalar.real=eigenScalar.real();
+  mklScalar.imag=eigenScalar.imag();
+}
+
+template <>
+inline void assign_scalar_eig2mkl<MKL_Complex8,scomplex>(MKL_Complex8& mklScalar, const scomplex& eigenScalar) {
+  mklScalar.real=eigenScalar.real();
+  mklScalar.imag=eigenScalar.imag();
+}
+
+template <>
+inline void assign_conj_scalar_eig2mkl<MKL_Complex16,dcomplex>(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) {
+  mklScalar.real=eigenScalar.real();
+  mklScalar.imag=-eigenScalar.imag();
+}
+
+template <>
+inline void assign_conj_scalar_eig2mkl<MKL_Complex8,scomplex>(MKL_Complex8& mklScalar, const scomplex& eigenScalar) {
+  mklScalar.real=eigenScalar.real();
+  mklScalar.imag=-eigenScalar.imag();
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif
+
+#endif // EIGEN_MKL_SUPPORT_H
diff --git a/include/eigen3/Eigen/src/Core/util/Macros.h b/include/eigen3/Eigen/src/Core/util/Macros.h
new file mode 100644
index 0000000..6d1e6c1
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/util/Macros.h
@@ -0,0 +1,431 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MACROS_H
+#define EIGEN_MACROS_H
+
+#define EIGEN_WORLD_VERSION 3
+#define EIGEN_MAJOR_VERSION 2
+#define EIGEN_MINOR_VERSION 4
+
+#define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
+                                      (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
+                                                                 EIGEN_MINOR_VERSION>=z))))
+#ifdef __GNUC__
+  #define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__==x && __GNUC_MINOR__>=y) || __GNUC__>x)
+#else
+  #define EIGEN_GNUC_AT_LEAST(x,y) 0
+#endif
+ 
+#ifdef __GNUC__
+  #define EIGEN_GNUC_AT_MOST(x,y) ((__GNUC__==x && __GNUC_MINOR__<=y) || __GNUC__<x)
+#else
+  #define EIGEN_GNUC_AT_MOST(x,y) 0
+#endif
+
+#if EIGEN_GNUC_AT_MOST(4,3) && !defined(__clang__)
+  // see bug 89
+  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0
+#else
+  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
+#endif
+
+#if defined(__GNUC__) && (__GNUC__ <= 3)
+#define EIGEN_GCC3_OR_OLDER 1
+#else
+#define EIGEN_GCC3_OR_OLDER 0
+#endif
+
+// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
+// 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
+// enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
+// certain common platform (compiler+architecture combinations) to avoid these problems.
+// Only static alignment is really problematic (relies on nonstandard compiler extensions that don't
+// work everywhere, for example don't work on GCC/ARM), try to keep heap alignment even
+// when we have to disable static alignment.
+#if defined(__GNUC__) && !(defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined(__ia64__))
+#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+#else
+#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
+#endif
+
+// static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
+#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
+ && !EIGEN_GCC3_OR_OLDER \
+ && !defined(__SUNPRO_CC) \
+ && !defined(__QNXNTO__)
+  #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
+#else
+  #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
+#endif
+
+#ifdef EIGEN_DONT_ALIGN
+  #ifndef EIGEN_DONT_ALIGN_STATICALLY
+    #define EIGEN_DONT_ALIGN_STATICALLY
+  #endif
+  #define EIGEN_ALIGN 0
+#else
+  #define EIGEN_ALIGN 1
+#endif
+
+// EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable
+// alignment (EIGEN_DONT_ALIGN_STATICALLY) and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). Henceforth, only EIGEN_ALIGN_STATICALLY should be used.
+#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT && !defined(EIGEN_DONT_ALIGN_STATICALLY)
+  #define EIGEN_ALIGN_STATICALLY 1
+#else
+  #define EIGEN_ALIGN_STATICALLY 0
+  #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
+    #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
+  #endif
+#endif
+
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION RowMajor
+#else
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ColMajor
+#endif
+
+#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
+#endif
+
+// Cross compiler wrapper around LLVM's __has_builtin
+#ifdef __has_builtin
+#  define EIGEN_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#  define EIGEN_HAS_BUILTIN(x) 0
+#endif
+
+/** Allows to disable some optimizations which might affect the accuracy of the result.
+  * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
+  * They currently include:
+  *   - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled.
+  */
+#ifndef EIGEN_FAST_MATH
+#define EIGEN_FAST_MATH 1
+#endif
+
+#define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl;
+
+// concatenate two tokens
+#define EIGEN_CAT2(a,b) a ## b
+#define EIGEN_CAT(a,b) EIGEN_CAT2(a,b)
+
+// convert a token to a string
+#define EIGEN_MAKESTRING2(a) #a
+#define EIGEN_MAKESTRING(a) EIGEN_MAKESTRING2(a)
+
+// EIGEN_STRONG_INLINE is a stronger version of the inline, using __forceinline on MSVC,
+// but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline
+// but GCC is still doing fine with just inline.
+#if (defined _MSC_VER) || (defined __INTEL_COMPILER)
+#define EIGEN_STRONG_INLINE __forceinline
+#else
+#define EIGEN_STRONG_INLINE inline
+#endif
+
+// EIGEN_ALWAYS_INLINE is the stronget, it has the effect of making the function inline and adding every possible
+// attribute to maximize inlining. This should only be used when really necessary: in particular,
+// it uses __attribute__((always_inline)) on GCC, which most of the time is useless and can severely harm compile times.
+// FIXME with the always_inline attribute,
+// gcc 3.4.x reports the following compilation error:
+//   Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval<Derived> Eigen::MatrixBase<Scalar, Derived>::eval() const'
+//    : function body not available
+#if EIGEN_GNUC_AT_LEAST(4,0)
+#define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline
+#else
+#define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE
+#endif
+
+#if (defined __GNUC__)
+#define EIGEN_DONT_INLINE __attribute__((noinline))
+#elif (defined _MSC_VER)
+#define EIGEN_DONT_INLINE __declspec(noinline)
+#else
+#define EIGEN_DONT_INLINE
+#endif
+
+#if (defined __GNUC__)
+#define EIGEN_PERMISSIVE_EXPR __extension__
+#else
+#define EIGEN_PERMISSIVE_EXPR
+#endif
+
+// this macro allows to get rid of linking errors about multiply defined functions.
+//  - static is not very good because it prevents definitions from different object files to be merged.
+//           So static causes the resulting linked executable to be bloated with multiple copies of the same function.
+//  - inline is not perfect either as it unwantedly hints the compiler toward inlining the function.
+#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline
+
+#ifdef NDEBUG
+# ifndef EIGEN_NO_DEBUG
+#  define EIGEN_NO_DEBUG
+# endif
+#endif
+
+// eigen_plain_assert is where we implement the workaround for the assert() bug in GCC <= 4.3, see bug 89
+#ifdef EIGEN_NO_DEBUG
+  #define eigen_plain_assert(x)
+#else
+  #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO
+    namespace Eigen {
+    namespace internal {
+    inline bool copy_bool(bool b) { return b; }
+    }
+    }
+    #define eigen_plain_assert(x) assert(x)
+  #else
+    // work around bug 89
+    #include <cstdlib>   // for abort
+    #include <iostream>  // for std::cerr
+
+    namespace Eigen {
+    namespace internal {
+    // trivial function copying a bool. Must be EIGEN_DONT_INLINE, so we implement it after including Eigen headers.
+    // see bug 89.
+    namespace {
+    EIGEN_DONT_INLINE bool copy_bool(bool b) { return b; }
+    }
+    inline void assert_fail(const char *condition, const char *function, const char *file, int line)
+    {
+      std::cerr << "assertion failed: " << condition << " in function " << function << " at " << file << ":" << line << std::endl;
+      abort();
+    }
+    }
+    }
+    #define eigen_plain_assert(x) \
+      do { \
+        if(!Eigen::internal::copy_bool(x)) \
+          Eigen::internal::assert_fail(EIGEN_MAKESTRING(x), __PRETTY_FUNCTION__, __FILE__, __LINE__); \
+      } while(false)
+  #endif
+#endif
+
+// eigen_assert can be overridden
+#ifndef eigen_assert
+#define eigen_assert(x) eigen_plain_assert(x)
+#endif
+
+#ifdef EIGEN_INTERNAL_DEBUGGING
+#define eigen_internal_assert(x) eigen_assert(x)
+#else
+#define eigen_internal_assert(x)
+#endif
+
+#ifdef EIGEN_NO_DEBUG
+#define EIGEN_ONLY_USED_FOR_DEBUG(x) (void)x
+#else
+#define EIGEN_ONLY_USED_FOR_DEBUG(x)
+#endif
+
+#ifndef EIGEN_NO_DEPRECATED_WARNING
+  #if (defined __GNUC__)
+    #define EIGEN_DEPRECATED __attribute__((deprecated))
+  #elif (defined _MSC_VER)
+    #define EIGEN_DEPRECATED __declspec(deprecated)
+  #else
+    #define EIGEN_DEPRECATED
+  #endif
+#else
+  #define EIGEN_DEPRECATED
+#endif
+
+#if (defined __GNUC__)
+#define EIGEN_UNUSED __attribute__((unused))
+#else
+#define EIGEN_UNUSED
+#endif
+
+// Suppresses 'unused variable' warnings.
+namespace Eigen {
+  namespace internal {
+    template<typename T> void ignore_unused_variable(const T&) {}
+  }
+}
+#define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var);
+
+#if !defined(EIGEN_ASM_COMMENT)
+  #if (defined __GNUC__) && ( defined(__i386__) || defined(__x86_64__) )
+    #define EIGEN_ASM_COMMENT(X)  __asm__("#" X)
+  #else
+    #define EIGEN_ASM_COMMENT(X)
+  #endif
+#endif
+
+/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
+ * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
+ * so that vectorization doesn't affect binary compatibility.
+ *
+ * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
+ * vectorized and non-vectorized code.
+ */
+#if (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION)
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
+#elif (defined _MSC_VER)
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
+#elif (defined __SUNPRO_CC)
+  // FIXME not sure about this one:
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
+#else
+  #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler
+#endif
+
+#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+
+#if EIGEN_ALIGN_STATICALLY
+#define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n)
+#define EIGEN_USER_ALIGN16 EIGEN_ALIGN16
+#else
+#define EIGEN_USER_ALIGN_TO_BOUNDARY(n)
+#define EIGEN_USER_ALIGN16
+#endif
+
+#ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
+  #define EIGEN_RESTRICT
+#endif
+#ifndef EIGEN_RESTRICT
+  #define EIGEN_RESTRICT __restrict
+#endif
+
+#ifndef EIGEN_STACK_ALLOCATION_LIMIT
+// 131072 == 128 KB
+#define EIGEN_STACK_ALLOCATION_LIMIT 131072
+#endif
+
+#ifndef EIGEN_DEFAULT_IO_FORMAT
+#ifdef EIGEN_MAKING_DOCS
+// format used in Eigen's documentation
+// needed to define it here as escaping characters in CMake add_definition's argument seems very problematic.
+#define EIGEN_DEFAULT_IO_FORMAT Eigen::IOFormat(3, 0, " ", "\n", "", "")
+#else
+#define EIGEN_DEFAULT_IO_FORMAT Eigen::IOFormat()
+#endif
+#endif
+
+// just an empty macro !
+#define EIGEN_EMPTY
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900) && (!defined(__INTEL_COMPILER))
+#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+  using Base::operator =;
+#elif defined(__clang__) // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
+#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+  using Base::operator =; \
+  EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
+  template <typename OtherDerived> \
+  EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; }
+#else
+#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+  using Base::operator =; \
+  EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \
+  { \
+    Base::operator=(other); \
+    return *this; \
+  }
+#endif
+
+#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
+  EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
+
+/**
+* Just a side note. Commenting within defines works only by documenting
+* behind the object (via '!<'). Comments cannot be multi-line and thus
+* we have these extra long lines. What is confusing doxygen over here is
+* that we use '\' and basically have a bunch of typedefs with their
+* documentation in a single line.
+**/
+
+#define EIGEN_GENERIC_PUBLIC_INTERFACE(Derived) \
+  typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; /*!< \brief Numeric type, e.g. float, double, int or std::complex<float>. */ \
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; /*!< \brief The underlying numeric type for composed scalar types. \details In cases where Scalar is e.g. std::complex<T>, T were corresponding to RealScalar. */ \
+  typedef typename Base::CoeffReturnType CoeffReturnType; /*!< \brief The return type for coefficient access. \details Depending on whether the object allows direct coefficient access (e.g. for a MatrixXd), this type is either 'const Scalar&' or simply 'Scalar' for objects that do not allow direct coefficient access. */ \
+  typedef typename Eigen::internal::nested<Derived>::type Nested; \
+  typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
+  typedef typename Eigen::internal::traits<Derived>::Index Index; \
+  enum { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
+        ColsAtCompileTime = Eigen::internal::traits<Derived>::ColsAtCompileTime, \
+        Flags = Eigen::internal::traits<Derived>::Flags, \
+        CoeffReadCost = Eigen::internal::traits<Derived>::CoeffReadCost, \
+        SizeAtCompileTime = Base::SizeAtCompileTime, \
+        MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime, \
+        IsVectorAtCompileTime = Base::IsVectorAtCompileTime };
+
+
+#define EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
+  typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; /*!< \brief Numeric type, e.g. float, double, int or std::complex<float>. */ \
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; /*!< \brief The underlying numeric type for composed scalar types. \details In cases where Scalar is e.g. std::complex<T>, T were corresponding to RealScalar. */ \
+  typedef typename Base::PacketScalar PacketScalar; \
+  typedef typename Base::CoeffReturnType CoeffReturnType; /*!< \brief The return type for coefficient access. \details Depending on whether the object allows direct coefficient access (e.g. for a MatrixXd), this type is either 'const Scalar&' or simply 'Scalar' for objects that do not allow direct coefficient access. */ \
+  typedef typename Eigen::internal::nested<Derived>::type Nested; \
+  typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
+  typedef typename Eigen::internal::traits<Derived>::Index Index; \
+  enum { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
+        ColsAtCompileTime = Eigen::internal::traits<Derived>::ColsAtCompileTime, \
+        MaxRowsAtCompileTime = Eigen::internal::traits<Derived>::MaxRowsAtCompileTime, \
+        MaxColsAtCompileTime = Eigen::internal::traits<Derived>::MaxColsAtCompileTime, \
+        Flags = Eigen::internal::traits<Derived>::Flags, \
+        CoeffReadCost = Eigen::internal::traits<Derived>::CoeffReadCost, \
+        SizeAtCompileTime = Base::SizeAtCompileTime, \
+        MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime, \
+        IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; \
+  using Base::derived; \
+  using Base::const_cast_derived;
+
+
+#define EIGEN_PLAIN_ENUM_MIN(a,b) (((int)a <= (int)b) ? (int)a : (int)b)
+#define EIGEN_PLAIN_ENUM_MAX(a,b) (((int)a >= (int)b) ? (int)a : (int)b)
+
+// EIGEN_SIZE_MIN_PREFER_DYNAMIC gives the min between compile-time sizes. 0 has absolute priority, followed by 1,
+// followed by Dynamic, followed by other finite values. The reason for giving Dynamic the priority over
+// finite values is that min(3, Dynamic) should be Dynamic, since that could be anything between 0 and 3.
+#define EIGEN_SIZE_MIN_PREFER_DYNAMIC(a,b) (((int)a == 0 || (int)b == 0) ? 0 \
+                           : ((int)a == 1 || (int)b == 1) ? 1 \
+                           : ((int)a == Dynamic || (int)b == Dynamic) ? Dynamic \
+                           : ((int)a <= (int)b) ? (int)a : (int)b)
+
+// EIGEN_SIZE_MIN_PREFER_FIXED is a variant of EIGEN_SIZE_MIN_PREFER_DYNAMIC comparing MaxSizes. The difference is that finite values
+// now have priority over Dynamic, so that min(3, Dynamic) gives 3. Indeed, whatever the actual value is
+// (between 0 and 3), it is not more than 3.
+#define EIGEN_SIZE_MIN_PREFER_FIXED(a,b)  (((int)a == 0 || (int)b == 0) ? 0 \
+                           : ((int)a == 1 || (int)b == 1) ? 1 \
+                           : ((int)a == Dynamic && (int)b == Dynamic) ? Dynamic \
+                           : ((int)a == Dynamic) ? (int)b \
+                           : ((int)b == Dynamic) ? (int)a \
+                           : ((int)a <= (int)b) ? (int)a : (int)b)
+
+// see EIGEN_SIZE_MIN_PREFER_DYNAMIC. No need for a separate variant for MaxSizes here.
+#define EIGEN_SIZE_MAX(a,b) (((int)a == Dynamic || (int)b == Dynamic) ? Dynamic \
+                           : ((int)a >= (int)b) ? (int)a : (int)b)
+
+#define EIGEN_LOGICAL_XOR(a,b) (((a) || (b)) && !((a) && (b)))
+
+#define EIGEN_IMPLIES(a,b) (!(a) || (b))
+
+#define EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR) \
+  template<typename OtherDerived> \
+  EIGEN_STRONG_INLINE const CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived> \
+  (METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
+  { \
+    return CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived>(derived(), other.derived()); \
+  }
+
+// the expression type of a cwise product
+#define EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS) \
+    CwiseBinaryOp< \
+      internal::scalar_product_op< \
+          typename internal::traits<LHS>::Scalar, \
+          typename internal::traits<RHS>::Scalar \
+      >, \
+      const LHS, \
+      const RHS \
+    >
+
+#endif // EIGEN_MACROS_H
diff --git a/include/eigen3/Eigen/src/Core/util/Memory.h b/include/eigen3/Eigen/src/Core/util/Memory.h
new file mode 100644
index 0000000..330bcf5
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/util/Memory.h
@@ -0,0 +1,977 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2008-2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2009 Kenneth Riddile <kfriddile at yahoo.com>
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel at gmail.com>
+// Copyright (C) 2010 Thomas Capricelli <orzel at freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+/*****************************************************************************
+*** Platform checks for aligned malloc functions                           ***
+*****************************************************************************/
+
+#ifndef EIGEN_MEMORY_H
+#define EIGEN_MEMORY_H
+
+#ifndef EIGEN_MALLOC_ALREADY_ALIGNED
+
+// Try to determine automatically if malloc is already aligned.
+
+// On 64-bit systems, glibc's malloc returns 16-byte-aligned pointers, see:
+//   http://www.gnu.org/s/libc/manual/html_node/Aligned-Memory-Blocks.html
+// This is true at least since glibc 2.8.
+// This leaves the question how to detect 64-bit. According to this document,
+//   http://gcc.fyxm.net/summit/2003/Porting%20to%2064%20bit.pdf
+// page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
+// quite safe, at least within the context of glibc, to equate 64-bit with LP64.
+#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
+ && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ )
+  #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
+#else
+  #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
+#endif
+
+// FreeBSD 6 seems to have 16-byte aligned malloc
+//   See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
+// FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
+//   See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup
+#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__)
+  #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
+#else
+  #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
+#endif
+
+#if defined(__APPLE__) \
+ || defined(_WIN64) \
+ || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \
+ || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
+  #define EIGEN_MALLOC_ALREADY_ALIGNED 1
+#else
+  #define EIGEN_MALLOC_ALREADY_ALIGNED 0
+#endif
+
+#endif
+
+// See bug 554 (http://eigen.tuxfamily.org/bz/show_bug.cgi?id=554)
+// It seems to be unsafe to check _POSIX_ADVISORY_INFO without including unistd.h first.
+// Currently, let's include it only on unix systems:
+#if defined(__unix__) || defined(__unix)
+  #include <unistd.h>
+  #if ((defined __QNXNTO__) || (defined _GNU_SOURCE) || (defined __PGI) || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0)
+    #define EIGEN_HAS_POSIX_MEMALIGN 1
+  #endif
+#endif
+
+#ifndef EIGEN_HAS_POSIX_MEMALIGN
+  #define EIGEN_HAS_POSIX_MEMALIGN 0
+#endif
+
+#ifdef EIGEN_VECTORIZE_SSE
+  #define EIGEN_HAS_MM_MALLOC 1
+#else
+  #define EIGEN_HAS_MM_MALLOC 0
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+inline void throw_std_bad_alloc()
+{
+  #ifdef EIGEN_EXCEPTIONS
+    throw std::bad_alloc();
+  #else
+    std::size_t huge = -1;
+    new int[huge];
+  #endif
+}
+
+/*****************************************************************************
+*** Implementation of handmade aligned functions                           ***
+*****************************************************************************/
+
+/* ----- Hand made implementations of aligned malloc/free and realloc ----- */
+
+/** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
+  * Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
+  */
+inline void* handmade_aligned_malloc(std::size_t size)
+{
+  void *original = std::malloc(size+16);
+  if (original == 0) return 0;
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
+  *(reinterpret_cast<void**>(aligned) - 1) = original;
+  return aligned;
+}
+
+/** \internal Frees memory allocated with handmade_aligned_malloc */
+inline void handmade_aligned_free(void *ptr)
+{
+  if (ptr) std::free(*(reinterpret_cast<void**>(ptr) - 1));
+}
+
+/** \internal
+  * \brief Reallocates aligned memory.
+  * Since we know that our handmade version is based on std::realloc
+  * we can use std::realloc to implement efficient reallocation.
+  */
+inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = 0)
+{
+  if (ptr == 0) return handmade_aligned_malloc(size);
+  void *original = *(reinterpret_cast<void**>(ptr) - 1);
+  std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original);
+  original = std::realloc(original,size+16);
+  if (original == 0) return 0;
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
+  void *previous_aligned = static_cast<char *>(original)+previous_offset;
+  if(aligned!=previous_aligned)
+    std::memmove(aligned, previous_aligned, size);
+  
+  *(reinterpret_cast<void**>(aligned) - 1) = original;
+  return aligned;
+}
+
+/*****************************************************************************
+*** Implementation of generic aligned realloc (when no realloc can be used)***
+*****************************************************************************/
+
+void* aligned_malloc(std::size_t size);
+void  aligned_free(void *ptr);
+
+/** \internal
+  * \brief Reallocates aligned memory.
+  * Allows reallocation with aligned ptr types. This implementation will
+  * always create a new memory chunk and copy the old data.
+  */
+inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size)
+{
+  if (ptr==0)
+    return aligned_malloc(size);
+
+  if (size==0)
+  {
+    aligned_free(ptr);
+    return 0;
+  }
+
+  void* newptr = aligned_malloc(size);
+  if (newptr == 0)
+  {
+    #ifdef EIGEN_HAS_ERRNO
+    errno = ENOMEM; // according to the standard
+    #endif
+    return 0;
+  }
+
+  if (ptr != 0)
+  {
+    std::memcpy(newptr, ptr, (std::min)(size,old_size));
+    aligned_free(ptr);
+  }
+
+  return newptr;
+}
+
+/*****************************************************************************
+*** Implementation of portable aligned versions of malloc/free/realloc     ***
+*****************************************************************************/
+
+#ifdef EIGEN_NO_MALLOC
+inline void check_that_malloc_is_allowed()
+{
+  eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
+}
+#elif defined EIGEN_RUNTIME_NO_MALLOC
+inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
+{
+  static bool value = true;
+  if (update == 1)
+    value = new_value;
+  return value;
+}
+inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
+inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
+inline void check_that_malloc_is_allowed()
+{
+  eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
+}
+#else 
+inline void check_that_malloc_is_allowed()
+{}
+#endif
+
+/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment.
+  * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
+  */
+inline void* aligned_malloc(size_t size)
+{
+  check_that_malloc_is_allowed();
+
+  void *result;
+  #if !EIGEN_ALIGN
+    result = std::malloc(size);
+  #elif EIGEN_MALLOC_ALREADY_ALIGNED
+    result = std::malloc(size);
+  #elif EIGEN_HAS_POSIX_MEMALIGN
+    if(posix_memalign(&result, 16, size)) result = 0;
+  #elif EIGEN_HAS_MM_MALLOC
+    result = _mm_malloc(size, 16);
+  #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
+    result = _aligned_malloc(size, 16);
+  #else
+    result = handmade_aligned_malloc(size);
+  #endif
+
+  if(!result && size)
+    throw_std_bad_alloc();
+
+  return result;
+}
+
+/** \internal Frees memory allocated with aligned_malloc. */
+inline void aligned_free(void *ptr)
+{
+  #if !EIGEN_ALIGN
+    std::free(ptr);
+  #elif EIGEN_MALLOC_ALREADY_ALIGNED
+    std::free(ptr);
+  #elif EIGEN_HAS_POSIX_MEMALIGN
+    std::free(ptr);
+  #elif EIGEN_HAS_MM_MALLOC
+    _mm_free(ptr);
+  #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
+    _aligned_free(ptr);
+  #else
+    handmade_aligned_free(ptr);
+  #endif
+}
+
+/**
+* \internal
+* \brief Reallocates an aligned block of memory.
+* \throws std::bad_alloc on allocation failure
+**/
+inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
+{
+  EIGEN_UNUSED_VARIABLE(old_size);
+
+  void *result;
+#if !EIGEN_ALIGN
+  result = std::realloc(ptr,new_size);
+#elif EIGEN_MALLOC_ALREADY_ALIGNED
+  result = std::realloc(ptr,new_size);
+#elif EIGEN_HAS_POSIX_MEMALIGN
+  result = generic_aligned_realloc(ptr,new_size,old_size);
+#elif EIGEN_HAS_MM_MALLOC
+  // The defined(_mm_free) is just here to verify that this MSVC version
+  // implements _mm_malloc/_mm_free based on the corresponding _aligned_
+  // functions. This may not always be the case and we just try to be safe.
+  #if defined(_MSC_VER) && (!defined(_WIN32_WCE)) && defined(_mm_free)
+    result = _aligned_realloc(ptr,new_size,16);
+  #else
+    result = generic_aligned_realloc(ptr,new_size,old_size);
+  #endif
+#elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
+  result = _aligned_realloc(ptr,new_size,16);
+#else
+  result = handmade_aligned_realloc(ptr,new_size,old_size);
+#endif
+
+  if (!result && new_size)
+    throw_std_bad_alloc();
+
+  return result;
+}
+
+/*****************************************************************************
+*** Implementation of conditionally aligned functions                      ***
+*****************************************************************************/
+
+/** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
+  * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
+  */
+template<bool Align> inline void* conditional_aligned_malloc(size_t size)
+{
+  return aligned_malloc(size);
+}
+
+template<> inline void* conditional_aligned_malloc<false>(size_t size)
+{
+  check_that_malloc_is_allowed();
+
+  void *result = std::malloc(size);
+  if(!result && size)
+    throw_std_bad_alloc();
+  return result;
+}
+
+/** \internal Frees memory allocated with conditional_aligned_malloc */
+template<bool Align> inline void conditional_aligned_free(void *ptr)
+{
+  aligned_free(ptr);
+}
+
+template<> inline void conditional_aligned_free<false>(void *ptr)
+{
+  std::free(ptr);
+}
+
+template<bool Align> inline void* conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size)
+{
+  return aligned_realloc(ptr, new_size, old_size);
+}
+
+template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new_size, size_t)
+{
+  return std::realloc(ptr, new_size);
+}
+
+/*****************************************************************************
+*** Construction/destruction of array elements                             ***
+*****************************************************************************/
+
+/** \internal Constructs the elements of an array.
+  * The \a size parameter tells on how many objects to call the constructor of T.
+  */
+template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
+{
+  for (size_t i=0; i < size; ++i) ::new (ptr + i) T;
+  return ptr;
+}
+
+/** \internal Destructs the elements of an array.
+  * The \a size parameters tells on how many objects to call the destructor of T.
+  */
+template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
+{
+  // always destruct an array starting from the end.
+  if(ptr)
+    while(size) ptr[--size].~T();
+}
+
+/*****************************************************************************
+*** Implementation of aligned new/delete-like functions                    ***
+*****************************************************************************/
+
+template<typename T>
+EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
+{
+  if(size > size_t(-1) / sizeof(T))
+    throw_std_bad_alloc();
+}
+
+/** \internal Allocates \a size objects of type T. The returned pointer is guaranteed to have 16 bytes alignment.
+  * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
+  * The default constructor of T is called.
+  */
+template<typename T> inline T* aligned_new(size_t size)
+{
+  check_size_for_overflow<T>(size);
+  T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
+  return construct_elements_of_array(result, size);
+}
+
+template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
+{
+  check_size_for_overflow<T>(size);
+  T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
+  return construct_elements_of_array(result, size);
+}
+
+/** \internal Deletes objects constructed with aligned_new
+  * The \a size parameters tells on how many objects to call the destructor of T.
+  */
+template<typename T> inline void aligned_delete(T *ptr, size_t size)
+{
+  destruct_elements_of_array<T>(ptr, size);
+  aligned_free(ptr);
+}
+
+/** \internal Deletes objects constructed with conditional_aligned_new
+  * The \a size parameters tells on how many objects to call the destructor of T.
+  */
+template<typename T, bool Align> inline void conditional_aligned_delete(T *ptr, size_t size)
+{
+  destruct_elements_of_array<T>(ptr, size);
+  conditional_aligned_free<Align>(ptr);
+}
+
+template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
+{
+  check_size_for_overflow<T>(new_size);
+  check_size_for_overflow<T>(old_size);
+  if(new_size < old_size)
+    destruct_elements_of_array(pts+new_size, old_size-new_size);
+  T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
+  if(new_size > old_size)
+    construct_elements_of_array(result+old_size, new_size-old_size);
+  return result;
+}
+
+
+template<typename T, bool Align> inline T* conditional_aligned_new_auto(size_t size)
+{
+  if(size==0)
+    return 0; // short-cut. Also fixes Bug 884
+  check_size_for_overflow<T>(size);
+  T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
+  if(NumTraits<T>::RequireInitialization)
+    construct_elements_of_array(result, size);
+  return result;
+}
+
+template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(T* pts, size_t new_size, size_t old_size)
+{
+  check_size_for_overflow<T>(new_size);
+  check_size_for_overflow<T>(old_size);
+  if(NumTraits<T>::RequireInitialization && (new_size < old_size))
+    destruct_elements_of_array(pts+new_size, old_size-new_size);
+  T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
+  if(NumTraits<T>::RequireInitialization && (new_size > old_size))
+    construct_elements_of_array(result+old_size, new_size-old_size);
+  return result;
+}
+
+template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *ptr, size_t size)
+{
+  if(NumTraits<T>::RequireInitialization)
+    destruct_elements_of_array<T>(ptr, size);
+  conditional_aligned_free<Align>(ptr);
+}
+
+/****************************************************************************/
+
+/** \internal Returns the index of the first element of the array that is well aligned for vectorization.
+  *
+  * \param array the address of the start of the array
+  * \param size the size of the array
+  *
+  * \note If no element of the array is well aligned, the size of the array is returned. Typically,
+  * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the
+  * packet size for the given scalar type is 1, then everything is considered well-aligned.
+  *
+  * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a
+  * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). On the
+  * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
+  * example with Scalar=double on certain 32-bit platforms, see bug #79.
+  *
+  * There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h.
+  */
+template<typename Scalar, typename Index>
+static inline Index first_aligned(const Scalar* array, Index size)
+{
+  static const Index PacketSize = packet_traits<Scalar>::size;
+  static const Index PacketAlignedMask = PacketSize-1;
+
+  if(PacketSize==1)
+  {
+    // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements
+    // of the array have the same alignment.
+    return 0;
+  }
+  else if(size_t(array) & (sizeof(Scalar)-1))
+  {
+    // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar.
+    // Consequently, no element of the array is well aligned.
+    return size;
+  }
+  else
+  {
+    return std::min<Index>( (PacketSize - (Index((size_t(array)/sizeof(Scalar))) & PacketAlignedMask))
+                           & PacketAlignedMask, size);
+  }
+}
+
+/** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size
+  */ 
+template<typename Index> 
+inline static Index first_multiple(Index size, Index base)
+{
+  return ((size+base-1)/base)*base;
+}
+
+// std::copy is much slower than memcpy, so let's introduce a smart_copy which
+// use memcpy on trivial types, i.e., on types that does not require an initialization ctor.
+template<typename T, bool UseMemcpy> struct smart_copy_helper;
+
+template<typename T> void smart_copy(const T* start, const T* end, T* target)
+{
+  smart_copy_helper<T,!NumTraits<T>::RequireInitialization>::run(start, end, target);
+}
+
+template<typename T> struct smart_copy_helper<T,true> {
+  static inline void run(const T* start, const T* end, T* target)
+  { memcpy(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); }
+};
+
+template<typename T> struct smart_copy_helper<T,false> {
+  static inline void run(const T* start, const T* end, T* target)
+  { std::copy(start, end, target); }
+};
+
+
+/*****************************************************************************
+*** Implementation of runtime stack allocation (falling back to malloc)    ***
+*****************************************************************************/
+
+// you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
+// to the appropriate stack allocation function
+#ifndef EIGEN_ALLOCA
+  #if (defined __linux__)
+    #define EIGEN_ALLOCA alloca
+  #elif defined(_MSC_VER)
+    #define EIGEN_ALLOCA _alloca
+  #endif
+#endif
+
+// This helper class construct the allocated memory, and takes care of destructing and freeing the handled data
+// at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions.
+template<typename T> class aligned_stack_memory_handler
+{
+  public:
+    /* Creates a stack_memory_handler responsible for the buffer \a ptr of size \a size.
+     * Note that \a ptr can be 0 regardless of the other parameters.
+     * This constructor takes care of constructing/initializing the elements of the buffer if required by the scalar type T (see NumTraits<T>::RequireInitialization).
+     * In this case, the buffer elements will also be destructed when this handler will be destructed.
+     * Finally, if \a dealloc is true, then the pointer \a ptr is freed.
+     **/
+    aligned_stack_memory_handler(T* ptr, size_t size, bool dealloc)
+      : m_ptr(ptr), m_size(size), m_deallocate(dealloc)
+    {
+      if(NumTraits<T>::RequireInitialization && m_ptr)
+        Eigen::internal::construct_elements_of_array(m_ptr, size);
+    }
+    ~aligned_stack_memory_handler()
+    {
+      if(NumTraits<T>::RequireInitialization && m_ptr)
+        Eigen::internal::destruct_elements_of_array<T>(m_ptr, m_size);
+      if(m_deallocate)
+        Eigen::internal::aligned_free(m_ptr);
+    }
+  protected:
+    T* m_ptr;
+    size_t m_size;
+    bool m_deallocate;
+};
+
+} // end namespace internal
+
+/** \internal
+  * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
+  * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
+  * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap.
+  * The allocated buffer is automatically deleted when exiting the scope of this declaration.
+  * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
+  * Here is an example:
+  * \code
+  * {
+  *   ei_declare_aligned_stack_constructed_variable(float,data,size,0);
+  *   // use data[0] to data[size-1]
+  * }
+  * \endcode
+  * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
+  */
+#ifdef EIGEN_ALLOCA
+
+  #if defined(__arm__) || defined(_WIN32)
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+16)) & ~(size_t(15))) + 16)
+  #else
+    #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA
+  #endif
+
+  #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
+    Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
+    TYPE* NAME = (BUFFER)!=0 ? (BUFFER) \
+               : reinterpret_cast<TYPE*>( \
+                      (sizeof(TYPE)*SIZE<=EIGEN_STACK_ALLOCATION_LIMIT) ? EIGEN_ALIGNED_ALLOCA(sizeof(TYPE)*SIZE) \
+                    : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) );  \
+    Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)
+
+#else
+
+  #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
+    Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
+    TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE));    \
+    Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
+    
+#endif
+
+
+/*****************************************************************************
+*** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF]                ***
+*****************************************************************************/
+
+#if EIGEN_ALIGN
+  #ifdef EIGEN_EXCEPTIONS
+    #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      void* operator new(size_t size, const std::nothrow_t&) throw() { \
+        try { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
+        catch (...) { return 0; } \
+      }
+  #else
+    #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      void* operator new(size_t size, const std::nothrow_t&) throw() { \
+        return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
+      }
+  #endif
+
+  #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \
+      void *operator new(size_t size) { \
+        return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
+      } \
+      void *operator new[](size_t size) { \
+        return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
+      } \
+      void operator delete(void * ptr) throw() { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      void operator delete[](void * ptr) throw() { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      /* in-place new and delete. since (at least afaik) there is no actual   */ \
+      /* memory allocated we can safely let the default implementation handle */ \
+      /* this particular case. */ \
+      static void *operator new(size_t size, void *ptr) { return ::operator new(size,ptr); } \
+      static void *operator new[](size_t size, void* ptr) { return ::operator new[](size,ptr); } \
+      void operator delete(void * memory, void *ptr) throw() { return ::operator delete(memory,ptr); } \
+      void operator delete[](void * memory, void *ptr) throw() { return ::operator delete[](memory,ptr); } \
+      /* nothrow-new (returns zero instead of std::bad_alloc) */ \
+      EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      void operator delete(void *ptr, const std::nothrow_t&) throw() { \
+        Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
+      } \
+      typedef void eigen_aligned_operator_new_marker_type;
+#else
+  #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
+#endif
+
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0)))
+
+/****************************************************************************/
+
+/** \class aligned_allocator
+* \ingroup Core_Module
+*
+* \brief STL compatible allocator to use with with 16 byte aligned types
+*
+* Example:
+* \code
+* // Matrix4f requires 16 bytes alignment:
+* std::map< int, Matrix4f, std::less<int>, 
+*           aligned_allocator<std::pair<const int, Matrix4f> > > my_map_mat4;
+* // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator:
+* std::map< int, Vector3f > my_map_vec3;
+* \endcode
+*
+* \sa \ref TopicStlContainers.
+*/
+template<class T>
+class aligned_allocator
+{
+public:
+    typedef size_t    size_type;
+    typedef std::ptrdiff_t difference_type;
+    typedef T*        pointer;
+    typedef const T*  const_pointer;
+    typedef T&        reference;
+    typedef const T&  const_reference;
+    typedef T         value_type;
+
+    template<class U>
+    struct rebind
+    {
+        typedef aligned_allocator<U> other;
+    };
+
+    pointer address( reference value ) const
+    {
+        return &value;
+    }
+
+    const_pointer address( const_reference value ) const
+    {
+        return &value;
+    }
+
+    aligned_allocator()
+    {
+    }
+
+    aligned_allocator( const aligned_allocator& )
+    {
+    }
+
+    template<class U>
+    aligned_allocator( const aligned_allocator<U>& )
+    {
+    }
+
+    ~aligned_allocator()
+    {
+    }
+
+    size_type max_size() const
+    {
+        return (std::numeric_limits<size_type>::max)();
+    }
+
+    pointer allocate( size_type num, const void* hint = 0 )
+    {
+        EIGEN_UNUSED_VARIABLE(hint);
+        internal::check_size_for_overflow<T>(num);
+        return static_cast<pointer>( internal::aligned_malloc( num * sizeof(T) ) );
+    }
+
+    void construct( pointer p, const T& value )
+    {
+        ::new( p ) T( value );
+    }
+
+    void destroy( pointer p )
+    {
+        p->~T();
+    }
+
+    void deallocate( pointer p, size_type /*num*/ )
+    {
+        internal::aligned_free( p );
+    }
+
+    bool operator!=(const aligned_allocator<T>& ) const
+    { return false; }
+
+    bool operator==(const aligned_allocator<T>& ) const
+    { return true; }
+};
+
+//---------- Cache sizes ----------
+
+#if !defined(EIGEN_NO_CPUID)
+#  if defined(__GNUC__) && ( defined(__i386__) || defined(__x86_64__) )
+#    if defined(__PIC__) && defined(__i386__)
+       // Case for x86 with PIC
+#      define EIGEN_CPUID(abcd,func,id) \
+         __asm__ __volatile__ ("xchgl %%ebx, %k1;cpuid; xchgl %%ebx,%k1": "=a" (abcd[0]), "=&r" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "a" (func), "c" (id));
+#    elif defined(__PIC__) && defined(__x86_64__)
+       // Case for x64 with PIC. In theory this is only a problem with recent gcc and with medium or large code model, not with the default small code model.
+       // However, we cannot detect which code model is used, and the xchg overhead is negligible anyway.
+#      define EIGEN_CPUID(abcd,func,id) \
+        __asm__ __volatile__ ("xchg{q}\t{%%}rbx, %q1; cpuid; xchg{q}\t{%%}rbx, %q1": "=a" (abcd[0]), "=&r" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "0" (func), "2" (id));
+#    else
+       // Case for x86_64 or x86 w/o PIC
+#      define EIGEN_CPUID(abcd,func,id) \
+         __asm__ __volatile__ ("cpuid": "=a" (abcd[0]), "=b" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "0" (func), "2" (id) );
+#    endif
+#  elif defined(_MSC_VER)
+#    if (_MSC_VER > 1500) && ( defined(_M_IX86) || defined(_M_X64) )
+#      define EIGEN_CPUID(abcd,func,id) __cpuidex((int*)abcd,func,id)
+#    endif
+#  endif
+#endif
+
+namespace internal {
+
+#ifdef EIGEN_CPUID
+
+inline bool cpuid_is_vendor(int abcd[4], const int vendor[3])
+{
+  return abcd[1]==vendor[0] && abcd[3]==vendor[1] && abcd[2]==vendor[2];
+}
+
+inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3)
+{
+  int abcd[4];
+  l1 = l2 = l3 = 0;
+  int cache_id = 0;
+  int cache_type = 0;
+  do {
+    abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
+    EIGEN_CPUID(abcd,0x4,cache_id);
+    cache_type  = (abcd[0] & 0x0F) >> 0;
+    if(cache_type==1||cache_type==3) // data or unified cache
+    {
+      int cache_level = (abcd[0] & 0xE0) >> 5;  // A[7:5]
+      int ways        = (abcd[1] & 0xFFC00000) >> 22; // B[31:22]
+      int partitions  = (abcd[1] & 0x003FF000) >> 12; // B[21:12]
+      int line_size   = (abcd[1] & 0x00000FFF) >>  0; // B[11:0]
+      int sets        = (abcd[2]);                    // C[31:0]
+
+      int cache_size = (ways+1) * (partitions+1) * (line_size+1) * (sets+1);
+
+      switch(cache_level)
+      {
+        case 1: l1 = cache_size; break;
+        case 2: l2 = cache_size; break;
+        case 3: l3 = cache_size; break;
+        default: break;
+      }
+    }
+    cache_id++;
+  } while(cache_type>0 && cache_id<16);
+}
+
+inline void queryCacheSizes_intel_codes(int& l1, int& l2, int& l3)
+{
+  int abcd[4];
+  abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
+  l1 = l2 = l3 = 0;
+  EIGEN_CPUID(abcd,0x00000002,0);
+  unsigned char * bytes = reinterpret_cast<unsigned char *>(abcd)+2;
+  bool check_for_p2_core2 = false;
+  for(int i=0; i<14; ++i)
+  {
+    switch(bytes[i])
+    {
+      case 0x0A: l1 = 8; break;   // 0Ah   data L1 cache, 8 KB, 2 ways, 32 byte lines
+      case 0x0C: l1 = 16; break;  // 0Ch   data L1 cache, 16 KB, 4 ways, 32 byte lines
+      case 0x0E: l1 = 24; break;  // 0Eh   data L1 cache, 24 KB, 6 ways, 64 byte lines
+      case 0x10: l1 = 16; break;  // 10h   data L1 cache, 16 KB, 4 ways, 32 byte lines (IA-64)
+      case 0x15: l1 = 16; break;  // 15h   code L1 cache, 16 KB, 4 ways, 32 byte lines (IA-64)
+      case 0x2C: l1 = 32; break;  // 2Ch   data L1 cache, 32 KB, 8 ways, 64 byte lines
+      case 0x30: l1 = 32; break;  // 30h   code L1 cache, 32 KB, 8 ways, 64 byte lines
+      case 0x60: l1 = 16; break;  // 60h   data L1 cache, 16 KB, 8 ways, 64 byte lines, sectored
+      case 0x66: l1 = 8; break;   // 66h   data L1 cache, 8 KB, 4 ways, 64 byte lines, sectored
+      case 0x67: l1 = 16; break;  // 67h   data L1 cache, 16 KB, 4 ways, 64 byte lines, sectored
+      case 0x68: l1 = 32; break;  // 68h   data L1 cache, 32 KB, 4 ways, 64 byte lines, sectored
+      case 0x1A: l2 = 96; break;   // code and data L2 cache, 96 KB, 6 ways, 64 byte lines (IA-64)
+      case 0x22: l3 = 512; break;   // code and data L3 cache, 512 KB, 4 ways (!), 64 byte lines, dual-sectored
+      case 0x23: l3 = 1024; break;   // code and data L3 cache, 1024 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x25: l3 = 2048; break;   // code and data L3 cache, 2048 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x29: l3 = 4096; break;   // code and data L3 cache, 4096 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x39: l2 = 128; break;   // code and data L2 cache, 128 KB, 4 ways, 64 byte lines, sectored
+      case 0x3A: l2 = 192; break;   // code and data L2 cache, 192 KB, 6 ways, 64 byte lines, sectored
+      case 0x3B: l2 = 128; break;   // code and data L2 cache, 128 KB, 2 ways, 64 byte lines, sectored
+      case 0x3C: l2 = 256; break;   // code and data L2 cache, 256 KB, 4 ways, 64 byte lines, sectored
+      case 0x3D: l2 = 384; break;   // code and data L2 cache, 384 KB, 6 ways, 64 byte lines, sectored
+      case 0x3E: l2 = 512; break;   // code and data L2 cache, 512 KB, 4 ways, 64 byte lines, sectored
+      case 0x40: l2 = 0; break;   // no integrated L2 cache (P6 core) or L3 cache (P4 core)
+      case 0x41: l2 = 128; break;   // code and data L2 cache, 128 KB, 4 ways, 32 byte lines
+      case 0x42: l2 = 256; break;   // code and data L2 cache, 256 KB, 4 ways, 32 byte lines
+      case 0x43: l2 = 512; break;   // code and data L2 cache, 512 KB, 4 ways, 32 byte lines
+      case 0x44: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 4 ways, 32 byte lines
+      case 0x45: l2 = 2048; break;   // code and data L2 cache, 2048 KB, 4 ways, 32 byte lines
+      case 0x46: l3 = 4096; break;   // code and data L3 cache, 4096 KB, 4 ways, 64 byte lines
+      case 0x47: l3 = 8192; break;   // code and data L3 cache, 8192 KB, 8 ways, 64 byte lines
+      case 0x48: l2 = 3072; break;   // code and data L2 cache, 3072 KB, 12 ways, 64 byte lines
+      case 0x49: if(l2!=0) l3 = 4096; else {check_for_p2_core2=true; l3 = l2 = 4096;} break;// code and data L3 cache, 4096 KB, 16 ways, 64 byte lines (P4) or L2 for core2
+      case 0x4A: l3 = 6144; break;   // code and data L3 cache, 6144 KB, 12 ways, 64 byte lines
+      case 0x4B: l3 = 8192; break;   // code and data L3 cache, 8192 KB, 16 ways, 64 byte lines
+      case 0x4C: l3 = 12288; break;   // code and data L3 cache, 12288 KB, 12 ways, 64 byte lines
+      case 0x4D: l3 = 16384; break;   // code and data L3 cache, 16384 KB, 16 ways, 64 byte lines
+      case 0x4E: l2 = 6144; break;   // code and data L2 cache, 6144 KB, 24 ways, 64 byte lines
+      case 0x78: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 4 ways, 64 byte lines
+      case 0x79: l2 = 128; break;   // code and data L2 cache, 128 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x7A: l2 = 256; break;   // code and data L2 cache, 256 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x7B: l2 = 512; break;   // code and data L2 cache, 512 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x7C: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x7D: l2 = 2048; break;   // code and data L2 cache, 2048 KB, 8 ways, 64 byte lines
+      case 0x7E: l2 = 256; break;   // code and data L2 cache, 256 KB, 8 ways, 128 byte lines, sect. (IA-64)
+      case 0x7F: l2 = 512; break;   // code and data L2 cache, 512 KB, 2 ways, 64 byte lines
+      case 0x80: l2 = 512; break;   // code and data L2 cache, 512 KB, 8 ways, 64 byte lines
+      case 0x81: l2 = 128; break;   // code and data L2 cache, 128 KB, 8 ways, 32 byte lines
+      case 0x82: l2 = 256; break;   // code and data L2 cache, 256 KB, 8 ways, 32 byte lines
+      case 0x83: l2 = 512; break;   // code and data L2 cache, 512 KB, 8 ways, 32 byte lines
+      case 0x84: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 8 ways, 32 byte lines
+      case 0x85: l2 = 2048; break;   // code and data L2 cache, 2048 KB, 8 ways, 32 byte lines
+      case 0x86: l2 = 512; break;   // code and data L2 cache, 512 KB, 4 ways, 64 byte lines
+      case 0x87: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 8 ways, 64 byte lines
+      case 0x88: l3 = 2048; break;   // code and data L3 cache, 2048 KB, 4 ways, 64 byte lines (IA-64)
+      case 0x89: l3 = 4096; break;   // code and data L3 cache, 4096 KB, 4 ways, 64 byte lines (IA-64)
+      case 0x8A: l3 = 8192; break;   // code and data L3 cache, 8192 KB, 4 ways, 64 byte lines (IA-64)
+      case 0x8D: l3 = 3072; break;   // code and data L3 cache, 3072 KB, 12 ways, 128 byte lines (IA-64)
+
+      default: break;
+    }
+  }
+  if(check_for_p2_core2 && l2 == l3)
+    l3 = 0;
+  l1 *= 1024;
+  l2 *= 1024;
+  l3 *= 1024;
+}
+
+inline void queryCacheSizes_intel(int& l1, int& l2, int& l3, int max_std_funcs)
+{
+  if(max_std_funcs>=4)
+    queryCacheSizes_intel_direct(l1,l2,l3);
+  else
+    queryCacheSizes_intel_codes(l1,l2,l3);
+}
+
+inline void queryCacheSizes_amd(int& l1, int& l2, int& l3)
+{
+  int abcd[4];
+  abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
+  EIGEN_CPUID(abcd,0x80000005,0);
+  l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB
+  abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
+  EIGEN_CPUID(abcd,0x80000006,0);
+  l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB
+  l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB
+}
+#endif
+
+/** \internal
+ * Queries and returns the cache sizes in Bytes of the L1, L2, and L3 data caches respectively */
+inline void queryCacheSizes(int& l1, int& l2, int& l3)
+{
+  #ifdef EIGEN_CPUID
+  int abcd[4];
+  const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e};
+  const int AuthenticAMD[] = {0x68747541, 0x69746e65, 0x444d4163};
+  const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574}; // "AMDisbetter!"
+
+  // identify the CPU vendor
+  EIGEN_CPUID(abcd,0x0,0);
+  int max_std_funcs = abcd[1];
+  if(cpuid_is_vendor(abcd,GenuineIntel))
+    queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
+  else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_))
+    queryCacheSizes_amd(l1,l2,l3);
+  else
+    // by default let's use Intel's API
+    queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
+
+  // here is the list of other vendors:
+//   ||cpuid_is_vendor(abcd,"VIA VIA VIA ")
+//   ||cpuid_is_vendor(abcd,"CyrixInstead")
+//   ||cpuid_is_vendor(abcd,"CentaurHauls")
+//   ||cpuid_is_vendor(abcd,"GenuineTMx86")
+//   ||cpuid_is_vendor(abcd,"TransmetaCPU")
+//   ||cpuid_is_vendor(abcd,"RiseRiseRise")
+//   ||cpuid_is_vendor(abcd,"Geode by NSC")
+//   ||cpuid_is_vendor(abcd,"SiS SiS SiS ")
+//   ||cpuid_is_vendor(abcd,"UMC UMC UMC ")
+//   ||cpuid_is_vendor(abcd,"NexGenDriven")
+  #else
+  l1 = l2 = l3 = -1;
+  #endif
+}
+
+/** \internal
+ * \returns the size in Bytes of the L1 data cache */
+inline int queryL1CacheSize()
+{
+  int l1(-1), l2, l3;
+  queryCacheSizes(l1,l2,l3);
+  return l1;
+}
+
+/** \internal
+ * \returns the size in Bytes of the L2 or L3 cache if this later is present */
+inline int queryTopLevelCacheSize()
+{
+  int l1, l2(-1), l3(-1);
+  queryCacheSizes(l1,l2,l3);
+  return (std::max)(l2,l3);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MEMORY_H
diff --git a/include/eigen3/Eigen/src/Core/util/Meta.h b/include/eigen3/Eigen/src/Core/util/Meta.h
new file mode 100644
index 0000000..71d5871
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/util/Meta.h
@@ -0,0 +1,243 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_META_H
+#define EIGEN_META_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+  * \file Meta.h
+  * This file contains generic metaprogramming classes which are not specifically related to Eigen.
+  * \note In case you wonder, yes we're aware that Boost already provides all these features,
+  * we however don't want to add a dependency to Boost.
+  */
+
+struct true_type {  enum { value = 1 }; };
+struct false_type { enum { value = 0 }; };
+
+template<bool Condition, typename Then, typename Else>
+struct conditional { typedef Then type; };
+
+template<typename Then, typename Else>
+struct conditional <false, Then, Else> { typedef Else type; };
+
+template<typename T, typename U> struct is_same { enum { value = 0 }; };
+template<typename T> struct is_same<T,T> { enum { value = 1 }; };
+
+template<typename T> struct remove_reference { typedef T type; };
+template<typename T> struct remove_reference<T&> { typedef T type; };
+
+template<typename T> struct remove_pointer { typedef T type; };
+template<typename T> struct remove_pointer<T*> { typedef T type; };
+template<typename T> struct remove_pointer<T*const> { typedef T type; };
+
+template <class T> struct remove_const { typedef T type; };
+template <class T> struct remove_const<const T> { typedef T type; };
+template <class T> struct remove_const<const T[]> { typedef T type[]; };
+template <class T, unsigned int Size> struct remove_const<const T[Size]> { typedef T type[Size]; };
+
+template<typename T> struct remove_all { typedef T type; };
+template<typename T> struct remove_all<const T>   { typedef typename remove_all<T>::type type; };
+template<typename T> struct remove_all<T const&>  { typedef typename remove_all<T>::type type; };
+template<typename T> struct remove_all<T&>        { typedef typename remove_all<T>::type type; };
+template<typename T> struct remove_all<T const*>  { typedef typename remove_all<T>::type type; };
+template<typename T> struct remove_all<T*>        { typedef typename remove_all<T>::type type; };
+
+template<typename T> struct is_arithmetic      { enum { value = false }; };
+template<> struct is_arithmetic<float>         { enum { value = true }; };
+template<> struct is_arithmetic<double>        { enum { value = true }; };
+template<> struct is_arithmetic<long double>   { enum { value = true }; };
+template<> struct is_arithmetic<bool>          { enum { value = true }; };
+template<> struct is_arithmetic<char>          { enum { value = true }; };
+template<> struct is_arithmetic<signed char>   { enum { value = true }; };
+template<> struct is_arithmetic<unsigned char> { enum { value = true }; };
+template<> struct is_arithmetic<signed short>  { enum { value = true }; };
+template<> struct is_arithmetic<unsigned short>{ enum { value = true }; };
+template<> struct is_arithmetic<signed int>    { enum { value = true }; };
+template<> struct is_arithmetic<unsigned int>  { enum { value = true }; };
+template<> struct is_arithmetic<signed long>   { enum { value = true }; };
+template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
+
+template <typename T> struct add_const { typedef const T type; };
+template <typename T> struct add_const<T&> { typedef T& type; };
+
+template <typename T> struct is_const { enum { value = 0 }; };
+template <typename T> struct is_const<T const> { enum { value = 1 }; };
+
+template<typename T> struct add_const_on_value_type            { typedef const T type;  };
+template<typename T> struct add_const_on_value_type<T&>        { typedef T const& type; };
+template<typename T> struct add_const_on_value_type<T*>        { typedef T const* type; };
+template<typename T> struct add_const_on_value_type<T* const>  { typedef T const* const type; };
+template<typename T> struct add_const_on_value_type<T const* const>  { typedef T const* const type; };
+
+/** \internal Allows to enable/disable an overload
+  * according to a compile time condition.
+  */
+template<bool Condition, typename T> struct enable_if;
+
+template<typename T> struct enable_if<true,T>
+{ typedef T type; };
+
+
+
+/** \internal
+  * A base class do disable default copy ctor and copy assignement operator.
+  */
+class noncopyable
+{
+  noncopyable(const noncopyable&);
+  const noncopyable& operator=(const noncopyable&);
+protected:
+  noncopyable() {}
+  ~noncopyable() {}
+};
+
+
+/** \internal
+  * Convenient struct to get the result type of a unary or binary functor.
+  *
+  * It supports both the current STL mechanism (using the result_type member) as well as
+  * upcoming next STL generation (using a templated result member).
+  * If none of these members is provided, then the type of the first argument is returned. FIXME, that behavior is a pretty bad hack.
+  */
+template<typename T> struct result_of {};
+
+struct has_none {int a[1];};
+struct has_std_result_type {int a[2];};
+struct has_tr1_result {int a[3];};
+
+template<typename Func, typename ArgType, int SizeOf=sizeof(has_none)>
+struct unary_result_of_select {typedef ArgType type;};
+
+template<typename Func, typename ArgType>
+struct unary_result_of_select<Func, ArgType, sizeof(has_std_result_type)> {typedef typename Func::result_type type;};
+
+template<typename Func, typename ArgType>
+struct unary_result_of_select<Func, ArgType, sizeof(has_tr1_result)> {typedef typename Func::template result<Func(ArgType)>::type type;};
+
+template<typename Func, typename ArgType>
+struct result_of<Func(ArgType)> {
+    template<typename T>
+    static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
+    template<typename T>
+    static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
+    static has_none            testFunctor(...);
+
+    // note that the following indirection is needed for gcc-3.3
+    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
+    typedef typename unary_result_of_select<Func, ArgType, FunctorType>::type type;
+};
+
+template<typename Func, typename ArgType0, typename ArgType1, int SizeOf=sizeof(has_none)>
+struct binary_result_of_select {typedef ArgType0 type;};
+
+template<typename Func, typename ArgType0, typename ArgType1>
+struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_std_result_type)>
+{typedef typename Func::result_type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1>
+struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_tr1_result)>
+{typedef typename Func::template result<Func(ArgType0,ArgType1)>::type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1>
+struct result_of<Func(ArgType0,ArgType1)> {
+    template<typename T>
+    static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
+    template<typename T>
+    static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
+    static has_none            testFunctor(...);
+
+    // note that the following indirection is needed for gcc-3.3
+    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
+    typedef typename binary_result_of_select<Func, ArgType0, ArgType1, FunctorType>::type type;
+};
+
+/** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
+  * Usage example: \code meta_sqrt<1023>::ret \endcode
+  */
+template<int Y,
+         int InfX = 0,
+         int SupX = ((Y==1) ? 1 : Y/2),
+         bool Done = ((SupX-InfX)<=1 ? true : ((SupX*SupX <= Y) && ((SupX+1)*(SupX+1) > Y))) >
+                                // use ?: instead of || just to shut up a stupid gcc 4.3 warning
+class meta_sqrt
+{
+    enum {
+      MidX = (InfX+SupX)/2,
+      TakeInf = MidX*MidX > Y ? 1 : 0,
+      NewInf = int(TakeInf) ? InfX : int(MidX),
+      NewSup = int(TakeInf) ? int(MidX) : SupX
+    };
+  public:
+    enum { ret = meta_sqrt<Y,NewInf,NewSup>::ret };
+};
+
+template<int Y, int InfX, int SupX>
+class meta_sqrt<Y, InfX, SupX, true> { public:  enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; };
+
+/** \internal determines whether the product of two numeric types is allowed and what the return type is */
+template<typename T, typename U> struct scalar_product_traits
+{
+  enum { Defined = 0 };
+};
+
+template<typename T> struct scalar_product_traits<T,T>
+{
+  enum {
+    // Cost = NumTraits<T>::MulCost,
+    Defined = 1
+  };
+  typedef T ReturnType;
+};
+
+template<typename T> struct scalar_product_traits<T,std::complex<T> >
+{
+  enum {
+    // Cost = 2*NumTraits<T>::MulCost,
+    Defined = 1
+  };
+  typedef std::complex<T> ReturnType;
+};
+
+template<typename T> struct scalar_product_traits<std::complex<T>, T>
+{
+  enum {
+    // Cost = 2*NumTraits<T>::MulCost,
+    Defined = 1
+  };
+  typedef std::complex<T> ReturnType;
+};
+
+// FIXME quick workaround around current limitation of result_of
+// template<typename Scalar, typename ArgType0, typename ArgType1>
+// struct result_of<scalar_product_op<Scalar>(ArgType0,ArgType1)> {
+// typedef typename scalar_product_traits<typename remove_all<ArgType0>::type, typename remove_all<ArgType1>::type>::ReturnType type;
+// };
+
+template<typename T> struct is_diagonal
+{ enum { ret = false }; };
+
+template<typename T> struct is_diagonal<DiagonalBase<T> >
+{ enum { ret = true }; };
+
+template<typename T> struct is_diagonal<DiagonalWrapper<T> >
+{ enum { ret = true }; };
+
+template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
+{ enum { ret = true }; };
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_META_H
diff --git a/include/eigen3/Eigen/src/Core/util/NonMPL2.h b/include/eigen3/Eigen/src/Core/util/NonMPL2.h
new file mode 100644
index 0000000..1af67cf
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/util/NonMPL2.h
@@ -0,0 +1,3 @@
+#ifdef EIGEN_MPL2_ONLY
+#error Including non-MPL2 code in EIGEN_MPL2_ONLY mode
+#endif
diff --git a/include/eigen3/Eigen/src/Core/util/ReenableStupidWarnings.h b/include/eigen3/Eigen/src/Core/util/ReenableStupidWarnings.h
new file mode 100644
index 0000000..5ddfbd4
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/util/ReenableStupidWarnings.h
@@ -0,0 +1,14 @@
+#ifdef EIGEN_WARNINGS_DISABLED
+#undef EIGEN_WARNINGS_DISABLED
+
+#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+  #ifdef _MSC_VER
+    #pragma warning( pop )
+  #elif defined __INTEL_COMPILER
+    #pragma warning pop
+  #elif defined __clang__
+    #pragma clang diagnostic pop
+  #endif
+#endif
+
+#endif // EIGEN_WARNINGS_DISABLED
diff --git a/include/eigen3/Eigen/src/Core/util/StaticAssert.h b/include/eigen3/Eigen/src/Core/util/StaticAssert.h
new file mode 100644
index 0000000..bac5d9f
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/util/StaticAssert.h
@@ -0,0 +1,208 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STATIC_ASSERT_H
+#define EIGEN_STATIC_ASSERT_H
+
+/* Some notes on Eigen's static assertion mechanism:
+ *
+ *  - in EIGEN_STATIC_ASSERT(CONDITION,MSG) the parameter CONDITION must be a compile time boolean
+ *    expression, and MSG an enum listed in struct internal::static_assertion<true>
+ *
+ *  - define EIGEN_NO_STATIC_ASSERT to disable them (and save compilation time)
+ *    in that case, the static assertion is converted to the following runtime assert:
+ *      eigen_assert(CONDITION && "MSG")
+ *
+ *  - currently EIGEN_STATIC_ASSERT can only be used in function scope
+ *
+ */
+
+#ifndef EIGEN_NO_STATIC_ASSERT
+
+  #if defined(__GXX_EXPERIMENTAL_CXX0X__) || (defined(_MSC_VER) && (_MSC_VER >= 1600))
+
+    // if native static_assert is enabled, let's use it
+    #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG);
+
+  #else // not CXX0X
+
+    namespace Eigen {
+
+    namespace internal {
+
+    template<bool condition>
+    struct static_assertion {};
+
+    template<>
+    struct static_assertion<true>
+    {
+      enum {
+        YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX,
+        YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES,
+        YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES,
+        THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE,
+        THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE,
+        THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE,
+        YOU_MADE_A_PROGRAMMING_MISTAKE,
+        EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT,
+        EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE,
+        YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR,
+        YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR,
+        UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC,
+        THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES,
+        FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED,
+        NUMERIC_TYPE_MUST_BE_REAL,
+        COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED,
+        WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED,
+        THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE,
+        INVALID_MATRIX_PRODUCT,
+        INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS,
+        INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION,
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY,
+        THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES,
+        THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES,
+        INVALID_MATRIX_TEMPLATE_PARAMETERS,
+        INVALID_MATRIXBASE_TEMPLATE_PARAMETERS,
+        BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER,
+        THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX,
+        THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE,
+        THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES,
+        YOU_ALREADY_SPECIFIED_THIS_STRIDE,
+        INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION,
+        THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD,
+        PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1,
+        THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS,
+        YOU_CANNOT_MIX_ARRAYS_AND_MATRICES,
+        YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION,
+        THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY,
+        YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT,
+        THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS,
+        THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL,
+        THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES,
+        YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED,
+        YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED,
+        THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE,
+        THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH,
+        OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG,
+        IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY,
+        STORAGE_LAYOUT_DOES_NOT_MATCH
+      };
+    };
+
+    } // end namespace internal
+
+    } // end namespace Eigen
+
+    // Specialized implementation for MSVC to avoid "conditional
+    // expression is constant" warnings.  This implementation doesn't
+    // appear to work under GCC, hence the multiple implementations.
+    #ifdef _MSC_VER
+
+      #define EIGEN_STATIC_ASSERT(CONDITION,MSG) \
+        {Eigen::internal::static_assertion<bool(CONDITION)>::MSG;}
+
+    #else
+
+      #define EIGEN_STATIC_ASSERT(CONDITION,MSG) \
+        if (Eigen::internal::static_assertion<bool(CONDITION)>::MSG) {}
+
+    #endif
+
+  #endif // not CXX0X
+
+#else // EIGEN_NO_STATIC_ASSERT
+
+  #define EIGEN_STATIC_ASSERT(CONDITION,MSG) eigen_assert((CONDITION) && #MSG);
+
+#endif // EIGEN_NO_STATIC_ASSERT
+
+
+// static assertion failing if the type \a TYPE is not a vector type
+#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \
+  EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime, \
+                      YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX)
+
+// static assertion failing if the type \a TYPE is not fixed-size
+#define EIGEN_STATIC_ASSERT_FIXED_SIZE(TYPE) \
+  EIGEN_STATIC_ASSERT(TYPE::SizeAtCompileTime!=Eigen::Dynamic, \
+                      YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR)
+
+// static assertion failing if the type \a TYPE is not dynamic-size
+#define EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(TYPE) \
+  EIGEN_STATIC_ASSERT(TYPE::SizeAtCompileTime==Eigen::Dynamic, \
+                      YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR)
+
+// static assertion failing if the type \a TYPE is not a vector type of the given size
+#define EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(TYPE, SIZE) \
+  EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime && TYPE::SizeAtCompileTime==SIZE, \
+                      THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE)
+
+// static assertion failing if the type \a TYPE is not a vector type of the given size
+#define EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(TYPE, ROWS, COLS) \
+  EIGEN_STATIC_ASSERT(TYPE::RowsAtCompileTime==ROWS && TYPE::ColsAtCompileTime==COLS, \
+                      THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE)
+
+// static assertion failing if the two vector expression types are not compatible (same fixed-size or dynamic size)
+#define EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(TYPE0,TYPE1) \
+  EIGEN_STATIC_ASSERT( \
+      (int(TYPE0::SizeAtCompileTime)==Eigen::Dynamic \
+    || int(TYPE1::SizeAtCompileTime)==Eigen::Dynamic \
+    || int(TYPE0::SizeAtCompileTime)==int(TYPE1::SizeAtCompileTime)),\
+    YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES)
+
+#define EIGEN_PREDICATE_SAME_MATRIX_SIZE(TYPE0,TYPE1) \
+     ( \
+        (int(TYPE0::SizeAtCompileTime)==0 && int(TYPE1::SizeAtCompileTime)==0) \
+    || (\
+          (int(TYPE0::RowsAtCompileTime)==Eigen::Dynamic \
+        || int(TYPE1::RowsAtCompileTime)==Eigen::Dynamic \
+        || int(TYPE0::RowsAtCompileTime)==int(TYPE1::RowsAtCompileTime)) \
+      &&  (int(TYPE0::ColsAtCompileTime)==Eigen::Dynamic \
+        || int(TYPE1::ColsAtCompileTime)==Eigen::Dynamic \
+        || int(TYPE0::ColsAtCompileTime)==int(TYPE1::ColsAtCompileTime))\
+       ) \
+     )
+
+#ifdef EIGEN2_SUPPORT
+  #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
+    eigen_assert(!NumTraits<Scalar>::IsInteger);
+#else
+  #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
+    EIGEN_STATIC_ASSERT(!NumTraits<TYPE>::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
+#endif
+
+
+// static assertion failing if it is guaranteed at compile-time that the two matrix expression types have different sizes
+#define EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(TYPE0,TYPE1) \
+  EIGEN_STATIC_ASSERT( \
+     EIGEN_PREDICATE_SAME_MATRIX_SIZE(TYPE0,TYPE1),\
+    YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES)
+
+#define EIGEN_STATIC_ASSERT_SIZE_1x1(TYPE) \
+      EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Dynamic) && \
+                          (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Dynamic), \
+                          THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS)
+
+#define EIGEN_STATIC_ASSERT_LVALUE(Derived) \
+      EIGEN_STATIC_ASSERT(internal::is_lvalue<Derived>::value, \
+                          THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY)
+
+#define EIGEN_STATIC_ASSERT_ARRAYXPR(Derived) \
+      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Derived>::XprKind, ArrayXpr>::value), \
+                          THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES)
+
+#define EIGEN_STATIC_ASSERT_SAME_XPR_KIND(Derived1, Derived2) \
+      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Derived1>::XprKind, \
+                                             typename internal::traits<Derived2>::XprKind \
+                                            >::value), \
+                          YOU_CANNOT_MIX_ARRAYS_AND_MATRICES)
+
+
+#endif // EIGEN_STATIC_ASSERT_H
diff --git a/include/eigen3/Eigen/src/Core/util/XprHelper.h b/include/eigen3/Eigen/src/Core/util/XprHelper.h
new file mode 100644
index 0000000..781965d
--- /dev/null
+++ b/include/eigen3/Eigen/src/Core/util/XprHelper.h
@@ -0,0 +1,469 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_XPRHELPER_H
+#define EIGEN_XPRHELPER_H
+
+// just a workaround because GCC seems to not really like empty structs
+// FIXME: gcc 4.3 generates bad code when strict-aliasing is enabled
+// so currently we simply disable this optimization for gcc 4.3
+#if (defined __GNUG__) && !((__GNUC__==4) && (__GNUC_MINOR__==3))
+  #define EIGEN_EMPTY_STRUCT_CTOR(X) \
+    EIGEN_STRONG_INLINE X() {} \
+    EIGEN_STRONG_INLINE X(const X& ) {}
+#else
+  #define EIGEN_EMPTY_STRUCT_CTOR(X)
+#endif
+
+namespace Eigen {
+
+typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
+
+namespace internal {
+
+//classes inheriting no_assignment_operator don't generate a default operator=.
+class no_assignment_operator
+{
+  private:
+    no_assignment_operator& operator=(const no_assignment_operator&);
+};
+
+/** \internal return the index type with the largest number of bits */
+template<typename I1, typename I2>
+struct promote_index_type
+{
+  typedef typename conditional<(sizeof(I1)<sizeof(I2)), I2, I1>::type type;
+};
+
+/** \internal If the template parameter Value is Dynamic, this class is just a wrapper around a T variable that
+  * can be accessed using value() and setValue().
+  * Otherwise, this class is an empty structure and value() just returns the template parameter Value.
+  */
+template<typename T, int Value> class variable_if_dynamic
+{
+  public:
+    EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamic)
+    explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); assert(v == T(Value)); }
+    static T value() { return T(Value); }
+    void setValue(T) {}
+};
+
+template<typename T> class variable_if_dynamic<T, Dynamic>
+{
+    T m_value;
+    variable_if_dynamic() { assert(false); }
+  public:
+    explicit variable_if_dynamic(T value) : m_value(value) {}
+    T value() const { return m_value; }
+    void setValue(T value) { m_value = value; }
+};
+
+/** \internal like variable_if_dynamic but for DynamicIndex
+  */
+template<typename T, int Value> class variable_if_dynamicindex
+{
+  public:
+    EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamicindex)
+    explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); assert(v == T(Value)); }
+    static T value() { return T(Value); }
+    void setValue(T) {}
+};
+
+template<typename T> class variable_if_dynamicindex<T, DynamicIndex>
+{
+    T m_value;
+    variable_if_dynamicindex() { assert(false); }
+  public:
+    explicit variable_if_dynamicindex(T value) : m_value(value) {}
+    T value() const { return m_value; }
+    void setValue(T value) { m_value = value; }
+};
+
+template<typename T> struct functor_traits
+{
+  enum
+  {
+    Cost = 10,
+    PacketAccess = false,
+    IsRepeatable = false
+  };
+};
+
+template<typename T> struct packet_traits;
+
+template<typename T> struct unpacket_traits
+{
+  typedef T type;
+  enum {size=1};
+};
+
+template<typename _Scalar, int _Rows, int _Cols,
+         int _Options = AutoAlign |
+                          ( (_Rows==1 && _Cols!=1) ? RowMajor
+                          : (_Cols==1 && _Rows!=1) ? ColMajor
+                          : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ),
+         int _MaxRows = _Rows,
+         int _MaxCols = _Cols
+> class make_proper_matrix_type
+{
+    enum {
+      IsColVector = _Cols==1 && _Rows!=1,
+      IsRowVector = _Rows==1 && _Cols!=1,
+      Options = IsColVector ? (_Options | ColMajor) & ~RowMajor
+              : IsRowVector ? (_Options | RowMajor) & ~ColMajor
+              : _Options
+    };
+  public:
+    typedef Matrix<_Scalar, _Rows, _Cols, Options, _MaxRows, _MaxCols> type;
+};
+
+template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+class compute_matrix_flags
+{
+    enum {
+      row_major_bit = Options&RowMajor ? RowMajorBit : 0,
+      is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic,
+
+      aligned_bit =
+      (
+            ((Options&DontAlign)==0)
+        && (
+#if EIGEN_ALIGN_STATICALLY
+             ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % 16) == 0))
+#else
+             0
+#endif
+
+          ||
+
+#if EIGEN_ALIGN
+             is_dynamic_size_storage
+#else
+             0
+#endif
+
+          )
+      ) ? AlignedBit : 0,
+      packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0
+    };
+
+  public:
+    enum { ret = LinearAccessBit | LvalueBit | DirectAccessBit | NestByRefBit | packet_access_bit | row_major_bit | aligned_bit };
+};
+
+template<int _Rows, int _Cols> struct size_at_compile_time
+{
+  enum { ret = (_Rows==Dynamic || _Cols==Dynamic) ? Dynamic : _Rows * _Cols };
+};
+
+/* plain_matrix_type : the difference from eval is that plain_matrix_type is always a plain matrix type,
+ * whereas eval is a const reference in the case of a matrix
+ */
+
+template<typename T, typename StorageKind = typename traits<T>::StorageKind> struct plain_matrix_type;
+template<typename T, typename BaseClassType> struct plain_matrix_type_dense;
+template<typename T> struct plain_matrix_type<T,Dense>
+{
+  typedef typename plain_matrix_type_dense<T,typename traits<T>::XprKind>::type type;
+};
+
+template<typename T> struct plain_matrix_type_dense<T,MatrixXpr>
+{
+  typedef Matrix<typename traits<T>::Scalar,
+                traits<T>::RowsAtCompileTime,
+                traits<T>::ColsAtCompileTime,
+                AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                traits<T>::MaxRowsAtCompileTime,
+                traits<T>::MaxColsAtCompileTime
+          > type;
+};
+
+template<typename T> struct plain_matrix_type_dense<T,ArrayXpr>
+{
+  typedef Array<typename traits<T>::Scalar,
+                traits<T>::RowsAtCompileTime,
+                traits<T>::ColsAtCompileTime,
+                AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                traits<T>::MaxRowsAtCompileTime,
+                traits<T>::MaxColsAtCompileTime
+          > type;
+};
+
+/* eval : the return type of eval(). For matrices, this is just a const reference
+ * in order to avoid a useless copy
+ */
+
+template<typename T, typename StorageKind = typename traits<T>::StorageKind> struct eval;
+
+template<typename T> struct eval<T,Dense>
+{
+  typedef typename plain_matrix_type<T>::type type;
+//   typedef typename T::PlainObject type;
+//   typedef T::Matrix<typename traits<T>::Scalar,
+//                 traits<T>::RowsAtCompileTime,
+//                 traits<T>::ColsAtCompileTime,
+//                 AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
+//                 traits<T>::MaxRowsAtCompileTime,
+//                 traits<T>::MaxColsAtCompileTime
+//           > type;
+};
+
+// for matrices, no need to evaluate, just use a const reference to avoid a useless copy
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct eval<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>, Dense>
+{
+  typedef const Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>& type;
+};
+
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct eval<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>, Dense>
+{
+  typedef const Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>& type;
+};
+
+
+
+/* plain_matrix_type_column_major : same as plain_matrix_type but guaranteed to be column-major
+ */
+template<typename T> struct plain_matrix_type_column_major
+{
+  enum { Rows = traits<T>::RowsAtCompileTime,
+         Cols = traits<T>::ColsAtCompileTime,
+         MaxRows = traits<T>::MaxRowsAtCompileTime,
+         MaxCols = traits<T>::MaxColsAtCompileTime
+  };
+  typedef Matrix<typename traits<T>::Scalar,
+                Rows,
+                Cols,
+                (MaxRows==1&&MaxCols!=1) ? RowMajor : ColMajor,
+                MaxRows,
+                MaxCols
+          > type;
+};
+
+/* plain_matrix_type_row_major : same as plain_matrix_type but guaranteed to be row-major
+ */
+template<typename T> struct plain_matrix_type_row_major
+{
+  enum { Rows = traits<T>::RowsAtCompileTime,
+         Cols = traits<T>::ColsAtCompileTime,
+         MaxRows = traits<T>::MaxRowsAtCompileTime,
+         MaxCols = traits<T>::MaxColsAtCompileTime
+  };
+  typedef Matrix<typename traits<T>::Scalar,
+                Rows,
+                Cols,
+                (MaxCols==1&&MaxRows!=1) ? RowMajor : ColMajor,
+                MaxRows,
+                MaxCols
+          > type;
+};
+
+// we should be able to get rid of this one too
+template<typename T> struct must_nest_by_value { enum { ret = false }; };
+
+/** \internal The reference selector for template expressions. The idea is that we don't
+  * need to use references for expressions since they are light weight proxy
+  * objects which should generate no copying overhead. */
+template <typename T>
+struct ref_selector
+{
+  typedef typename conditional<
+    bool(traits<T>::Flags & NestByRefBit),
+    T const&,
+    const T
+  >::type type;
+};
+
+/** \internal Adds the const qualifier on the value-type of T2 if and only if T1 is a const type */
+template<typename T1, typename T2>
+struct transfer_constness
+{
+  typedef typename conditional<
+    bool(internal::is_const<T1>::value),
+    typename internal::add_const_on_value_type<T2>::type,
+    T2
+  >::type type;
+};
+
+/** \internal Determines how a given expression should be nested into another one.
+  * For example, when you do a * (b+c), Eigen will determine how the expression b+c should be
+  * nested into the bigger product expression. The choice is between nesting the expression b+c as-is, or
+  * evaluating that expression b+c into a temporary variable d, and nest d so that the resulting expression is
+  * a*d. Evaluating can be beneficial for example if every coefficient access in the resulting expression causes
+  * many coefficient accesses in the nested expressions -- as is the case with matrix product for example.
+  *
+  * \param T the type of the expression being nested
+  * \param n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression.
+  *
+  * Note that if no evaluation occur, then the constness of T is preserved.
+  *
+  * Example. Suppose that a, b, and c are of type Matrix3d. The user forms the expression a*(b+c).
+  * b+c is an expression "sum of matrices", which we will denote by S. In order to determine how to nest it,
+  * the Product expression uses: nested<S, 3>::ret, which turns out to be Matrix3d because the internal logic of
+  * nested determined that in this case it was better to evaluate the expression b+c into a temporary. On the other hand,
+  * since a is of type Matrix3d, the Product expression nests it as nested<Matrix3d, 3>::ret, which turns out to be
+  * const Matrix3d&, because the internal logic of nested determined that since a was already a matrix, there was no point
+  * in copying it into another matrix.
+  */
+template<typename T, int n=1, typename PlainObject = typename eval<T>::type> struct nested
+{
+  enum {
+    // for the purpose of this test, to keep it reasonably simple, we arbitrarily choose a value of Dynamic values.
+    // the choice of 10000 makes it larger than any practical fixed value and even most dynamic values.
+    // in extreme cases where these assumptions would be wrong, we would still at worst suffer performance issues
+    // (poor choice of temporaries).
+    // it's important that this value can still be squared without integer overflowing.
+    DynamicAsInteger = 10000,
+    ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
+    ScalarReadCostAsInteger = ScalarReadCost == Dynamic ? int(DynamicAsInteger) : int(ScalarReadCost),
+    CoeffReadCost = traits<T>::CoeffReadCost,
+    CoeffReadCostAsInteger = CoeffReadCost == Dynamic ? int(DynamicAsInteger) : int(CoeffReadCost),
+    NAsInteger = n == Dynamic ? int(DynamicAsInteger) : n,
+    CostEvalAsInteger   = (NAsInteger+1) * ScalarReadCostAsInteger + CoeffReadCostAsInteger,
+    CostNoEvalAsInteger = NAsInteger * CoeffReadCostAsInteger
+  };
+
+  typedef typename conditional<
+      ( (int(traits<T>::Flags) & EvalBeforeNestingBit) ||
+        int(CostEvalAsInteger) < int(CostNoEvalAsInteger)
+      ),
+      PlainObject,
+      typename ref_selector<T>::type
+  >::type type;
+};
+
+template<typename T>
+inline T* const_cast_ptr(const T* ptr)
+{
+  return const_cast<T*>(ptr);
+}
+
+template<typename Derived, typename XprKind = typename traits<Derived>::XprKind>
+struct dense_xpr_base
+{
+  /* dense_xpr_base should only ever be used on dense expressions, thus falling either into the MatrixXpr or into the ArrayXpr cases */
+};
+
+template<typename Derived>
+struct dense_xpr_base<Derived, MatrixXpr>
+{
+  typedef MatrixBase<Derived> type;
+};
+
+template<typename Derived>
+struct dense_xpr_base<Derived, ArrayXpr>
+{
+  typedef ArrayBase<Derived> type;
+};
+
+/** \internal Helper base class to add a scalar multiple operator
+  * overloads for complex types */
+template<typename Derived,typename Scalar,typename OtherScalar,
+         bool EnableIt = !is_same<Scalar,OtherScalar>::value >
+struct special_scalar_op_base : public DenseCoeffsBase<Derived>
+{
+  // dummy operator* so that the
+  // "using special_scalar_op_base::operator*" compiles
+  void operator*() const;
+};
+
+template<typename Derived,typename Scalar,typename OtherScalar>
+struct special_scalar_op_base<Derived,Scalar,OtherScalar,true>  : public DenseCoeffsBase<Derived>
+{
+  const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
+  operator*(const OtherScalar& scalar) const
+  {
+    return CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
+      (*static_cast<const Derived*>(this), scalar_multiple2_op<Scalar,OtherScalar>(scalar));
+  }
+
+  inline friend const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
+  operator*(const OtherScalar& scalar, const Derived& matrix)
+  { return static_cast<const special_scalar_op_base&>(matrix).operator*(scalar); }
+};
+
+template<typename XprType, typename CastType> struct cast_return_type
+{
+  typedef typename XprType::Scalar CurrentScalarType;
+  typedef typename remove_all<CastType>::type _CastType;
+  typedef typename _CastType::Scalar NewScalarType;
+  typedef typename conditional<is_same<CurrentScalarType,NewScalarType>::value,
+                              const XprType&,CastType>::type type;
+};
+
+template <typename A, typename B> struct promote_storage_type;
+
+template <typename A> struct promote_storage_type<A,A>
+{
+  typedef A ret;
+};
+
+/** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type.
+  * \param Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType.
+  */
+template<typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
+struct plain_row_type
+{
+  typedef Matrix<Scalar, 1, ExpressionType::ColsAtCompileTime,
+                 ExpressionType::PlainObject::Options | RowMajor, 1, ExpressionType::MaxColsAtCompileTime> MatrixRowType;
+  typedef Array<Scalar, 1, ExpressionType::ColsAtCompileTime,
+                 ExpressionType::PlainObject::Options | RowMajor, 1, ExpressionType::MaxColsAtCompileTime> ArrayRowType;
+
+  typedef typename conditional<
+    is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
+    MatrixRowType,
+    ArrayRowType 
+  >::type type;
+};
+
+template<typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
+struct plain_col_type
+{
+  typedef Matrix<Scalar, ExpressionType::RowsAtCompileTime, 1,
+                 ExpressionType::PlainObject::Options & ~RowMajor, ExpressionType::MaxRowsAtCompileTime, 1> MatrixColType;
+  typedef Array<Scalar, ExpressionType::RowsAtCompileTime, 1,
+                 ExpressionType::PlainObject::Options & ~RowMajor, ExpressionType::MaxRowsAtCompileTime, 1> ArrayColType;
+
+  typedef typename conditional<
+    is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
+    MatrixColType,
+    ArrayColType 
+  >::type type;
+};
+
+template<typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
+struct plain_diag_type
+{
+  enum { diag_size = EIGEN_SIZE_MIN_PREFER_DYNAMIC(ExpressionType::RowsAtCompileTime, ExpressionType::ColsAtCompileTime),
+         max_diag_size = EIGEN_SIZE_MIN_PREFER_FIXED(ExpressionType::MaxRowsAtCompileTime, ExpressionType::MaxColsAtCompileTime)
+  };
+  typedef Matrix<Scalar, diag_size, 1, ExpressionType::PlainObject::Options & ~RowMajor, max_diag_size, 1> MatrixDiagType;
+  typedef Array<Scalar, diag_size, 1, ExpressionType::PlainObject::Options & ~RowMajor, max_diag_size, 1> ArrayDiagType;
+
+  typedef typename conditional<
+    is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
+    MatrixDiagType,
+    ArrayDiagType 
+  >::type type;
+};
+
+template<typename ExpressionType>
+struct is_lvalue
+{
+  enum { value = !bool(is_const<ExpressionType>::value) &&
+                 bool(traits<ExpressionType>::Flags & LvalueBit) };
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_XPRHELPER_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Block.h b/include/eigen3/Eigen/src/Eigen2Support/Block.h
new file mode 100644
index 0000000..604456f
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Block.h
@@ -0,0 +1,126 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BLOCK2_H
+#define EIGEN_BLOCK2_H
+
+namespace Eigen { 
+
+/** \returns a dynamic-size expression of a corner of *this.
+  *
+  * \param type the type of corner. Can be \a Eigen::TopLeft, \a Eigen::TopRight,
+  * \a Eigen::BottomLeft, \a Eigen::BottomRight.
+  * \param cRows the number of rows in the corner
+  * \param cCols the number of columns in the corner
+  *
+  * Example: \include MatrixBase_corner_enum_int_int.cpp
+  * Output: \verbinclude MatrixBase_corner_enum_int_int.out
+  *
+  * \note Even though the returned expression has dynamic size, in the case
+  * when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
+  * which means that evaluating it does not cause a dynamic memory allocation.
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+template<typename Derived>
+inline Block<Derived> DenseBase<Derived>
+  ::corner(CornerType type, Index cRows, Index cCols)
+{
+  switch(type)
+  {
+    default:
+      eigen_assert(false && "Bad corner type.");
+    case TopLeft:
+      return Block<Derived>(derived(), 0, 0, cRows, cCols);
+    case TopRight:
+      return Block<Derived>(derived(), 0, cols() - cCols, cRows, cCols);
+    case BottomLeft:
+      return Block<Derived>(derived(), rows() - cRows, 0, cRows, cCols);
+    case BottomRight:
+      return Block<Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+  }
+}
+
+/** This is the const version of corner(CornerType, Index, Index).*/
+template<typename Derived>
+inline const Block<Derived>
+DenseBase<Derived>::corner(CornerType type, Index cRows, Index cCols) const
+{
+  switch(type)
+  {
+    default:
+      eigen_assert(false && "Bad corner type.");
+    case TopLeft:
+      return Block<Derived>(derived(), 0, 0, cRows, cCols);
+    case TopRight:
+      return Block<Derived>(derived(), 0, cols() - cCols, cRows, cCols);
+    case BottomLeft:
+      return Block<Derived>(derived(), rows() - cRows, 0, cRows, cCols);
+    case BottomRight:
+      return Block<Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+  }
+}
+
+/** \returns a fixed-size expression of a corner of *this.
+  *
+  * \param type the type of corner. Can be \a Eigen::TopLeft, \a Eigen::TopRight,
+  * \a Eigen::BottomLeft, \a Eigen::BottomRight.
+  *
+  * The template parameters CRows and CCols arethe number of rows and columns in the corner.
+  *
+  * Example: \include MatrixBase_template_int_int_corner_enum.cpp
+  * Output: \verbinclude MatrixBase_template_int_int_corner_enum.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+template<typename Derived>
+template<int CRows, int CCols>
+inline Block<Derived, CRows, CCols>
+DenseBase<Derived>::corner(CornerType type)
+{
+  switch(type)
+  {
+    default:
+      eigen_assert(false && "Bad corner type.");
+    case TopLeft:
+      return Block<Derived, CRows, CCols>(derived(), 0, 0);
+    case TopRight:
+      return Block<Derived, CRows, CCols>(derived(), 0, cols() - CCols);
+    case BottomLeft:
+      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, 0);
+    case BottomRight:
+      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
+  }
+}
+
+/** This is the const version of corner<int, int>(CornerType).*/
+template<typename Derived>
+template<int CRows, int CCols>
+inline const Block<Derived, CRows, CCols>
+DenseBase<Derived>::corner(CornerType type) const
+{
+  switch(type)
+  {
+    default:
+      eigen_assert(false && "Bad corner type.");
+    case TopLeft:
+      return Block<Derived, CRows, CCols>(derived(), 0, 0);
+    case TopRight:
+      return Block<Derived, CRows, CCols>(derived(), 0, cols() - CCols);
+    case BottomLeft:
+      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, 0);
+    case BottomRight:
+      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
+  }
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_BLOCK2_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Cwise.h b/include/eigen3/Eigen/src/Eigen2Support/Cwise.h
new file mode 100644
index 0000000..d95009b
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Cwise.h
@@ -0,0 +1,192 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CWISE_H
+#define EIGEN_CWISE_H
+
+namespace Eigen { 
+
+/** \internal
+  * convenient macro to defined the return type of a cwise binary operation */
+#define EIGEN_CWISE_BINOP_RETURN_TYPE(OP) \
+    CwiseBinaryOp<OP<typename internal::traits<ExpressionType>::Scalar>, ExpressionType, OtherDerived>
+
+/** \internal
+  * convenient macro to defined the return type of a cwise unary operation */
+#define EIGEN_CWISE_UNOP_RETURN_TYPE(OP) \
+    CwiseUnaryOp<OP<typename internal::traits<ExpressionType>::Scalar>, ExpressionType>
+
+/** \internal
+  * convenient macro to defined the return type of a cwise comparison to a scalar */
+#define EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(OP) \
+    CwiseBinaryOp<OP<typename internal::traits<ExpressionType>::Scalar>, ExpressionType, \
+        typename ExpressionType::ConstantReturnType >
+
+/** \class Cwise
+  *
+  * \brief Pseudo expression providing additional coefficient-wise operations
+  *
+  * \param ExpressionType the type of the object on which to do coefficient-wise operations
+  *
+  * This class represents an expression with additional coefficient-wise features.
+  * It is the return type of MatrixBase::cwise()
+  * and most of the time this is the only way it is used.
+  *
+  * Example: \include MatrixBase_cwise_const.cpp
+  * Output: \verbinclude MatrixBase_cwise_const.out
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_CWISE_PLUGIN.
+  *
+  * \sa MatrixBase::cwise() const, MatrixBase::cwise()
+  */
+template<typename ExpressionType> class Cwise
+{
+  public:
+
+    typedef typename internal::traits<ExpressionType>::Scalar Scalar;
+    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
+        ExpressionType, const ExpressionType&>::type ExpressionTypeNested;
+    typedef CwiseUnaryOp<internal::scalar_add_op<Scalar>, ExpressionType> ScalarAddReturnType;
+
+    inline Cwise(const ExpressionType& matrix) : m_matrix(matrix) {}
+
+    /** \internal */
+    inline const ExpressionType& _expression() const { return m_matrix; }
+
+    template<typename OtherDerived>
+    const EIGEN_CWISE_PRODUCT_RETURN_TYPE(ExpressionType,OtherDerived)
+    operator*(const MatrixBase<OtherDerived> &other) const;
+
+    template<typename OtherDerived>
+    const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_quotient_op)
+    operator/(const MatrixBase<OtherDerived> &other) const;
+
+    /** \deprecated ArrayBase::min() */
+    template<typename OtherDerived>
+    const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_min_op)
+    (min)(const MatrixBase<OtherDerived> &other) const
+    { return EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_min_op)(_expression(), other.derived()); }
+
+    /** \deprecated ArrayBase::max() */
+    template<typename OtherDerived>
+    const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_max_op)
+    (max)(const MatrixBase<OtherDerived> &other) const
+    { return EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_max_op)(_expression(), other.derived()); }
+
+    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs_op)      abs() const;
+    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs2_op)     abs2() const;
+    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_square_op)   square() const;
+    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cube_op)     cube() const;
+    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_inverse_op)  inverse() const;
+    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sqrt_op)     sqrt() const;
+    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_exp_op)      exp() const;
+    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_log_op)      log() const;
+    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cos_op)      cos() const;
+    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sin_op)      sin() const;
+    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_pow_op)      pow(const Scalar& exponent) const;
+
+    const ScalarAddReturnType
+    operator+(const Scalar& scalar) const;
+
+    /** \relates Cwise */
+    friend const ScalarAddReturnType
+    operator+(const Scalar& scalar, const Cwise& mat)
+    { return mat + scalar; }
+
+    ExpressionType& operator+=(const Scalar& scalar);
+
+    const ScalarAddReturnType
+    operator-(const Scalar& scalar) const;
+
+    ExpressionType& operator-=(const Scalar& scalar);
+
+    template<typename OtherDerived>
+    inline ExpressionType& operator*=(const MatrixBase<OtherDerived> &other);
+
+    template<typename OtherDerived>
+    inline ExpressionType& operator/=(const MatrixBase<OtherDerived> &other);
+
+    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less)
+    operator<(const MatrixBase<OtherDerived>& other) const;
+
+    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less_equal)
+    operator<=(const MatrixBase<OtherDerived>& other) const;
+
+    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater)
+    operator>(const MatrixBase<OtherDerived>& other) const;
+
+    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater_equal)
+    operator>=(const MatrixBase<OtherDerived>& other) const;
+
+    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::equal_to)
+    operator==(const MatrixBase<OtherDerived>& other) const;
+
+    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::not_equal_to)
+    operator!=(const MatrixBase<OtherDerived>& other) const;
+
+    // comparisons to a scalar value
+    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less)
+    operator<(Scalar s) const;
+
+    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less_equal)
+    operator<=(Scalar s) const;
+
+    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater)
+    operator>(Scalar s) const;
+
+    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater_equal)
+    operator>=(Scalar s) const;
+
+    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::equal_to)
+    operator==(Scalar s) const;
+
+    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::not_equal_to)
+    operator!=(Scalar s) const;
+
+    // allow to extend Cwise outside Eigen
+    #ifdef EIGEN_CWISE_PLUGIN
+    #include EIGEN_CWISE_PLUGIN
+    #endif
+
+  protected:
+    ExpressionTypeNested m_matrix;
+};
+
+
+/** \returns a Cwise wrapper of *this providing additional coefficient-wise operations
+  *
+  * Example: \include MatrixBase_cwise_const.cpp
+  * Output: \verbinclude MatrixBase_cwise_const.out
+  *
+  * \sa class Cwise, cwise()
+  */
+template<typename Derived>
+inline const Cwise<Derived> MatrixBase<Derived>::cwise() const
+{
+  return derived();
+}
+
+/** \returns a Cwise wrapper of *this providing additional coefficient-wise operations
+  *
+  * Example: \include MatrixBase_cwise.cpp
+  * Output: \verbinclude MatrixBase_cwise.out
+  *
+  * \sa class Cwise, cwise() const
+  */
+template<typename Derived>
+inline Cwise<Derived> MatrixBase<Derived>::cwise()
+{
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CWISE_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/CwiseOperators.h b/include/eigen3/Eigen/src/Eigen2Support/CwiseOperators.h
new file mode 100644
index 0000000..482f306
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/CwiseOperators.h
@@ -0,0 +1,298 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARRAY_CWISE_OPERATORS_H
+#define EIGEN_ARRAY_CWISE_OPERATORS_H
+
+namespace Eigen { 
+
+/***************************************************************************
+* The following functions were defined in Core
+***************************************************************************/
+
+
+/** \deprecated ArrayBase::abs() */
+template<typename ExpressionType>
+EIGEN_STRONG_INLINE const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs_op)
+Cwise<ExpressionType>::abs() const
+{
+  return _expression();
+}
+
+/** \deprecated ArrayBase::abs2() */
+template<typename ExpressionType>
+EIGEN_STRONG_INLINE const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs2_op)
+Cwise<ExpressionType>::abs2() const
+{
+  return _expression();
+}
+
+/** \deprecated ArrayBase::exp() */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_exp_op)
+Cwise<ExpressionType>::exp() const
+{
+  return _expression();
+}
+
+/** \deprecated ArrayBase::log() */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_log_op)
+Cwise<ExpressionType>::log() const
+{
+  return _expression();
+}
+
+/** \deprecated ArrayBase::operator*() */
+template<typename ExpressionType>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(ExpressionType,OtherDerived)
+Cwise<ExpressionType>::operator*(const MatrixBase<OtherDerived> &other) const
+{
+  return EIGEN_CWISE_PRODUCT_RETURN_TYPE(ExpressionType,OtherDerived)(_expression(), other.derived());
+}
+
+/** \deprecated ArrayBase::operator/() */
+template<typename ExpressionType>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_quotient_op)
+Cwise<ExpressionType>::operator/(const MatrixBase<OtherDerived> &other) const
+{
+  return EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_quotient_op)(_expression(), other.derived());
+}
+
+/** \deprecated ArrayBase::operator*=() */
+template<typename ExpressionType>
+template<typename OtherDerived>
+inline ExpressionType& Cwise<ExpressionType>::operator*=(const MatrixBase<OtherDerived> &other)
+{
+  return m_matrix.const_cast_derived() = *this * other;
+}
+
+/** \deprecated ArrayBase::operator/=() */
+template<typename ExpressionType>
+template<typename OtherDerived>
+inline ExpressionType& Cwise<ExpressionType>::operator/=(const MatrixBase<OtherDerived> &other)
+{
+  return m_matrix.const_cast_derived() = *this / other;
+}
+
+/***************************************************************************
+* The following functions were defined in Array
+***************************************************************************/
+
+// -- unary operators --
+
+/** \deprecated ArrayBase::sqrt() */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sqrt_op)
+Cwise<ExpressionType>::sqrt() const
+{
+  return _expression();
+}
+
+/** \deprecated ArrayBase::cos() */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cos_op)
+Cwise<ExpressionType>::cos() const
+{
+  return _expression();
+}
+
+
+/** \deprecated ArrayBase::sin() */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sin_op)
+Cwise<ExpressionType>::sin() const
+{
+  return _expression();
+}
+
+
+/** \deprecated ArrayBase::log() */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_pow_op)
+Cwise<ExpressionType>::pow(const Scalar& exponent) const
+{
+  return EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_pow_op)(_expression(), internal::scalar_pow_op<Scalar>(exponent));
+}
+
+
+/** \deprecated ArrayBase::inverse() */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_inverse_op)
+Cwise<ExpressionType>::inverse() const
+{
+  return _expression();
+}
+
+/** \deprecated ArrayBase::square() */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_square_op)
+Cwise<ExpressionType>::square() const
+{
+  return _expression();
+}
+
+/** \deprecated ArrayBase::cube() */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cube_op)
+Cwise<ExpressionType>::cube() const
+{
+  return _expression();
+}
+
+
+// -- binary operators --
+
+/** \deprecated ArrayBase::operator<() */
+template<typename ExpressionType>
+template<typename OtherDerived>
+inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less)
+Cwise<ExpressionType>::operator<(const MatrixBase<OtherDerived> &other) const
+{
+  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::less)(_expression(), other.derived());
+}
+
+/** \deprecated ArrayBase::<=() */
+template<typename ExpressionType>
+template<typename OtherDerived>
+inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less_equal)
+Cwise<ExpressionType>::operator<=(const MatrixBase<OtherDerived> &other) const
+{
+  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::less_equal)(_expression(), other.derived());
+}
+
+/** \deprecated ArrayBase::operator>() */
+template<typename ExpressionType>
+template<typename OtherDerived>
+inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater)
+Cwise<ExpressionType>::operator>(const MatrixBase<OtherDerived> &other) const
+{
+  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater)(_expression(), other.derived());
+}
+
+/** \deprecated ArrayBase::operator>=() */
+template<typename ExpressionType>
+template<typename OtherDerived>
+inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater_equal)
+Cwise<ExpressionType>::operator>=(const MatrixBase<OtherDerived> &other) const
+{
+  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater_equal)(_expression(), other.derived());
+}
+
+/** \deprecated ArrayBase::operator==() */
+template<typename ExpressionType>
+template<typename OtherDerived>
+inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::equal_to)
+Cwise<ExpressionType>::operator==(const MatrixBase<OtherDerived> &other) const
+{
+  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::equal_to)(_expression(), other.derived());
+}
+
+/** \deprecated ArrayBase::operator!=() */
+template<typename ExpressionType>
+template<typename OtherDerived>
+inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::not_equal_to)
+Cwise<ExpressionType>::operator!=(const MatrixBase<OtherDerived> &other) const
+{
+  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::not_equal_to)(_expression(), other.derived());
+}
+
+// comparisons to scalar value
+
+/** \deprecated ArrayBase::operator<(Scalar) */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less)
+Cwise<ExpressionType>::operator<(Scalar s) const
+{
+  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less)(_expression(),
+            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
+}
+
+/** \deprecated ArrayBase::operator<=(Scalar) */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less_equal)
+Cwise<ExpressionType>::operator<=(Scalar s) const
+{
+  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less_equal)(_expression(),
+            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
+}
+
+/** \deprecated ArrayBase::operator>(Scalar) */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater)
+Cwise<ExpressionType>::operator>(Scalar s) const
+{
+  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater)(_expression(),
+            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
+}
+
+/** \deprecated ArrayBase::operator>=(Scalar) */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater_equal)
+Cwise<ExpressionType>::operator>=(Scalar s) const
+{
+  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater_equal)(_expression(),
+            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
+}
+
+/** \deprecated ArrayBase::operator==(Scalar) */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::equal_to)
+Cwise<ExpressionType>::operator==(Scalar s) const
+{
+  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::equal_to)(_expression(),
+            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
+}
+
+/** \deprecated ArrayBase::operator!=(Scalar) */
+template<typename ExpressionType>
+inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::not_equal_to)
+Cwise<ExpressionType>::operator!=(Scalar s) const
+{
+  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::not_equal_to)(_expression(),
+            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
+}
+
+// scalar addition
+
+/** \deprecated ArrayBase::operator+(Scalar) */
+template<typename ExpressionType>
+inline const typename Cwise<ExpressionType>::ScalarAddReturnType
+Cwise<ExpressionType>::operator+(const Scalar& scalar) const
+{
+  return typename Cwise<ExpressionType>::ScalarAddReturnType(m_matrix, internal::scalar_add_op<Scalar>(scalar));
+}
+
+/** \deprecated ArrayBase::operator+=(Scalar) */
+template<typename ExpressionType>
+inline ExpressionType& Cwise<ExpressionType>::operator+=(const Scalar& scalar)
+{
+  return m_matrix.const_cast_derived() = *this + scalar;
+}
+
+/** \deprecated ArrayBase::operator-(Scalar) */
+template<typename ExpressionType>
+inline const typename Cwise<ExpressionType>::ScalarAddReturnType
+Cwise<ExpressionType>::operator-(const Scalar& scalar) const
+{
+  return *this + (-scalar);
+}
+
+/** \deprecated ArrayBase::operator-=(Scalar) */
+template<typename ExpressionType>
+inline ExpressionType& Cwise<ExpressionType>::operator-=(const Scalar& scalar)
+{
+  return m_matrix.const_cast_derived() = *this - scalar;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_ARRAY_CWISE_OPERATORS_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Geometry/AlignedBox.h b/include/eigen3/Eigen/src/Eigen2Support/Geometry/AlignedBox.h
new file mode 100644
index 0000000..2e4309d
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Geometry/AlignedBox.h
@@ -0,0 +1,159 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael at free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  * \nonstableyet
+  *
+  * \class AlignedBox
+  *
+  * \brief An axis aligned box
+  *
+  * \param _Scalar the type of the scalar coefficients
+  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
+  *
+  * This class represents an axis aligned box as a pair of the minimal and maximal corners.
+  */
+template <typename _Scalar, int _AmbientDim>
+class AlignedBox
+{
+public:
+EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==Dynamic ? Dynamic : _AmbientDim+1)
+  enum { AmbientDimAtCompileTime = _AmbientDim };
+  typedef _Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
+
+  /** Default constructor initializing a null box. */
+  inline AlignedBox()
+  { if (AmbientDimAtCompileTime!=Dynamic) setNull(); }
+
+  /** Constructs a null box with \a _dim the dimension of the ambient space. */
+  inline explicit AlignedBox(int _dim) : m_min(_dim), m_max(_dim)
+  { setNull(); }
+
+  /** Constructs a box with extremities \a _min and \a _max. */
+  inline AlignedBox(const VectorType& _min, const VectorType& _max) : m_min(_min), m_max(_max) {}
+
+  /** Constructs a box containing a single point \a p. */
+  inline explicit AlignedBox(const VectorType& p) : m_min(p), m_max(p) {}
+
+  ~AlignedBox() {}
+
+  /** \returns the dimension in which the box holds */
+  inline int dim() const { return AmbientDimAtCompileTime==Dynamic ? m_min.size()-1 : AmbientDimAtCompileTime; }
+
+  /** \returns true if the box is null, i.e, empty. */
+  inline bool isNull() const { return (m_min.cwise() > m_max).any(); }
+
+  /** Makes \c *this a null/empty box. */
+  inline void setNull()
+  {
+    m_min.setConstant( (std::numeric_limits<Scalar>::max)());
+    m_max.setConstant(-(std::numeric_limits<Scalar>::max)());
+  }
+
+  /** \returns the minimal corner */
+  inline const VectorType& (min)() const { return m_min; }
+  /** \returns a non const reference to the minimal corner */
+  inline VectorType& (min)() { return m_min; }
+  /** \returns the maximal corner */
+  inline const VectorType& (max)() const { return m_max; }
+  /** \returns a non const reference to the maximal corner */
+  inline VectorType& (max)() { return m_max; }
+
+  /** \returns true if the point \a p is inside the box \c *this. */
+  inline bool contains(const VectorType& p) const
+  { return (m_min.cwise()<=p).all() && (p.cwise()<=m_max).all(); }
+
+  /** \returns true if the box \a b is entirely inside the box \c *this. */
+  inline bool contains(const AlignedBox& b) const
+  { return (m_min.cwise()<=(b.min)()).all() && ((b.max)().cwise()<=m_max).all(); }
+
+  /** Extends \c *this such that it contains the point \a p and returns a reference to \c *this. */
+  inline AlignedBox& extend(const VectorType& p)
+  { m_min = (m_min.cwise().min)(p); m_max = (m_max.cwise().max)(p); return *this; }
+
+  /** Extends \c *this such that it contains the box \a b and returns a reference to \c *this. */
+  inline AlignedBox& extend(const AlignedBox& b)
+  { m_min = (m_min.cwise().min)(b.m_min); m_max = (m_max.cwise().max)(b.m_max); return *this; }
+
+  /** Clamps \c *this by the box \a b and returns a reference to \c *this. */
+  inline AlignedBox& clamp(const AlignedBox& b)
+  { m_min = (m_min.cwise().max)(b.m_min); m_max = (m_max.cwise().min)(b.m_max); return *this; }
+
+  /** Translate \c *this by the vector \a t and returns a reference to \c *this. */
+  inline AlignedBox& translate(const VectorType& t)
+  { m_min += t; m_max += t; return *this; }
+
+  /** \returns the squared distance between the point \a p and the box \c *this,
+    * and zero if \a p is inside the box.
+    * \sa exteriorDistance()
+    */
+  inline Scalar squaredExteriorDistance(const VectorType& p) const;
+
+  /** \returns the distance between the point \a p and the box \c *this,
+    * and zero if \a p is inside the box.
+    * \sa squaredExteriorDistance()
+    */
+  inline Scalar exteriorDistance(const VectorType& p) const
+  { return ei_sqrt(squaredExteriorDistance(p)); }
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<AlignedBox,
+           AlignedBox<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
+  {
+    return typename internal::cast_return_type<AlignedBox,
+                    AlignedBox<NewScalarType,AmbientDimAtCompileTime> >::type(*this);
+  }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit AlignedBox(const AlignedBox<OtherScalarType,AmbientDimAtCompileTime>& other)
+  {
+    m_min = (other.min)().template cast<Scalar>();
+    m_max = (other.max)().template cast<Scalar>();
+  }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const AlignedBox& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
+  { return m_min.isApprox(other.m_min, prec) && m_max.isApprox(other.m_max, prec); }
+
+protected:
+
+  VectorType m_min, m_max;
+};
+
+template<typename Scalar,int AmbiantDim>
+inline Scalar AlignedBox<Scalar,AmbiantDim>::squaredExteriorDistance(const VectorType& p) const
+{
+  Scalar dist2(0);
+  Scalar aux;
+  for (int k=0; k<dim(); ++k)
+  {
+    if ((aux = (p[k]-m_min[k]))<Scalar(0))
+      dist2 += aux*aux;
+    else if ( (aux = (m_max[k]-p[k]))<Scalar(0))
+      dist2 += aux*aux;
+  }
+  return dist2;
+}
+
+} // end namespace Eigen
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Geometry/All.h b/include/eigen3/Eigen/src/Eigen2Support/Geometry/All.h
new file mode 100644
index 0000000..e0b00fc
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Geometry/All.h
@@ -0,0 +1,115 @@
+#ifndef EIGEN2_GEOMETRY_MODULE_H
+#define EIGEN2_GEOMETRY_MODULE_H
+
+#include <limits>
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+#if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
+#include "RotationBase.h"
+#include "Rotation2D.h"
+#include "Quaternion.h"
+#include "AngleAxis.h"
+#include "Transform.h"
+#include "Translation.h"
+#include "Scaling.h"
+#include "AlignedBox.h"
+#include "Hyperplane.h"
+#include "ParametrizedLine.h"
+#endif
+
+
+#define RotationBase eigen2_RotationBase
+#define Rotation2D eigen2_Rotation2D
+#define Rotation2Df eigen2_Rotation2Df
+#define Rotation2Dd eigen2_Rotation2Dd
+
+#define Quaternion  eigen2_Quaternion
+#define Quaternionf eigen2_Quaternionf
+#define Quaterniond eigen2_Quaterniond
+
+#define AngleAxis eigen2_AngleAxis
+#define AngleAxisf eigen2_AngleAxisf
+#define AngleAxisd eigen2_AngleAxisd
+
+#define Transform   eigen2_Transform
+#define Transform2f eigen2_Transform2f
+#define Transform2d eigen2_Transform2d
+#define Transform3f eigen2_Transform3f
+#define Transform3d eigen2_Transform3d
+
+#define Translation eigen2_Translation
+#define Translation2f eigen2_Translation2f
+#define Translation2d eigen2_Translation2d
+#define Translation3f eigen2_Translation3f
+#define Translation3d eigen2_Translation3d
+
+#define Scaling eigen2_Scaling
+#define Scaling2f eigen2_Scaling2f
+#define Scaling2d eigen2_Scaling2d
+#define Scaling3f eigen2_Scaling3f
+#define Scaling3d eigen2_Scaling3d
+
+#define AlignedBox eigen2_AlignedBox
+
+#define Hyperplane eigen2_Hyperplane
+#define ParametrizedLine eigen2_ParametrizedLine
+
+#define ei_toRotationMatrix eigen2_ei_toRotationMatrix
+#define ei_quaternion_assign_impl eigen2_ei_quaternion_assign_impl
+#define ei_transform_product_impl eigen2_ei_transform_product_impl
+
+#include "RotationBase.h"
+#include "Rotation2D.h"
+#include "Quaternion.h"
+#include "AngleAxis.h"
+#include "Transform.h"
+#include "Translation.h"
+#include "Scaling.h"
+#include "AlignedBox.h"
+#include "Hyperplane.h"
+#include "ParametrizedLine.h"
+
+#undef ei_toRotationMatrix
+#undef ei_quaternion_assign_impl
+#undef ei_transform_product_impl
+
+#undef RotationBase
+#undef Rotation2D
+#undef Rotation2Df
+#undef Rotation2Dd
+
+#undef Quaternion
+#undef Quaternionf
+#undef Quaterniond
+
+#undef AngleAxis
+#undef AngleAxisf
+#undef AngleAxisd
+
+#undef Transform
+#undef Transform2f
+#undef Transform2d
+#undef Transform3f
+#undef Transform3d
+
+#undef Translation
+#undef Translation2f
+#undef Translation2d
+#undef Translation3f
+#undef Translation3d
+
+#undef Scaling
+#undef Scaling2f
+#undef Scaling2d
+#undef Scaling3f
+#undef Scaling3d
+
+#undef AlignedBox
+
+#undef Hyperplane
+#undef ParametrizedLine
+
+#endif // EIGEN2_GEOMETRY_MODULE_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Geometry/AngleAxis.h b/include/eigen3/Eigen/src/Eigen2Support/Geometry/AngleAxis.h
new file mode 100644
index 0000000..af598a4
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Geometry/AngleAxis.h
@@ -0,0 +1,214 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael at free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class AngleAxis
+  *
+  * \brief Represents a 3D rotation as a rotation angle around an arbitrary 3D axis
+  *
+  * \param _Scalar the scalar type, i.e., the type of the coefficients.
+  *
+  * The following two typedefs are provided for convenience:
+  * \li \c AngleAxisf for \c float
+  * \li \c AngleAxisd for \c double
+  *
+  * \addexample AngleAxisForEuler \label How to define a rotation from Euler-angles
+  *
+  * Combined with MatrixBase::Unit{X,Y,Z}, AngleAxis can be used to easily
+  * mimic Euler-angles. Here is an example:
+  * \include AngleAxis_mimic_euler.cpp
+  * Output: \verbinclude AngleAxis_mimic_euler.out
+  *
+  * \note This class is not aimed to be used to store a rotation transformation,
+  * but rather to make easier the creation of other rotation (Quaternion, rotation Matrix)
+  * and transformation objects.
+  *
+  * \sa class Quaternion, class Transform, MatrixBase::UnitX()
+  */
+
+template<typename _Scalar> struct ei_traits<AngleAxis<_Scalar> >
+{
+  typedef _Scalar Scalar;
+};
+
+template<typename _Scalar>
+class AngleAxis : public RotationBase<AngleAxis<_Scalar>,3>
+{
+  typedef RotationBase<AngleAxis<_Scalar>,3> Base;
+
+public:
+
+  using Base::operator*;
+
+  enum { Dim = 3 };
+  /** the scalar type of the coefficients */
+  typedef _Scalar Scalar;
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Quaternion<Scalar> QuaternionType;
+
+protected:
+
+  Vector3 m_axis;
+  Scalar m_angle;
+
+public:
+
+  /** Default constructor without initialization. */
+  AngleAxis() {}
+  /** Constructs and initialize the angle-axis rotation from an \a angle in radian
+    * and an \a axis which must be normalized. */
+  template<typename Derived>
+  inline AngleAxis(Scalar angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {}
+  /** Constructs and initialize the angle-axis rotation from a quaternion \a q. */
+  inline AngleAxis(const QuaternionType& q) { *this = q; }
+  /** Constructs and initialize the angle-axis rotation from a 3x3 rotation matrix. */
+  template<typename Derived>
+  inline explicit AngleAxis(const MatrixBase<Derived>& m) { *this = m; }
+
+  Scalar angle() const { return m_angle; }
+  Scalar& angle() { return m_angle; }
+
+  const Vector3& axis() const { return m_axis; }
+  Vector3& axis() { return m_axis; }
+
+  /** Concatenates two rotations */
+  inline QuaternionType operator* (const AngleAxis& other) const
+  { return QuaternionType(*this) * QuaternionType(other); }
+
+  /** Concatenates two rotations */
+  inline QuaternionType operator* (const QuaternionType& other) const
+  { return QuaternionType(*this) * other; }
+
+  /** Concatenates two rotations */
+  friend inline QuaternionType operator* (const QuaternionType& a, const AngleAxis& b)
+  { return a * QuaternionType(b); }
+
+  /** Concatenates two rotations */
+  inline Matrix3 operator* (const Matrix3& other) const
+  { return toRotationMatrix() * other; }
+
+  /** Concatenates two rotations */
+  inline friend Matrix3 operator* (const Matrix3& a, const AngleAxis& b)
+  { return a * b.toRotationMatrix(); }
+
+  /** Applies rotation to vector */
+  inline Vector3 operator* (const Vector3& other) const
+  { return toRotationMatrix() * other; }
+
+  /** \returns the inverse rotation, i.e., an angle-axis with opposite rotation angle */
+  AngleAxis inverse() const
+  { return AngleAxis(-m_angle, m_axis); }
+
+  AngleAxis& operator=(const QuaternionType& q);
+  template<typename Derived>
+  AngleAxis& operator=(const MatrixBase<Derived>& m);
+
+  template<typename Derived>
+  AngleAxis& fromRotationMatrix(const MatrixBase<Derived>& m);
+  Matrix3 toRotationMatrix(void) const;
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type cast() const
+  { return typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type(*this); }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit AngleAxis(const AngleAxis<OtherScalarType>& other)
+  {
+    m_axis = other.axis().template cast<Scalar>();
+    m_angle = Scalar(other.angle());
+  }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const AngleAxis& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
+  { return m_axis.isApprox(other.m_axis, prec) && ei_isApprox(m_angle,other.m_angle, prec); }
+};
+
+/** \ingroup Geometry_Module
+  * single precision angle-axis type */
+typedef AngleAxis<float> AngleAxisf;
+/** \ingroup Geometry_Module
+  * double precision angle-axis type */
+typedef AngleAxis<double> AngleAxisd;
+
+/** Set \c *this from a quaternion.
+  * The axis is normalized.
+  */
+template<typename Scalar>
+AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionType& q)
+{
+  Scalar n2 = q.vec().squaredNorm();
+  if (n2 < precision<Scalar>()*precision<Scalar>())
+  {
+    m_angle = 0;
+    m_axis << 1, 0, 0;
+  }
+  else
+  {
+    m_angle = 2*std::acos(q.w());
+    m_axis = q.vec() / ei_sqrt(n2);
+  }
+  return *this;
+}
+
+/** Set \c *this from a 3x3 rotation matrix \a mat.
+  */
+template<typename Scalar>
+template<typename Derived>
+AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const MatrixBase<Derived>& mat)
+{
+  // Since a direct conversion would not be really faster,
+  // let's use the robust Quaternion implementation:
+  return *this = QuaternionType(mat);
+}
+
+/** Constructs and \returns an equivalent 3x3 rotation matrix.
+  */
+template<typename Scalar>
+typename AngleAxis<Scalar>::Matrix3
+AngleAxis<Scalar>::toRotationMatrix(void) const
+{
+  Matrix3 res;
+  Vector3 sin_axis  = ei_sin(m_angle) * m_axis;
+  Scalar c = ei_cos(m_angle);
+  Vector3 cos1_axis = (Scalar(1)-c) * m_axis;
+
+  Scalar tmp;
+  tmp = cos1_axis.x() * m_axis.y();
+  res.coeffRef(0,1) = tmp - sin_axis.z();
+  res.coeffRef(1,0) = tmp + sin_axis.z();
+
+  tmp = cos1_axis.x() * m_axis.z();
+  res.coeffRef(0,2) = tmp + sin_axis.y();
+  res.coeffRef(2,0) = tmp - sin_axis.y();
+
+  tmp = cos1_axis.y() * m_axis.z();
+  res.coeffRef(1,2) = tmp - sin_axis.x();
+  res.coeffRef(2,1) = tmp + sin_axis.x();
+
+  res.diagonal() = (cos1_axis.cwise() * m_axis).cwise() + c;
+
+  return res;
+}
+
+} // end namespace Eigen
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Geometry/Hyperplane.h b/include/eigen3/Eigen/src/Eigen2Support/Geometry/Hyperplane.h
new file mode 100644
index 0000000..b95bf00
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Geometry/Hyperplane.h
@@ -0,0 +1,254 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael at free.fr>
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class Hyperplane
+  *
+  * \brief A hyperplane
+  *
+  * A hyperplane is an affine subspace of dimension n-1 in a space of dimension n.
+  * For example, a hyperplane in a plane is a line; a hyperplane in 3-space is a plane.
+  *
+  * \param _Scalar the scalar type, i.e., the type of the coefficients
+  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
+  *             Notice that the dimension of the hyperplane is _AmbientDim-1.
+  *
+  * This class represents an hyperplane as the zero set of the implicit equation
+  * \f$ n \cdot x + d = 0 \f$ where \f$ n \f$ is a unit normal vector of the plane (linear part)
+  * and \f$ d \f$ is the distance (offset) to the origin.
+  */
+template <typename _Scalar, int _AmbientDim>
+class Hyperplane
+{
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==Dynamic ? Dynamic : _AmbientDim+1)
+  enum { AmbientDimAtCompileTime = _AmbientDim };
+  typedef _Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
+  typedef Matrix<Scalar,int(AmbientDimAtCompileTime)==Dynamic
+                        ? Dynamic
+                        : int(AmbientDimAtCompileTime)+1,1> Coefficients;
+  typedef Block<Coefficients,AmbientDimAtCompileTime,1> NormalReturnType;
+
+  /** Default constructor without initialization */
+  inline Hyperplane() {}
+
+  /** Constructs a dynamic-size hyperplane with \a _dim the dimension
+    * of the ambient space */
+  inline explicit Hyperplane(int _dim) : m_coeffs(_dim+1) {}
+
+  /** Construct a plane from its normal \a n and a point \a e onto the plane.
+    * \warning the vector normal is assumed to be normalized.
+    */
+  inline Hyperplane(const VectorType& n, const VectorType& e)
+    : m_coeffs(n.size()+1)
+  {
+    normal() = n;
+    offset() = -e.eigen2_dot(n);
+  }
+
+  /** Constructs a plane from its normal \a n and distance to the origin \a d
+    * such that the algebraic equation of the plane is \f$ n \cdot x + d = 0 \f$.
+    * \warning the vector normal is assumed to be normalized.
+    */
+  inline Hyperplane(const VectorType& n, Scalar d)
+    : m_coeffs(n.size()+1)
+  {
+    normal() = n;
+    offset() = d;
+  }
+
+  /** Constructs a hyperplane passing through the two points. If the dimension of the ambient space
+    * is greater than 2, then there isn't uniqueness, so an arbitrary choice is made.
+    */
+  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1)
+  {
+    Hyperplane result(p0.size());
+    result.normal() = (p1 - p0).unitOrthogonal();
+    result.offset() = -result.normal().eigen2_dot(p0);
+    return result;
+  }
+
+  /** Constructs a hyperplane passing through the three points. The dimension of the ambient space
+    * is required to be exactly 3.
+    */
+  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1, const VectorType& p2)
+  {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 3)
+    Hyperplane result(p0.size());
+    result.normal() = (p2 - p0).cross(p1 - p0).normalized();
+    result.offset() = -result.normal().eigen2_dot(p0);
+    return result;
+  }
+
+  /** Constructs a hyperplane passing through the parametrized line \a parametrized.
+    * If the dimension of the ambient space is greater than 2, then there isn't uniqueness,
+    * so an arbitrary choice is made.
+    */
+  // FIXME to be consitent with the rest this could be implemented as a static Through function ??
+  explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized)
+  {
+    normal() = parametrized.direction().unitOrthogonal();
+    offset() = -normal().eigen2_dot(parametrized.origin());
+  }
+
+  ~Hyperplane() {}
+
+  /** \returns the dimension in which the plane holds */
+  inline int dim() const { return int(AmbientDimAtCompileTime)==Dynamic ? m_coeffs.size()-1 : int(AmbientDimAtCompileTime); }
+
+  /** normalizes \c *this */
+  void normalize(void)
+  {
+    m_coeffs /= normal().norm();
+  }
+
+  /** \returns the signed distance between the plane \c *this and a point \a p.
+    * \sa absDistance()
+    */
+  inline Scalar signedDistance(const VectorType& p) const { return p.eigen2_dot(normal()) + offset(); }
+
+  /** \returns the absolute distance between the plane \c *this and a point \a p.
+    * \sa signedDistance()
+    */
+  inline Scalar absDistance(const VectorType& p) const { return ei_abs(signedDistance(p)); }
+
+  /** \returns the projection of a point \a p onto the plane \c *this.
+    */
+  inline VectorType projection(const VectorType& p) const { return p - signedDistance(p) * normal(); }
+
+  /** \returns a constant reference to the unit normal vector of the plane, which corresponds
+    * to the linear part of the implicit equation.
+    */
+  inline const NormalReturnType normal() const { return NormalReturnType(*const_cast<Coefficients*>(&m_coeffs),0,0,dim(),1); }
+
+  /** \returns a non-constant reference to the unit normal vector of the plane, which corresponds
+    * to the linear part of the implicit equation.
+    */
+  inline NormalReturnType normal() { return NormalReturnType(m_coeffs,0,0,dim(),1); }
+
+  /** \returns the distance to the origin, which is also the "constant term" of the implicit equation
+    * \warning the vector normal is assumed to be normalized.
+    */
+  inline const Scalar& offset() const { return m_coeffs.coeff(dim()); }
+
+  /** \returns a non-constant reference to the distance to the origin, which is also the constant part
+    * of the implicit equation */
+  inline Scalar& offset() { return m_coeffs(dim()); }
+
+  /** \returns a constant reference to the coefficients c_i of the plane equation:
+    * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
+    */
+  inline const Coefficients& coeffs() const { return m_coeffs; }
+
+  /** \returns a non-constant reference to the coefficients c_i of the plane equation:
+    * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
+    */
+  inline Coefficients& coeffs() { return m_coeffs; }
+
+  /** \returns the intersection of *this with \a other.
+    *
+    * \warning The ambient space must be a plane, i.e. have dimension 2, so that \c *this and \a other are lines.
+    *
+    * \note If \a other is approximately parallel to *this, this method will return any point on *this.
+    */
+  VectorType intersection(const Hyperplane& other)
+  {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
+    Scalar det = coeffs().coeff(0) * other.coeffs().coeff(1) - coeffs().coeff(1) * other.coeffs().coeff(0);
+    // since the line equations ax+by=c are normalized with a^2+b^2=1, the following tests
+    // whether the two lines are approximately parallel.
+    if(ei_isMuchSmallerThan(det, Scalar(1)))
+    {   // special case where the two lines are approximately parallel. Pick any point on the first line.
+        if(ei_abs(coeffs().coeff(1))>ei_abs(coeffs().coeff(0)))
+            return VectorType(coeffs().coeff(1), -coeffs().coeff(2)/coeffs().coeff(1)-coeffs().coeff(0));
+        else
+            return VectorType(-coeffs().coeff(2)/coeffs().coeff(0)-coeffs().coeff(1), coeffs().coeff(0));
+    }
+    else
+    {   // general case
+        Scalar invdet = Scalar(1) / det;
+        return VectorType(invdet*(coeffs().coeff(1)*other.coeffs().coeff(2)-other.coeffs().coeff(1)*coeffs().coeff(2)),
+                          invdet*(other.coeffs().coeff(0)*coeffs().coeff(2)-coeffs().coeff(0)*other.coeffs().coeff(2)));
+    }
+  }
+
+  /** Applies the transformation matrix \a mat to \c *this and returns a reference to \c *this.
+    *
+    * \param mat the Dim x Dim transformation matrix
+    * \param traits specifies whether the matrix \a mat represents an Isometry
+    *               or a more generic Affine transformation. The default is Affine.
+    */
+  template<typename XprType>
+  inline Hyperplane& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine)
+  {
+    if (traits==Affine)
+      normal() = mat.inverse().transpose() * normal();
+    else if (traits==Isometry)
+      normal() = mat * normal();
+    else
+    {
+      ei_assert("invalid traits value in Hyperplane::transform()");
+    }
+    return *this;
+  }
+
+  /** Applies the transformation \a t to \c *this and returns a reference to \c *this.
+    *
+    * \param t the transformation of dimension Dim
+    * \param traits specifies whether the transformation \a t represents an Isometry
+    *               or a more generic Affine transformation. The default is Affine.
+    *               Other kind of transformations are not supported.
+    */
+  inline Hyperplane& transform(const Transform<Scalar,AmbientDimAtCompileTime>& t,
+                                TransformTraits traits = Affine)
+  {
+    transform(t.linear(), traits);
+    offset() -= t.translation().eigen2_dot(normal());
+    return *this;
+  }
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<Hyperplane,
+           Hyperplane<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
+  {
+    return typename internal::cast_return_type<Hyperplane,
+                    Hyperplane<NewScalarType,AmbientDimAtCompileTime> >::type(*this);
+  }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit Hyperplane(const Hyperplane<OtherScalarType,AmbientDimAtCompileTime>& other)
+  { m_coeffs = other.coeffs().template cast<Scalar>(); }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const Hyperplane& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
+  { return m_coeffs.isApprox(other.m_coeffs, prec); }
+
+protected:
+
+  Coefficients m_coeffs;
+};
+
+} // end namespace Eigen
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h b/include/eigen3/Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h
new file mode 100644
index 0000000..9b57b7e
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h
@@ -0,0 +1,141 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael at free.fr>
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class ParametrizedLine
+  *
+  * \brief A parametrized line
+  *
+  * A parametrized line is defined by an origin point \f$ \mathbf{o} \f$ and a unit
+  * direction vector \f$ \mathbf{d} \f$ such that the line corresponds to
+  * the set \f$ l(t) = \mathbf{o} + t \mathbf{d} \f$, \f$ l \in \mathbf{R} \f$.
+  *
+  * \param _Scalar the scalar type, i.e., the type of the coefficients
+  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
+  */
+template <typename _Scalar, int _AmbientDim>
+class ParametrizedLine
+{
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
+  enum { AmbientDimAtCompileTime = _AmbientDim };
+  typedef _Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
+
+  /** Default constructor without initialization */
+  inline ParametrizedLine() {}
+
+  /** Constructs a dynamic-size line with \a _dim the dimension
+    * of the ambient space */
+  inline explicit ParametrizedLine(int _dim) : m_origin(_dim), m_direction(_dim) {}
+
+  /** Initializes a parametrized line of direction \a direction and origin \a origin.
+    * \warning the vector direction is assumed to be normalized.
+    */
+  ParametrizedLine(const VectorType& origin, const VectorType& direction)
+    : m_origin(origin), m_direction(direction) {}
+
+  explicit ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim>& hyperplane);
+
+  /** Constructs a parametrized line going from \a p0 to \a p1. */
+  static inline ParametrizedLine Through(const VectorType& p0, const VectorType& p1)
+  { return ParametrizedLine(p0, (p1-p0).normalized()); }
+
+  ~ParametrizedLine() {}
+
+  /** \returns the dimension in which the line holds */
+  inline int dim() const { return m_direction.size(); }
+
+  const VectorType& origin() const { return m_origin; }
+  VectorType& origin() { return m_origin; }
+
+  const VectorType& direction() const { return m_direction; }
+  VectorType& direction() { return m_direction; }
+
+  /** \returns the squared distance of a point \a p to its projection onto the line \c *this.
+    * \sa distance()
+    */
+  RealScalar squaredDistance(const VectorType& p) const
+  {
+    VectorType diff = p-origin();
+    return (diff - diff.eigen2_dot(direction())* direction()).squaredNorm();
+  }
+  /** \returns the distance of a point \a p to its projection onto the line \c *this.
+    * \sa squaredDistance()
+    */
+  RealScalar distance(const VectorType& p) const { return ei_sqrt(squaredDistance(p)); }
+
+  /** \returns the projection of a point \a p onto the line \c *this. */
+  VectorType projection(const VectorType& p) const
+  { return origin() + (p-origin()).eigen2_dot(direction()) * direction(); }
+
+  Scalar intersection(const Hyperplane<_Scalar, _AmbientDim>& hyperplane);
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<ParametrizedLine,
+           ParametrizedLine<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
+  {
+    return typename internal::cast_return_type<ParametrizedLine,
+                    ParametrizedLine<NewScalarType,AmbientDimAtCompileTime> >::type(*this);
+  }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit ParametrizedLine(const ParametrizedLine<OtherScalarType,AmbientDimAtCompileTime>& other)
+  {
+    m_origin = other.origin().template cast<Scalar>();
+    m_direction = other.direction().template cast<Scalar>();
+  }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const ParametrizedLine& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
+  { return m_origin.isApprox(other.m_origin, prec) && m_direction.isApprox(other.m_direction, prec); }
+
+protected:
+
+  VectorType m_origin, m_direction;
+};
+
+/** Constructs a parametrized line from a 2D hyperplane
+  *
+  * \warning the ambient space must have dimension 2 such that the hyperplane actually describes a line
+  */
+template <typename _Scalar, int _AmbientDim>
+inline ParametrizedLine<_Scalar, _AmbientDim>::ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim>& hyperplane)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
+  direction() = hyperplane.normal().unitOrthogonal();
+  origin() = -hyperplane.normal()*hyperplane.offset();
+}
+
+/** \returns the parameter value of the intersection between \c *this and the given hyperplane
+  */
+template <typename _Scalar, int _AmbientDim>
+inline _Scalar ParametrizedLine<_Scalar, _AmbientDim>::intersection(const Hyperplane<_Scalar, _AmbientDim>& hyperplane)
+{
+  return -(hyperplane.offset()+origin().eigen2_dot(hyperplane.normal()))
+          /(direction().eigen2_dot(hyperplane.normal()));
+}
+
+} // end namespace Eigen
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Geometry/Quaternion.h b/include/eigen3/Eigen/src/Eigen2Support/Geometry/Quaternion.h
new file mode 100644
index 0000000..4b6390c
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Geometry/Quaternion.h
@@ -0,0 +1,495 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael at free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
+
+namespace Eigen { 
+
+template<typename Other,
+         int OtherRows=Other::RowsAtCompileTime,
+         int OtherCols=Other::ColsAtCompileTime>
+struct ei_quaternion_assign_impl;
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class Quaternion
+  *
+  * \brief The quaternion class used to represent 3D orientations and rotations
+  *
+  * \param _Scalar the scalar type, i.e., the type of the coefficients
+  *
+  * This class represents a quaternion \f$ w+xi+yj+zk \f$ that is a convenient representation of
+  * orientations and rotations of objects in three dimensions. Compared to other representations
+  * like Euler angles or 3x3 matrices, quatertions offer the following advantages:
+  * \li \b compact storage (4 scalars)
+  * \li \b efficient to compose (28 flops),
+  * \li \b stable spherical interpolation
+  *
+  * The following two typedefs are provided for convenience:
+  * \li \c Quaternionf for \c float
+  * \li \c Quaterniond for \c double
+  *
+  * \sa  class AngleAxis, class Transform
+  */
+
+template<typename _Scalar> struct ei_traits<Quaternion<_Scalar> >
+{
+  typedef _Scalar Scalar;
+};
+
+template<typename _Scalar>
+class Quaternion : public RotationBase<Quaternion<_Scalar>,3>
+{
+  typedef RotationBase<Quaternion<_Scalar>,3> Base;
+
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,4)
+
+  using Base::operator*;
+
+  /** the scalar type of the coefficients */
+  typedef _Scalar Scalar;
+
+  /** the type of the Coefficients 4-vector */
+  typedef Matrix<Scalar, 4, 1> Coefficients;
+  /** the type of a 3D vector */
+  typedef Matrix<Scalar,3,1> Vector3;
+  /** the equivalent rotation matrix type */
+  typedef Matrix<Scalar,3,3> Matrix3;
+  /** the equivalent angle-axis type */
+  typedef AngleAxis<Scalar> AngleAxisType;
+
+  /** \returns the \c x coefficient */
+  inline Scalar x() const { return m_coeffs.coeff(0); }
+  /** \returns the \c y coefficient */
+  inline Scalar y() const { return m_coeffs.coeff(1); }
+  /** \returns the \c z coefficient */
+  inline Scalar z() const { return m_coeffs.coeff(2); }
+  /** \returns the \c w coefficient */
+  inline Scalar w() const { return m_coeffs.coeff(3); }
+
+  /** \returns a reference to the \c x coefficient */
+  inline Scalar& x() { return m_coeffs.coeffRef(0); }
+  /** \returns a reference to the \c y coefficient */
+  inline Scalar& y() { return m_coeffs.coeffRef(1); }
+  /** \returns a reference to the \c z coefficient */
+  inline Scalar& z() { return m_coeffs.coeffRef(2); }
+  /** \returns a reference to the \c w coefficient */
+  inline Scalar& w() { return m_coeffs.coeffRef(3); }
+
+  /** \returns a read-only vector expression of the imaginary part (x,y,z) */
+  inline const Block<const Coefficients,3,1> vec() const { return m_coeffs.template start<3>(); }
+
+  /** \returns a vector expression of the imaginary part (x,y,z) */
+  inline Block<Coefficients,3,1> vec() { return m_coeffs.template start<3>(); }
+
+  /** \returns a read-only vector expression of the coefficients (x,y,z,w) */
+  inline const Coefficients& coeffs() const { return m_coeffs; }
+
+  /** \returns a vector expression of the coefficients (x,y,z,w) */
+  inline Coefficients& coeffs() { return m_coeffs; }
+
+  /** Default constructor leaving the quaternion uninitialized. */
+  inline Quaternion() {}
+
+  /** Constructs and initializes the quaternion \f$ w+xi+yj+zk \f$ from
+    * its four coefficients \a w, \a x, \a y and \a z.
+    *
+    * \warning Note the order of the arguments: the real \a w coefficient first,
+    * while internally the coefficients are stored in the following order:
+    * [\c x, \c y, \c z, \c w]
+    */
+  inline Quaternion(Scalar w, Scalar x, Scalar y, Scalar z)
+  { m_coeffs << x, y, z, w; }
+
+  /** Copy constructor */
+  inline Quaternion(const Quaternion& other) { m_coeffs = other.m_coeffs; }
+
+  /** Constructs and initializes a quaternion from the angle-axis \a aa */
+  explicit inline Quaternion(const AngleAxisType& aa) { *this = aa; }
+
+  /** Constructs and initializes a quaternion from either:
+    *  - a rotation matrix expression,
+    *  - a 4D vector expression representing quaternion coefficients.
+    * \sa operator=(MatrixBase<Derived>)
+    */
+  template<typename Derived>
+  explicit inline Quaternion(const MatrixBase<Derived>& other) { *this = other; }
+
+  Quaternion& operator=(const Quaternion& other);
+  Quaternion& operator=(const AngleAxisType& aa);
+  template<typename Derived>
+  Quaternion& operator=(const MatrixBase<Derived>& m);
+
+  /** \returns a quaternion representing an identity rotation
+    * \sa MatrixBase::Identity()
+    */
+  static inline Quaternion Identity() { return Quaternion(1, 0, 0, 0); }
+
+  /** \sa Quaternion::Identity(), MatrixBase::setIdentity()
+    */
+  inline Quaternion& setIdentity() { m_coeffs << 0, 0, 0, 1; return *this; }
+
+  /** \returns the squared norm of the quaternion's coefficients
+    * \sa Quaternion::norm(), MatrixBase::squaredNorm()
+    */
+  inline Scalar squaredNorm() const { return m_coeffs.squaredNorm(); }
+
+  /** \returns the norm of the quaternion's coefficients
+    * \sa Quaternion::squaredNorm(), MatrixBase::norm()
+    */
+  inline Scalar norm() const { return m_coeffs.norm(); }
+
+  /** Normalizes the quaternion \c *this
+    * \sa normalized(), MatrixBase::normalize() */
+  inline void normalize() { m_coeffs.normalize(); }
+  /** \returns a normalized version of \c *this
+    * \sa normalize(), MatrixBase::normalized() */
+  inline Quaternion normalized() const { return Quaternion(m_coeffs.normalized()); }
+
+  /** \returns the dot product of \c *this and \a other
+    * Geometrically speaking, the dot product of two unit quaternions
+    * corresponds to the cosine of half the angle between the two rotations.
+    * \sa angularDistance()
+    */
+  inline Scalar eigen2_dot(const Quaternion& other) const { return m_coeffs.eigen2_dot(other.m_coeffs); }
+
+  inline Scalar angularDistance(const Quaternion& other) const;
+
+  Matrix3 toRotationMatrix(void) const;
+
+  template<typename Derived1, typename Derived2>
+  Quaternion& setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
+
+  inline Quaternion operator* (const Quaternion& q) const;
+  inline Quaternion& operator*= (const Quaternion& q);
+
+  Quaternion inverse(void) const;
+  Quaternion conjugate(void) const;
+
+  Quaternion slerp(Scalar t, const Quaternion& other) const;
+
+  template<typename Derived>
+  Vector3 operator* (const MatrixBase<Derived>& vec) const;
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<Quaternion,Quaternion<NewScalarType> >::type cast() const
+  { return typename internal::cast_return_type<Quaternion,Quaternion<NewScalarType> >::type(*this); }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit Quaternion(const Quaternion<OtherScalarType>& other)
+  { m_coeffs = other.coeffs().template cast<Scalar>(); }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const Quaternion& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
+  { return m_coeffs.isApprox(other.m_coeffs, prec); }
+
+protected:
+  Coefficients m_coeffs;
+};
+
+/** \ingroup Geometry_Module
+  * single precision quaternion type */
+typedef Quaternion<float> Quaternionf;
+/** \ingroup Geometry_Module
+  * double precision quaternion type */
+typedef Quaternion<double> Quaterniond;
+
+// Generic Quaternion * Quaternion product
+template<typename Scalar> inline Quaternion<Scalar>
+ei_quaternion_product(const Quaternion<Scalar>& a, const Quaternion<Scalar>& b)
+{
+  return Quaternion<Scalar>
+  (
+    a.w() * b.w() - a.x() * b.x() - a.y() * b.y() - a.z() * b.z(),
+    a.w() * b.x() + a.x() * b.w() + a.y() * b.z() - a.z() * b.y(),
+    a.w() * b.y() + a.y() * b.w() + a.z() * b.x() - a.x() * b.z(),
+    a.w() * b.z() + a.z() * b.w() + a.x() * b.y() - a.y() * b.x()
+  );
+}
+
+/** \returns the concatenation of two rotations as a quaternion-quaternion product */
+template <typename Scalar>
+inline Quaternion<Scalar> Quaternion<Scalar>::operator* (const Quaternion& other) const
+{
+  return ei_quaternion_product(*this,other);
+}
+
+/** \sa operator*(Quaternion) */
+template <typename Scalar>
+inline Quaternion<Scalar>& Quaternion<Scalar>::operator*= (const Quaternion& other)
+{
+  return (*this = *this * other);
+}
+
+/** Rotation of a vector by a quaternion.
+  * \remarks If the quaternion is used to rotate several points (>1)
+  * then it is much more efficient to first convert it to a 3x3 Matrix.
+  * Comparison of the operation cost for n transformations:
+  *   - Quaternion:    30n
+  *   - Via a Matrix3: 24 + 15n
+  */
+template <typename Scalar>
+template<typename Derived>
+inline typename Quaternion<Scalar>::Vector3
+Quaternion<Scalar>::operator* (const MatrixBase<Derived>& v) const
+{
+    // Note that this algorithm comes from the optimization by hand
+    // of the conversion to a Matrix followed by a Matrix/Vector product.
+    // It appears to be much faster than the common algorithm found
+    // in the litterature (30 versus 39 flops). It also requires two
+    // Vector3 as temporaries.
+    Vector3 uv;
+    uv = 2 * this->vec().cross(v);
+    return v + this->w() * uv + this->vec().cross(uv);
+}
+
+template<typename Scalar>
+inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const Quaternion& other)
+{
+  m_coeffs = other.m_coeffs;
+  return *this;
+}
+
+/** Set \c *this from an angle-axis \a aa and returns a reference to \c *this
+  */
+template<typename Scalar>
+inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const AngleAxisType& aa)
+{
+  Scalar ha = Scalar(0.5)*aa.angle(); // Scalar(0.5) to suppress precision loss warnings
+  this->w() = ei_cos(ha);
+  this->vec() = ei_sin(ha) * aa.axis();
+  return *this;
+}
+
+/** Set \c *this from the expression \a xpr:
+  *   - if \a xpr is a 4x1 vector, then \a xpr is assumed to be a quaternion
+  *   - if \a xpr is a 3x3 matrix, then \a xpr is assumed to be rotation matrix
+  *     and \a xpr is converted to a quaternion
+  */
+template<typename Scalar>
+template<typename Derived>
+inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const MatrixBase<Derived>& xpr)
+{
+  ei_quaternion_assign_impl<Derived>::run(*this, xpr.derived());
+  return *this;
+}
+
+/** Convert the quaternion to a 3x3 rotation matrix */
+template<typename Scalar>
+inline typename Quaternion<Scalar>::Matrix3
+Quaternion<Scalar>::toRotationMatrix(void) const
+{
+  // NOTE if inlined, then gcc 4.2 and 4.4 get rid of the temporary (not gcc 4.3 !!)
+  // if not inlined then the cost of the return by value is huge ~ +35%,
+  // however, not inlining this function is an order of magnitude slower, so
+  // it has to be inlined, and so the return by value is not an issue
+  Matrix3 res;
+
+  const Scalar tx  = Scalar(2)*this->x();
+  const Scalar ty  = Scalar(2)*this->y();
+  const Scalar tz  = Scalar(2)*this->z();
+  const Scalar twx = tx*this->w();
+  const Scalar twy = ty*this->w();
+  const Scalar twz = tz*this->w();
+  const Scalar txx = tx*this->x();
+  const Scalar txy = ty*this->x();
+  const Scalar txz = tz*this->x();
+  const Scalar tyy = ty*this->y();
+  const Scalar tyz = tz*this->y();
+  const Scalar tzz = tz*this->z();
+
+  res.coeffRef(0,0) = Scalar(1)-(tyy+tzz);
+  res.coeffRef(0,1) = txy-twz;
+  res.coeffRef(0,2) = txz+twy;
+  res.coeffRef(1,0) = txy+twz;
+  res.coeffRef(1,1) = Scalar(1)-(txx+tzz);
+  res.coeffRef(1,2) = tyz-twx;
+  res.coeffRef(2,0) = txz-twy;
+  res.coeffRef(2,1) = tyz+twx;
+  res.coeffRef(2,2) = Scalar(1)-(txx+tyy);
+
+  return res;
+}
+
+/** Sets *this to be a quaternion representing a rotation sending the vector \a a to the vector \a b.
+  *
+  * \returns a reference to *this.
+  *
+  * Note that the two input vectors do \b not have to be normalized.
+  */
+template<typename Scalar>
+template<typename Derived1, typename Derived2>
+inline Quaternion<Scalar>& Quaternion<Scalar>::setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
+{
+  Vector3 v0 = a.normalized();
+  Vector3 v1 = b.normalized();
+  Scalar c = v0.eigen2_dot(v1);
+
+  // if dot == 1, vectors are the same
+  if (ei_isApprox(c,Scalar(1)))
+  {
+    // set to identity
+    this->w() = 1; this->vec().setZero();
+    return *this;
+  }
+  // if dot == -1, vectors are opposites
+  if (ei_isApprox(c,Scalar(-1)))
+  {
+    this->vec() = v0.unitOrthogonal();
+    this->w() = 0;
+    return *this;
+  }
+
+  Vector3 axis = v0.cross(v1);
+  Scalar s = ei_sqrt((Scalar(1)+c)*Scalar(2));
+  Scalar invs = Scalar(1)/s;
+  this->vec() = axis * invs;
+  this->w() = s * Scalar(0.5);
+
+  return *this;
+}
+
+/** \returns the multiplicative inverse of \c *this
+  * Note that in most cases, i.e., if you simply want the opposite rotation,
+  * and/or the quaternion is normalized, then it is enough to use the conjugate.
+  *
+  * \sa Quaternion::conjugate()
+  */
+template <typename Scalar>
+inline Quaternion<Scalar> Quaternion<Scalar>::inverse() const
+{
+  // FIXME should this function be called multiplicativeInverse and conjugate() be called inverse() or opposite()  ??
+  Scalar n2 = this->squaredNorm();
+  if (n2 > 0)
+    return Quaternion(conjugate().coeffs() / n2);
+  else
+  {
+    // return an invalid result to flag the error
+    return Quaternion(Coefficients::Zero());
+  }
+}
+
+/** \returns the conjugate of the \c *this which is equal to the multiplicative inverse
+  * if the quaternion is normalized.
+  * The conjugate of a quaternion represents the opposite rotation.
+  *
+  * \sa Quaternion::inverse()
+  */
+template <typename Scalar>
+inline Quaternion<Scalar> Quaternion<Scalar>::conjugate() const
+{
+  return Quaternion(this->w(),-this->x(),-this->y(),-this->z());
+}
+
+/** \returns the angle (in radian) between two rotations
+  * \sa eigen2_dot()
+  */
+template <typename Scalar>
+inline Scalar Quaternion<Scalar>::angularDistance(const Quaternion& other) const
+{
+  double d = ei_abs(this->eigen2_dot(other));
+  if (d>=1.0)
+    return 0;
+  return Scalar(2) * std::acos(d);
+}
+
+/** \returns the spherical linear interpolation between the two quaternions
+  * \c *this and \a other at the parameter \a t
+  */
+template <typename Scalar>
+Quaternion<Scalar> Quaternion<Scalar>::slerp(Scalar t, const Quaternion& other) const
+{
+  static const Scalar one = Scalar(1) - machine_epsilon<Scalar>();
+  Scalar d = this->eigen2_dot(other);
+  Scalar absD = ei_abs(d);
+
+  Scalar scale0;
+  Scalar scale1;
+
+  if (absD>=one)
+  {
+    scale0 = Scalar(1) - t;
+    scale1 = t;
+  }
+  else
+  {
+    // theta is the angle between the 2 quaternions
+    Scalar theta = std::acos(absD);
+    Scalar sinTheta = ei_sin(theta);
+
+    scale0 = ei_sin( ( Scalar(1) - t ) * theta) / sinTheta;
+    scale1 = ei_sin( ( t * theta) ) / sinTheta;
+    if (d<0)
+      scale1 = -scale1;
+  }
+
+  return Quaternion<Scalar>(scale0 * coeffs() + scale1 * other.coeffs());
+}
+
+// set from a rotation matrix
+template<typename Other>
+struct ei_quaternion_assign_impl<Other,3,3>
+{
+  typedef typename Other::Scalar Scalar;
+  static inline void run(Quaternion<Scalar>& q, const Other& mat)
+  {
+    // This algorithm comes from  "Quaternion Calculus and Fast Animation",
+    // Ken Shoemake, 1987 SIGGRAPH course notes
+    Scalar t = mat.trace();
+    if (t > 0)
+    {
+      t = ei_sqrt(t + Scalar(1.0));
+      q.w() = Scalar(0.5)*t;
+      t = Scalar(0.5)/t;
+      q.x() = (mat.coeff(2,1) - mat.coeff(1,2)) * t;
+      q.y() = (mat.coeff(0,2) - mat.coeff(2,0)) * t;
+      q.z() = (mat.coeff(1,0) - mat.coeff(0,1)) * t;
+    }
+    else
+    {
+      int i = 0;
+      if (mat.coeff(1,1) > mat.coeff(0,0))
+        i = 1;
+      if (mat.coeff(2,2) > mat.coeff(i,i))
+        i = 2;
+      int j = (i+1)%3;
+      int k = (j+1)%3;
+
+      t = ei_sqrt(mat.coeff(i,i)-mat.coeff(j,j)-mat.coeff(k,k) + Scalar(1.0));
+      q.coeffs().coeffRef(i) = Scalar(0.5) * t;
+      t = Scalar(0.5)/t;
+      q.w() = (mat.coeff(k,j)-mat.coeff(j,k))*t;
+      q.coeffs().coeffRef(j) = (mat.coeff(j,i)+mat.coeff(i,j))*t;
+      q.coeffs().coeffRef(k) = (mat.coeff(k,i)+mat.coeff(i,k))*t;
+    }
+  }
+};
+
+// set from a vector of coefficients assumed to be a quaternion
+template<typename Other>
+struct ei_quaternion_assign_impl<Other,4,1>
+{
+  typedef typename Other::Scalar Scalar;
+  static inline void run(Quaternion<Scalar>& q, const Other& vec)
+  {
+    q.coeffs() = vec;
+  }
+};
+
+} // end namespace Eigen
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Geometry/Rotation2D.h b/include/eigen3/Eigen/src/Eigen2Support/Geometry/Rotation2D.h
new file mode 100644
index 0000000..19b8582
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Geometry/Rotation2D.h
@@ -0,0 +1,145 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael at free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class Rotation2D
+  *
+  * \brief Represents a rotation/orientation in a 2 dimensional space.
+  *
+  * \param _Scalar the scalar type, i.e., the type of the coefficients
+  *
+  * This class is equivalent to a single scalar representing a counter clock wise rotation
+  * as a single angle in radian. It provides some additional features such as the automatic
+  * conversion from/to a 2x2 rotation matrix. Moreover this class aims to provide a similar
+  * interface to Quaternion in order to facilitate the writing of generic algorithms
+  * dealing with rotations.
+  *
+  * \sa class Quaternion, class Transform
+  */
+template<typename _Scalar> struct ei_traits<Rotation2D<_Scalar> >
+{
+  typedef _Scalar Scalar;
+};
+
+template<typename _Scalar>
+class Rotation2D : public RotationBase<Rotation2D<_Scalar>,2>
+{
+  typedef RotationBase<Rotation2D<_Scalar>,2> Base;
+
+public:
+
+  using Base::operator*;
+
+  enum { Dim = 2 };
+  /** the scalar type of the coefficients */
+  typedef _Scalar Scalar;
+  typedef Matrix<Scalar,2,1> Vector2;
+  typedef Matrix<Scalar,2,2> Matrix2;
+
+protected:
+
+  Scalar m_angle;
+
+public:
+
+  /** Construct a 2D counter clock wise rotation from the angle \a a in radian. */
+  inline Rotation2D(Scalar a) : m_angle(a) {}
+
+  /** \returns the rotation angle */
+  inline Scalar angle() const { return m_angle; }
+
+  /** \returns a read-write reference to the rotation angle */
+  inline Scalar& angle() { return m_angle; }
+
+  /** \returns the inverse rotation */
+  inline Rotation2D inverse() const { return -m_angle; }
+
+  /** Concatenates two rotations */
+  inline Rotation2D operator*(const Rotation2D& other) const
+  { return m_angle + other.m_angle; }
+
+  /** Concatenates two rotations */
+  inline Rotation2D& operator*=(const Rotation2D& other)
+  { return m_angle += other.m_angle; return *this; }
+
+  /** Applies the rotation to a 2D vector */
+  Vector2 operator* (const Vector2& vec) const
+  { return toRotationMatrix() * vec; }
+
+  template<typename Derived>
+  Rotation2D& fromRotationMatrix(const MatrixBase<Derived>& m);
+  Matrix2 toRotationMatrix(void) const;
+
+  /** \returns the spherical interpolation between \c *this and \a other using
+    * parameter \a t. It is in fact equivalent to a linear interpolation.
+    */
+  inline Rotation2D slerp(Scalar t, const Rotation2D& other) const
+  { return m_angle * (1-t) + other.angle() * t; }
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type cast() const
+  { return typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type(*this); }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit Rotation2D(const Rotation2D<OtherScalarType>& other)
+  {
+    m_angle = Scalar(other.angle());
+  }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const Rotation2D& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
+  { return ei_isApprox(m_angle,other.m_angle, prec); }
+};
+
+/** \ingroup Geometry_Module
+  * single precision 2D rotation type */
+typedef Rotation2D<float> Rotation2Df;
+/** \ingroup Geometry_Module
+  * double precision 2D rotation type */
+typedef Rotation2D<double> Rotation2Dd;
+
+/** Set \c *this from a 2x2 rotation matrix \a mat.
+  * In other words, this function extract the rotation angle
+  * from the rotation matrix.
+  */
+template<typename Scalar>
+template<typename Derived>
+Rotation2D<Scalar>& Rotation2D<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
+{
+  EIGEN_STATIC_ASSERT(Derived::RowsAtCompileTime==2 && Derived::ColsAtCompileTime==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
+  m_angle = ei_atan2(mat.coeff(1,0), mat.coeff(0,0));
+  return *this;
+}
+
+/** Constructs and \returns an equivalent 2x2 rotation matrix.
+  */
+template<typename Scalar>
+typename Rotation2D<Scalar>::Matrix2
+Rotation2D<Scalar>::toRotationMatrix(void) const
+{
+  Scalar sinA = ei_sin(m_angle);
+  Scalar cosA = ei_cos(m_angle);
+  return (Matrix2() << cosA, -sinA, sinA, cosA).finished();
+}
+
+} // end namespace Eigen
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Geometry/RotationBase.h b/include/eigen3/Eigen/src/Eigen2Support/Geometry/RotationBase.h
new file mode 100644
index 0000000..b1c8f38
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Geometry/RotationBase.h
@@ -0,0 +1,123 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael at free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
+
+namespace Eigen { 
+
+// this file aims to contains the various representations of rotation/orientation
+// in 2D and 3D space excepted Matrix and Quaternion.
+
+/** \class RotationBase
+  *
+  * \brief Common base class for compact rotation representations
+  *
+  * \param Derived is the derived type, i.e., a rotation type
+  * \param _Dim the dimension of the space
+  */
+template<typename Derived, int _Dim>
+class RotationBase
+{
+  public:
+    enum { Dim = _Dim };
+    /** the scalar type of the coefficients */
+    typedef typename ei_traits<Derived>::Scalar Scalar;
+    
+    /** corresponding linear transformation matrix type */
+    typedef Matrix<Scalar,Dim,Dim> RotationMatrixType;
+
+    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    inline Derived& derived() { return *static_cast<Derived*>(this); }
+
+    /** \returns an equivalent rotation matrix */
+    inline RotationMatrixType toRotationMatrix() const { return derived().toRotationMatrix(); }
+
+    /** \returns the inverse rotation */
+    inline Derived inverse() const { return derived().inverse(); }
+
+    /** \returns the concatenation of the rotation \c *this with a translation \a t */
+    inline Transform<Scalar,Dim> operator*(const Translation<Scalar,Dim>& t) const
+    { return toRotationMatrix() * t; }
+
+    /** \returns the concatenation of the rotation \c *this with a scaling \a s */
+    inline RotationMatrixType operator*(const Scaling<Scalar,Dim>& s) const
+    { return toRotationMatrix() * s; }
+
+    /** \returns the concatenation of the rotation \c *this with an affine transformation \a t */
+    inline Transform<Scalar,Dim> operator*(const Transform<Scalar,Dim>& t) const
+    { return toRotationMatrix() * t; }
+};
+
+/** \geometry_module
+  *
+  * Constructs a Dim x Dim rotation matrix from the rotation \a r
+  */
+template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
+template<typename OtherDerived>
+Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
+::Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
+{
+  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim))
+  *this = r.toRotationMatrix();
+}
+
+/** \geometry_module
+  *
+  * Set a Dim x Dim rotation matrix from the rotation \a r
+  */
+template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
+template<typename OtherDerived>
+Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>&
+Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
+::operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
+{
+  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim))
+  return *this = r.toRotationMatrix();
+}
+
+/** \internal
+  *
+  * Helper function to return an arbitrary rotation object to a rotation matrix.
+  *
+  * \param Scalar the numeric type of the matrix coefficients
+  * \param Dim the dimension of the current space
+  *
+  * It returns a Dim x Dim fixed size matrix.
+  *
+  * Default specializations are provided for:
+  *   - any scalar type (2D),
+  *   - any matrix expression,
+  *   - any type based on RotationBase (e.g., Quaternion, AngleAxis, Rotation2D)
+  *
+  * Currently ei_toRotationMatrix is only used by Transform.
+  *
+  * \sa class Transform, class Rotation2D, class Quaternion, class AngleAxis
+  */
+template<typename Scalar, int Dim>
+static inline Matrix<Scalar,2,2> ei_toRotationMatrix(const Scalar& s)
+{
+  EIGEN_STATIC_ASSERT(Dim==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
+  return Rotation2D<Scalar>(s).toRotationMatrix();
+}
+
+template<typename Scalar, int Dim, typename OtherDerived>
+static inline Matrix<Scalar,Dim,Dim> ei_toRotationMatrix(const RotationBase<OtherDerived,Dim>& r)
+{
+  return r.toRotationMatrix();
+}
+
+template<typename Scalar, int Dim, typename OtherDerived>
+static inline const MatrixBase<OtherDerived>& ei_toRotationMatrix(const MatrixBase<OtherDerived>& mat)
+{
+  EIGEN_STATIC_ASSERT(OtherDerived::RowsAtCompileTime==Dim && OtherDerived::ColsAtCompileTime==Dim,
+    YOU_MADE_A_PROGRAMMING_MISTAKE)
+  return mat;
+}
+
+} // end namespace Eigen
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Geometry/Scaling.h b/include/eigen3/Eigen/src/Eigen2Support/Geometry/Scaling.h
new file mode 100644
index 0000000..b8fa6cd
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Geometry/Scaling.h
@@ -0,0 +1,167 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael at free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class Scaling
+  *
+  * \brief Represents a possibly non uniform scaling transformation
+  *
+  * \param _Scalar the scalar type, i.e., the type of the coefficients.
+  * \param _Dim the  dimension of the space, can be a compile time value or Dynamic
+  *
+  * \note This class is not aimed to be used to store a scaling transformation,
+  * but rather to make easier the constructions and updates of Transform objects.
+  *
+  * \sa class Translation, class Transform
+  */
+template<typename _Scalar, int _Dim>
+class Scaling
+{
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim)
+  /** dimension of the space */
+  enum { Dim = _Dim };
+  /** the scalar type of the coefficients */
+  typedef _Scalar Scalar;
+  /** corresponding vector type */
+  typedef Matrix<Scalar,Dim,1> VectorType;
+  /** corresponding linear transformation matrix type */
+  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
+  /** corresponding translation type */
+  typedef Translation<Scalar,Dim> TranslationType;
+  /** corresponding affine transformation type */
+  typedef Transform<Scalar,Dim> TransformType;
+
+protected:
+
+  VectorType m_coeffs;
+
+public:
+
+  /** Default constructor without initialization. */
+  Scaling() {}
+  /** Constructs and initialize a uniform scaling transformation */
+  explicit inline Scaling(const Scalar& s) { m_coeffs.setConstant(s); }
+  /** 2D only */
+  inline Scaling(const Scalar& sx, const Scalar& sy)
+  {
+    ei_assert(Dim==2);
+    m_coeffs.x() = sx;
+    m_coeffs.y() = sy;
+  }
+  /** 3D only */
+  inline Scaling(const Scalar& sx, const Scalar& sy, const Scalar& sz)
+  {
+    ei_assert(Dim==3);
+    m_coeffs.x() = sx;
+    m_coeffs.y() = sy;
+    m_coeffs.z() = sz;
+  }
+  /** Constructs and initialize the scaling transformation from a vector of scaling coefficients */
+  explicit inline Scaling(const VectorType& coeffs) : m_coeffs(coeffs) {}
+
+  const VectorType& coeffs() const { return m_coeffs; }
+  VectorType& coeffs() { return m_coeffs; }
+
+  /** Concatenates two scaling */
+  inline Scaling operator* (const Scaling& other) const
+  { return Scaling(coeffs().cwise() * other.coeffs()); }
+
+  /** Concatenates a scaling and a translation */
+  inline TransformType operator* (const TranslationType& t) const;
+
+  /** Concatenates a scaling and an affine transformation */
+  inline TransformType operator* (const TransformType& t) const;
+
+  /** Concatenates a scaling and a linear transformation matrix */
+  // TODO returns an expression
+  inline LinearMatrixType operator* (const LinearMatrixType& other) const
+  { return coeffs().asDiagonal() * other; }
+
+  /** Concatenates a linear transformation matrix and a scaling */
+  // TODO returns an expression
+  friend inline LinearMatrixType operator* (const LinearMatrixType& other, const Scaling& s)
+  { return other * s.coeffs().asDiagonal(); }
+
+  template<typename Derived>
+  inline LinearMatrixType operator*(const RotationBase<Derived,Dim>& r) const
+  { return *this * r.toRotationMatrix(); }
+
+  /** Applies scaling to vector */
+  inline VectorType operator* (const VectorType& other) const
+  { return coeffs().asDiagonal() * other; }
+
+  /** \returns the inverse scaling */
+  inline Scaling inverse() const
+  { return Scaling(coeffs().cwise().inverse()); }
+
+  inline Scaling& operator=(const Scaling& other)
+  {
+    m_coeffs = other.m_coeffs;
+    return *this;
+  }
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<Scaling,Scaling<NewScalarType,Dim> >::type cast() const
+  { return typename internal::cast_return_type<Scaling,Scaling<NewScalarType,Dim> >::type(*this); }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit Scaling(const Scaling<OtherScalarType,Dim>& other)
+  { m_coeffs = other.coeffs().template cast<Scalar>(); }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const Scaling& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
+  { return m_coeffs.isApprox(other.m_coeffs, prec); }
+
+};
+
+/** \addtogroup Geometry_Module */
+//@{
+typedef Scaling<float, 2> Scaling2f;
+typedef Scaling<double,2> Scaling2d;
+typedef Scaling<float, 3> Scaling3f;
+typedef Scaling<double,3> Scaling3d;
+//@}
+
+template<typename Scalar, int Dim>
+inline typename Scaling<Scalar,Dim>::TransformType
+Scaling<Scalar,Dim>::operator* (const TranslationType& t) const
+{
+  TransformType res;
+  res.matrix().setZero();
+  res.linear().diagonal() = coeffs();
+  res.translation() = m_coeffs.cwise() * t.vector();
+  res(Dim,Dim) = Scalar(1);
+  return res;
+}
+
+template<typename Scalar, int Dim>
+inline typename Scaling<Scalar,Dim>::TransformType
+Scaling<Scalar,Dim>::operator* (const TransformType& t) const
+{
+  TransformType res = t;
+  res.prescale(m_coeffs);
+  return res;
+}
+
+} // end namespace Eigen
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Geometry/Transform.h b/include/eigen3/Eigen/src/Eigen2Support/Geometry/Transform.h
new file mode 100644
index 0000000..fab60b2
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Geometry/Transform.h
@@ -0,0 +1,786 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael at free.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
+
+namespace Eigen { 
+
+// Note that we have to pass Dim and HDim because it is not allowed to use a template
+// parameter to define a template specialization. To be more precise, in the following
+// specializations, it is not allowed to use Dim+1 instead of HDim.
+template< typename Other,
+          int Dim,
+          int HDim,
+          int OtherRows=Other::RowsAtCompileTime,
+          int OtherCols=Other::ColsAtCompileTime>
+struct ei_transform_product_impl;
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class Transform
+  *
+  * \brief Represents an homogeneous transformation in a N dimensional space
+  *
+  * \param _Scalar the scalar type, i.e., the type of the coefficients
+  * \param _Dim the dimension of the space
+  *
+  * The homography is internally represented and stored as a (Dim+1)^2 matrix which
+  * is available through the matrix() method.
+  *
+  * Conversion methods from/to Qt's QMatrix and QTransform are available if the
+  * preprocessor token EIGEN_QT_SUPPORT is defined.
+  *
+  * \sa class Matrix, class Quaternion
+  */
+template<typename _Scalar, int _Dim>
+class Transform
+{
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim==Dynamic ? Dynamic : (_Dim+1)*(_Dim+1))
+  enum {
+    Dim = _Dim,     ///< space dimension in which the transformation holds
+    HDim = _Dim+1   ///< size of a respective homogeneous vector
+  };
+  /** the scalar type of the coefficients */
+  typedef _Scalar Scalar;
+  /** type of the matrix used to represent the transformation */
+  typedef Matrix<Scalar,HDim,HDim> MatrixType;
+  /** type of the matrix used to represent the linear part of the transformation */
+  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
+  /** type of read/write reference to the linear part of the transformation */
+  typedef Block<MatrixType,Dim,Dim> LinearPart;
+  /** type of read/write reference to the linear part of the transformation */
+  typedef const Block<const MatrixType,Dim,Dim> ConstLinearPart;
+  /** type of a vector */
+  typedef Matrix<Scalar,Dim,1> VectorType;
+  /** type of a read/write reference to the translation part of the rotation */
+  typedef Block<MatrixType,Dim,1> TranslationPart;
+  /** type of a read/write reference to the translation part of the rotation */
+  typedef const Block<const MatrixType,Dim,1> ConstTranslationPart;
+  /** corresponding translation type */
+  typedef Translation<Scalar,Dim> TranslationType;
+  /** corresponding scaling transformation type */
+  typedef Scaling<Scalar,Dim> ScalingType;
+
+protected:
+
+  MatrixType m_matrix;
+
+public:
+
+  /** Default constructor without initialization of the coefficients. */
+  inline Transform() { }
+
+  inline Transform(const Transform& other)
+  {
+    m_matrix = other.m_matrix;
+  }
+
+  inline explicit Transform(const TranslationType& t) { *this = t; }
+  inline explicit Transform(const ScalingType& s) { *this = s; }
+  template<typename Derived>
+  inline explicit Transform(const RotationBase<Derived, Dim>& r) { *this = r; }
+
+  inline Transform& operator=(const Transform& other)
+  { m_matrix = other.m_matrix; return *this; }
+
+  template<typename OtherDerived, bool BigMatrix> // MSVC 2005 will commit suicide if BigMatrix has a default value
+  struct construct_from_matrix
+  {
+    static inline void run(Transform *transform, const MatrixBase<OtherDerived>& other)
+    {
+      transform->matrix() = other;
+    }
+  };
+
+  template<typename OtherDerived> struct construct_from_matrix<OtherDerived, true>
+  {
+    static inline void run(Transform *transform, const MatrixBase<OtherDerived>& other)
+    {
+      transform->linear() = other;
+      transform->translation().setZero();
+      transform->matrix()(Dim,Dim) = Scalar(1);
+      transform->matrix().template block<1,Dim>(Dim,0).setZero();
+    }
+  };
+
+  /** Constructs and initializes a transformation from a Dim^2 or a (Dim+1)^2 matrix. */
+  template<typename OtherDerived>
+  inline explicit Transform(const MatrixBase<OtherDerived>& other)
+  {
+    construct_from_matrix<OtherDerived, int(OtherDerived::RowsAtCompileTime) == Dim>::run(this, other);
+  }
+
+  /** Set \c *this from a (Dim+1)^2 matrix. */
+  template<typename OtherDerived>
+  inline Transform& operator=(const MatrixBase<OtherDerived>& other)
+  { m_matrix = other; return *this; }
+
+  #ifdef EIGEN_QT_SUPPORT
+  inline Transform(const QMatrix& other);
+  inline Transform& operator=(const QMatrix& other);
+  inline QMatrix toQMatrix(void) const;
+  inline Transform(const QTransform& other);
+  inline Transform& operator=(const QTransform& other);
+  inline QTransform toQTransform(void) const;
+  #endif
+
+  /** shortcut for m_matrix(row,col);
+    * \sa MatrixBase::operaror(int,int) const */
+  inline Scalar operator() (int row, int col) const { return m_matrix(row,col); }
+  /** shortcut for m_matrix(row,col);
+    * \sa MatrixBase::operaror(int,int) */
+  inline Scalar& operator() (int row, int col) { return m_matrix(row,col); }
+
+  /** \returns a read-only expression of the transformation matrix */
+  inline const MatrixType& matrix() const { return m_matrix; }
+  /** \returns a writable expression of the transformation matrix */
+  inline MatrixType& matrix() { return m_matrix; }
+
+  /** \returns a read-only expression of the linear (linear) part of the transformation */
+  inline ConstLinearPart linear() const { return m_matrix.template block<Dim,Dim>(0,0); }
+  /** \returns a writable expression of the linear (linear) part of the transformation */
+  inline LinearPart linear() { return m_matrix.template block<Dim,Dim>(0,0); }
+
+  /** \returns a read-only expression of the translation vector of the transformation */
+  inline ConstTranslationPart translation() const { return m_matrix.template block<Dim,1>(0,Dim); }
+  /** \returns a writable expression of the translation vector of the transformation */
+  inline TranslationPart translation() { return m_matrix.template block<Dim,1>(0,Dim); }
+
+  /** \returns an expression of the product between the transform \c *this and a matrix expression \a other
+  *
+  * The right hand side \a other might be either:
+  * \li a vector of size Dim,
+  * \li an homogeneous vector of size Dim+1,
+  * \li a transformation matrix of size Dim+1 x Dim+1.
+  */
+  // note: this function is defined here because some compilers cannot find the respective declaration
+  template<typename OtherDerived>
+  inline const typename ei_transform_product_impl<OtherDerived,_Dim,_Dim+1>::ResultType
+  operator * (const MatrixBase<OtherDerived> &other) const
+  { return ei_transform_product_impl<OtherDerived,Dim,HDim>::run(*this,other.derived()); }
+
+  /** \returns the product expression of a transformation matrix \a a times a transform \a b
+    * The transformation matrix \a a must have a Dim+1 x Dim+1 sizes. */
+  template<typename OtherDerived>
+  friend inline const typename ProductReturnType<OtherDerived,MatrixType>::Type
+  operator * (const MatrixBase<OtherDerived> &a, const Transform &b)
+  { return a.derived() * b.matrix(); }
+
+  /** Contatenates two transformations */
+  inline const Transform
+  operator * (const Transform& other) const
+  { return Transform(m_matrix * other.matrix()); }
+
+  /** \sa MatrixBase::setIdentity() */
+  void setIdentity() { m_matrix.setIdentity(); }
+  static const typename MatrixType::IdentityReturnType Identity()
+  {
+    return MatrixType::Identity();
+  }
+
+  template<typename OtherDerived>
+  inline Transform& scale(const MatrixBase<OtherDerived> &other);
+
+  template<typename OtherDerived>
+  inline Transform& prescale(const MatrixBase<OtherDerived> &other);
+
+  inline Transform& scale(Scalar s);
+  inline Transform& prescale(Scalar s);
+
+  template<typename OtherDerived>
+  inline Transform& translate(const MatrixBase<OtherDerived> &other);
+
+  template<typename OtherDerived>
+  inline Transform& pretranslate(const MatrixBase<OtherDerived> &other);
+
+  template<typename RotationType>
+  inline Transform& rotate(const RotationType& rotation);
+
+  template<typename RotationType>
+  inline Transform& prerotate(const RotationType& rotation);
+
+  Transform& shear(Scalar sx, Scalar sy);
+  Transform& preshear(Scalar sx, Scalar sy);
+
+  inline Transform& operator=(const TranslationType& t);
+  inline Transform& operator*=(const TranslationType& t) { return translate(t.vector()); }
+  inline Transform operator*(const TranslationType& t) const;
+
+  inline Transform& operator=(const ScalingType& t);
+  inline Transform& operator*=(const ScalingType& s) { return scale(s.coeffs()); }
+  inline Transform operator*(const ScalingType& s) const;
+  friend inline Transform operator*(const LinearMatrixType& mat, const Transform& t)
+  {
+    Transform res = t;
+    res.matrix().row(Dim) = t.matrix().row(Dim);
+    res.matrix().template block<Dim,HDim>(0,0) = (mat * t.matrix().template block<Dim,HDim>(0,0)).lazy();
+    return res;
+  }
+
+  template<typename Derived>
+  inline Transform& operator=(const RotationBase<Derived,Dim>& r);
+  template<typename Derived>
+  inline Transform& operator*=(const RotationBase<Derived,Dim>& r) { return rotate(r.toRotationMatrix()); }
+  template<typename Derived>
+  inline Transform operator*(const RotationBase<Derived,Dim>& r) const;
+
+  LinearMatrixType rotation() const;
+  template<typename RotationMatrixType, typename ScalingMatrixType>
+  void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const;
+  template<typename ScalingMatrixType, typename RotationMatrixType>
+  void computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const;
+
+  template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
+  Transform& fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
+    const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale);
+
+  inline const MatrixType inverse(TransformTraits traits = Affine) const;
+
+  /** \returns a const pointer to the column major internal matrix */
+  const Scalar* data() const { return m_matrix.data(); }
+  /** \returns a non-const pointer to the column major internal matrix */
+  Scalar* data() { return m_matrix.data(); }
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim> >::type cast() const
+  { return typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim> >::type(*this); }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit Transform(const Transform<OtherScalarType,Dim>& other)
+  { m_matrix = other.matrix().template cast<Scalar>(); }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const Transform& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
+  { return m_matrix.isApprox(other.m_matrix, prec); }
+
+  #ifdef EIGEN_TRANSFORM_PLUGIN
+  #include EIGEN_TRANSFORM_PLUGIN
+  #endif
+
+protected:
+
+};
+
+/** \ingroup Geometry_Module */
+typedef Transform<float,2> Transform2f;
+/** \ingroup Geometry_Module */
+typedef Transform<float,3> Transform3f;
+/** \ingroup Geometry_Module */
+typedef Transform<double,2> Transform2d;
+/** \ingroup Geometry_Module */
+typedef Transform<double,3> Transform3d;
+
+/**************************
+*** Optional QT support ***
+**************************/
+
+#ifdef EIGEN_QT_SUPPORT
+/** Initialises \c *this from a QMatrix assuming the dimension is 2.
+  *
+  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+  */
+template<typename Scalar, int Dim>
+Transform<Scalar,Dim>::Transform(const QMatrix& other)
+{
+  *this = other;
+}
+
+/** Set \c *this from a QMatrix assuming the dimension is 2.
+  *
+  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+  */
+template<typename Scalar, int Dim>
+Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const QMatrix& other)
+{
+  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  m_matrix << other.m11(), other.m21(), other.dx(),
+              other.m12(), other.m22(), other.dy(),
+              0, 0, 1;
+   return *this;
+}
+
+/** \returns a QMatrix from \c *this assuming the dimension is 2.
+  *
+  * \warning this convertion might loss data if \c *this is not affine
+  *
+  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+  */
+template<typename Scalar, int Dim>
+QMatrix Transform<Scalar,Dim>::toQMatrix(void) const
+{
+  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  return QMatrix(m_matrix.coeff(0,0), m_matrix.coeff(1,0),
+                 m_matrix.coeff(0,1), m_matrix.coeff(1,1),
+                 m_matrix.coeff(0,2), m_matrix.coeff(1,2));
+}
+
+/** Initialises \c *this from a QTransform assuming the dimension is 2.
+  *
+  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+  */
+template<typename Scalar, int Dim>
+Transform<Scalar,Dim>::Transform(const QTransform& other)
+{
+  *this = other;
+}
+
+/** Set \c *this from a QTransform assuming the dimension is 2.
+  *
+  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+  */
+template<typename Scalar, int Dim>
+Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const QTransform& other)
+{
+  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  m_matrix << other.m11(), other.m21(), other.dx(),
+              other.m12(), other.m22(), other.dy(),
+              other.m13(), other.m23(), other.m33();
+   return *this;
+}
+
+/** \returns a QTransform from \c *this assuming the dimension is 2.
+  *
+  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+  */
+template<typename Scalar, int Dim>
+QTransform Transform<Scalar,Dim>::toQTransform(void) const
+{
+  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  return QTransform(m_matrix.coeff(0,0), m_matrix.coeff(1,0), m_matrix.coeff(2,0),
+                    m_matrix.coeff(0,1), m_matrix.coeff(1,1), m_matrix.coeff(2,1),
+                    m_matrix.coeff(0,2), m_matrix.coeff(1,2), m_matrix.coeff(2,2));
+}
+#endif
+
+/*********************
+*** Procedural API ***
+*********************/
+
+/** Applies on the right the non uniform scale transformation represented
+  * by the vector \a other to \c *this and returns a reference to \c *this.
+  * \sa prescale()
+  */
+template<typename Scalar, int Dim>
+template<typename OtherDerived>
+Transform<Scalar,Dim>&
+Transform<Scalar,Dim>::scale(const MatrixBase<OtherDerived> &other)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
+  linear() = (linear() * other.asDiagonal()).lazy();
+  return *this;
+}
+
+/** Applies on the right a uniform scale of a factor \a c to \c *this
+  * and returns a reference to \c *this.
+  * \sa prescale(Scalar)
+  */
+template<typename Scalar, int Dim>
+inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::scale(Scalar s)
+{
+  linear() *= s;
+  return *this;
+}
+
+/** Applies on the left the non uniform scale transformation represented
+  * by the vector \a other to \c *this and returns a reference to \c *this.
+  * \sa scale()
+  */
+template<typename Scalar, int Dim>
+template<typename OtherDerived>
+Transform<Scalar,Dim>&
+Transform<Scalar,Dim>::prescale(const MatrixBase<OtherDerived> &other)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
+  m_matrix.template block<Dim,HDim>(0,0) = (other.asDiagonal() * m_matrix.template block<Dim,HDim>(0,0)).lazy();
+  return *this;
+}
+
+/** Applies on the left a uniform scale of a factor \a c to \c *this
+  * and returns a reference to \c *this.
+  * \sa scale(Scalar)
+  */
+template<typename Scalar, int Dim>
+inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::prescale(Scalar s)
+{
+  m_matrix.template corner<Dim,HDim>(TopLeft) *= s;
+  return *this;
+}
+
+/** Applies on the right the translation matrix represented by the vector \a other
+  * to \c *this and returns a reference to \c *this.
+  * \sa pretranslate()
+  */
+template<typename Scalar, int Dim>
+template<typename OtherDerived>
+Transform<Scalar,Dim>&
+Transform<Scalar,Dim>::translate(const MatrixBase<OtherDerived> &other)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
+  translation() += linear() * other;
+  return *this;
+}
+
+/** Applies on the left the translation matrix represented by the vector \a other
+  * to \c *this and returns a reference to \c *this.
+  * \sa translate()
+  */
+template<typename Scalar, int Dim>
+template<typename OtherDerived>
+Transform<Scalar,Dim>&
+Transform<Scalar,Dim>::pretranslate(const MatrixBase<OtherDerived> &other)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
+  translation() += other;
+  return *this;
+}
+
+/** Applies on the right the rotation represented by the rotation \a rotation
+  * to \c *this and returns a reference to \c *this.
+  *
+  * The template parameter \a RotationType is the type of the rotation which
+  * must be known by ei_toRotationMatrix<>.
+  *
+  * Natively supported types includes:
+  *   - any scalar (2D),
+  *   - a Dim x Dim matrix expression,
+  *   - a Quaternion (3D),
+  *   - a AngleAxis (3D)
+  *
+  * This mechanism is easily extendable to support user types such as Euler angles,
+  * or a pair of Quaternion for 4D rotations.
+  *
+  * \sa rotate(Scalar), class Quaternion, class AngleAxis, prerotate(RotationType)
+  */
+template<typename Scalar, int Dim>
+template<typename RotationType>
+Transform<Scalar,Dim>&
+Transform<Scalar,Dim>::rotate(const RotationType& rotation)
+{
+  linear() *= ei_toRotationMatrix<Scalar,Dim>(rotation);
+  return *this;
+}
+
+/** Applies on the left the rotation represented by the rotation \a rotation
+  * to \c *this and returns a reference to \c *this.
+  *
+  * See rotate() for further details.
+  *
+  * \sa rotate()
+  */
+template<typename Scalar, int Dim>
+template<typename RotationType>
+Transform<Scalar,Dim>&
+Transform<Scalar,Dim>::prerotate(const RotationType& rotation)
+{
+  m_matrix.template block<Dim,HDim>(0,0) = ei_toRotationMatrix<Scalar,Dim>(rotation)
+                                         * m_matrix.template block<Dim,HDim>(0,0);
+  return *this;
+}
+
+/** Applies on the right the shear transformation represented
+  * by the vector \a other to \c *this and returns a reference to \c *this.
+  * \warning 2D only.
+  * \sa preshear()
+  */
+template<typename Scalar, int Dim>
+Transform<Scalar,Dim>&
+Transform<Scalar,Dim>::shear(Scalar sx, Scalar sy)
+{
+  EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  VectorType tmp = linear().col(0)*sy + linear().col(1);
+  linear() << linear().col(0) + linear().col(1)*sx, tmp;
+  return *this;
+}
+
+/** Applies on the left the shear transformation represented
+  * by the vector \a other to \c *this and returns a reference to \c *this.
+  * \warning 2D only.
+  * \sa shear()
+  */
+template<typename Scalar, int Dim>
+Transform<Scalar,Dim>&
+Transform<Scalar,Dim>::preshear(Scalar sx, Scalar sy)
+{
+  EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  m_matrix.template block<Dim,HDim>(0,0) = LinearMatrixType(1, sx, sy, 1) * m_matrix.template block<Dim,HDim>(0,0);
+  return *this;
+}
+
+/******************************************************
+*** Scaling, Translation and Rotation compatibility ***
+******************************************************/
+
+template<typename Scalar, int Dim>
+inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const TranslationType& t)
+{
+  linear().setIdentity();
+  translation() = t.vector();
+  m_matrix.template block<1,Dim>(Dim,0).setZero();
+  m_matrix(Dim,Dim) = Scalar(1);
+  return *this;
+}
+
+template<typename Scalar, int Dim>
+inline Transform<Scalar,Dim> Transform<Scalar,Dim>::operator*(const TranslationType& t) const
+{
+  Transform res = *this;
+  res.translate(t.vector());
+  return res;
+}
+
+template<typename Scalar, int Dim>
+inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const ScalingType& s)
+{
+  m_matrix.setZero();
+  linear().diagonal() = s.coeffs();
+  m_matrix.coeffRef(Dim,Dim) = Scalar(1);
+  return *this;
+}
+
+template<typename Scalar, int Dim>
+inline Transform<Scalar,Dim> Transform<Scalar,Dim>::operator*(const ScalingType& s) const
+{
+  Transform res = *this;
+  res.scale(s.coeffs());
+  return res;
+}
+
+template<typename Scalar, int Dim>
+template<typename Derived>
+inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const RotationBase<Derived,Dim>& r)
+{
+  linear() = ei_toRotationMatrix<Scalar,Dim>(r);
+  translation().setZero();
+  m_matrix.template block<1,Dim>(Dim,0).setZero();
+  m_matrix.coeffRef(Dim,Dim) = Scalar(1);
+  return *this;
+}
+
+template<typename Scalar, int Dim>
+template<typename Derived>
+inline Transform<Scalar,Dim> Transform<Scalar,Dim>::operator*(const RotationBase<Derived,Dim>& r) const
+{
+  Transform res = *this;
+  res.rotate(r.derived());
+  return res;
+}
+
+/************************
+*** Special functions ***
+************************/
+
+/** \returns the rotation part of the transformation
+  * \nonstableyet
+  *
+  * \svd_module
+  *
+  * \sa computeRotationScaling(), computeScalingRotation(), class SVD
+  */
+template<typename Scalar, int Dim>
+typename Transform<Scalar,Dim>::LinearMatrixType
+Transform<Scalar,Dim>::rotation() const
+{
+  LinearMatrixType result;
+  computeRotationScaling(&result, (LinearMatrixType*)0);
+  return result;
+}
+
+
+/** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
+  * not necessarily positive.
+  *
+  * If either pointer is zero, the corresponding computation is skipped.
+  *
+  * \nonstableyet
+  *
+  * \svd_module
+  *
+  * \sa computeScalingRotation(), rotation(), class SVD
+  */
+template<typename Scalar, int Dim>
+template<typename RotationMatrixType, typename ScalingMatrixType>
+void Transform<Scalar,Dim>::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const
+{
+  JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU|ComputeFullV);
+  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
+  Matrix<Scalar, Dim, 1> sv(svd.singularValues());
+  sv.coeffRef(0) *= x;
+  if(scaling)
+  {
+    scaling->noalias() = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
+  }
+  if(rotation)
+  {
+    LinearMatrixType m(svd.matrixU());
+    m.col(0) /= x;
+    rotation->noalias() = m * svd.matrixV().adjoint();
+  }
+}
+
+/** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
+  * not necessarily positive.
+  *
+  * If either pointer is zero, the corresponding computation is skipped.
+  *
+  * \nonstableyet
+  *
+  * \svd_module
+  *
+  * \sa computeRotationScaling(), rotation(), class SVD
+  */
+template<typename Scalar, int Dim>
+template<typename ScalingMatrixType, typename RotationMatrixType>
+void Transform<Scalar,Dim>::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const
+{
+  JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU|ComputeFullV);
+  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
+  Matrix<Scalar, Dim, 1> sv(svd.singularValues());
+  sv.coeffRef(0) *= x;
+  if(scaling)
+  {
+    scaling->noalias() = svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint();
+  }
+  if(rotation)
+  {
+    LinearMatrixType m(svd.matrixU());
+    m.col(0) /= x;
+    rotation->noalias() = m * svd.matrixV().adjoint();
+  }
+}
+
+/** Convenient method to set \c *this from a position, orientation and scale
+  * of a 3D object.
+  */
+template<typename Scalar, int Dim>
+template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
+Transform<Scalar,Dim>&
+Transform<Scalar,Dim>::fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
+  const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale)
+{
+  linear() = ei_toRotationMatrix<Scalar,Dim>(orientation);
+  linear() *= scale.asDiagonal();
+  translation() = position;
+  m_matrix.template block<1,Dim>(Dim,0).setZero();
+  m_matrix(Dim,Dim) = Scalar(1);
+  return *this;
+}
+
+/** \nonstableyet
+  *
+  * \returns the inverse transformation matrix according to some given knowledge
+  * on \c *this.
+  *
+  * \param traits allows to optimize the inversion process when the transformion
+  * is known to be not a general transformation. The possible values are:
+  *  - Projective if the transformation is not necessarily affine, i.e., if the
+  *    last row is not guaranteed to be [0 ... 0 1]
+  *  - Affine is the default, the last row is assumed to be [0 ... 0 1]
+  *  - Isometry if the transformation is only a concatenations of translations
+  *    and rotations.
+  *
+  * \warning unless \a traits is always set to NoShear or NoScaling, this function
+  * requires the generic inverse method of MatrixBase defined in the LU module. If
+  * you forget to include this module, then you will get hard to debug linking errors.
+  *
+  * \sa MatrixBase::inverse()
+  */
+template<typename Scalar, int Dim>
+inline const typename Transform<Scalar,Dim>::MatrixType
+Transform<Scalar,Dim>::inverse(TransformTraits traits) const
+{
+  if (traits == Projective)
+  {
+    return m_matrix.inverse();
+  }
+  else
+  {
+    MatrixType res;
+    if (traits == Affine)
+    {
+      res.template corner<Dim,Dim>(TopLeft) = linear().inverse();
+    }
+    else if (traits == Isometry)
+    {
+      res.template corner<Dim,Dim>(TopLeft) = linear().transpose();
+    }
+    else
+    {
+      ei_assert("invalid traits value in Transform::inverse()");
+    }
+    // translation and remaining parts
+    res.template corner<Dim,1>(TopRight) = - res.template corner<Dim,Dim>(TopLeft) * translation();
+    res.template corner<1,Dim>(BottomLeft).setZero();
+    res.coeffRef(Dim,Dim) = Scalar(1);
+    return res;
+  }
+}
+
+/*****************************************************
+*** Specializations of operator* with a MatrixBase ***
+*****************************************************/
+
+template<typename Other, int Dim, int HDim>
+struct ei_transform_product_impl<Other,Dim,HDim, HDim,HDim>
+{
+  typedef Transform<typename Other::Scalar,Dim> TransformType;
+  typedef typename TransformType::MatrixType MatrixType;
+  typedef typename ProductReturnType<MatrixType,Other>::Type ResultType;
+  static ResultType run(const TransformType& tr, const Other& other)
+  { return tr.matrix() * other; }
+};
+
+template<typename Other, int Dim, int HDim>
+struct ei_transform_product_impl<Other,Dim,HDim, Dim,Dim>
+{
+  typedef Transform<typename Other::Scalar,Dim> TransformType;
+  typedef typename TransformType::MatrixType MatrixType;
+  typedef TransformType ResultType;
+  static ResultType run(const TransformType& tr, const Other& other)
+  {
+    TransformType res;
+    res.translation() = tr.translation();
+    res.matrix().row(Dim) = tr.matrix().row(Dim);
+    res.linear() = (tr.linear() * other).lazy();
+    return res;
+  }
+};
+
+template<typename Other, int Dim, int HDim>
+struct ei_transform_product_impl<Other,Dim,HDim, HDim,1>
+{
+  typedef Transform<typename Other::Scalar,Dim> TransformType;
+  typedef typename TransformType::MatrixType MatrixType;
+  typedef typename ProductReturnType<MatrixType,Other>::Type ResultType;
+  static ResultType run(const TransformType& tr, const Other& other)
+  { return tr.matrix() * other; }
+};
+
+template<typename Other, int Dim, int HDim>
+struct ei_transform_product_impl<Other,Dim,HDim, Dim,1>
+{
+  typedef typename Other::Scalar Scalar;
+  typedef Transform<Scalar,Dim> TransformType;
+  typedef Matrix<Scalar,Dim,1> ResultType;
+  static ResultType run(const TransformType& tr, const Other& other)
+  { return ((tr.linear() * other) + tr.translation())
+          * (Scalar(1) / ( (tr.matrix().template block<1,Dim>(Dim,0) * other).coeff(0) + tr.matrix().coeff(Dim,Dim))); }
+};
+
+} // end namespace Eigen
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Geometry/Translation.h b/include/eigen3/Eigen/src/Eigen2Support/Geometry/Translation.h
new file mode 100644
index 0000000..2b9859f
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Geometry/Translation.h
@@ -0,0 +1,184 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael at free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class Translation
+  *
+  * \brief Represents a translation transformation
+  *
+  * \param _Scalar the scalar type, i.e., the type of the coefficients.
+  * \param _Dim the  dimension of the space, can be a compile time value or Dynamic
+  *
+  * \note This class is not aimed to be used to store a translation transformation,
+  * but rather to make easier the constructions and updates of Transform objects.
+  *
+  * \sa class Scaling, class Transform
+  */
+template<typename _Scalar, int _Dim>
+class Translation
+{
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim)
+  /** dimension of the space */
+  enum { Dim = _Dim };
+  /** the scalar type of the coefficients */
+  typedef _Scalar Scalar;
+  /** corresponding vector type */
+  typedef Matrix<Scalar,Dim,1> VectorType;
+  /** corresponding linear transformation matrix type */
+  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
+  /** corresponding scaling transformation type */
+  typedef Scaling<Scalar,Dim> ScalingType;
+  /** corresponding affine transformation type */
+  typedef Transform<Scalar,Dim> TransformType;
+
+protected:
+
+  VectorType m_coeffs;
+
+public:
+
+  /** Default constructor without initialization. */
+  Translation() {}
+  /**  */
+  inline Translation(const Scalar& sx, const Scalar& sy)
+  {
+    ei_assert(Dim==2);
+    m_coeffs.x() = sx;
+    m_coeffs.y() = sy;
+  }
+  /**  */
+  inline Translation(const Scalar& sx, const Scalar& sy, const Scalar& sz)
+  {
+    ei_assert(Dim==3);
+    m_coeffs.x() = sx;
+    m_coeffs.y() = sy;
+    m_coeffs.z() = sz;
+  }
+  /** Constructs and initialize the scaling transformation from a vector of scaling coefficients */
+  explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
+
+  const VectorType& vector() const { return m_coeffs; }
+  VectorType& vector() { return m_coeffs; }
+
+  /** Concatenates two translation */
+  inline Translation operator* (const Translation& other) const
+  { return Translation(m_coeffs + other.m_coeffs); }
+
+  /** Concatenates a translation and a scaling */
+  inline TransformType operator* (const ScalingType& other) const;
+
+  /** Concatenates a translation and a linear transformation */
+  inline TransformType operator* (const LinearMatrixType& linear) const;
+
+  template<typename Derived>
+  inline TransformType operator*(const RotationBase<Derived,Dim>& r) const
+  { return *this * r.toRotationMatrix(); }
+
+  /** Concatenates a linear transformation and a translation */
+  // its a nightmare to define a templated friend function outside its declaration
+  friend inline TransformType operator* (const LinearMatrixType& linear, const Translation& t)
+  {
+    TransformType res;
+    res.matrix().setZero();
+    res.linear() = linear;
+    res.translation() = linear * t.m_coeffs;
+    res.matrix().row(Dim).setZero();
+    res(Dim,Dim) = Scalar(1);
+    return res;
+  }
+
+  /** Concatenates a translation and an affine transformation */
+  inline TransformType operator* (const TransformType& t) const;
+
+  /** Applies translation to vector */
+  inline VectorType operator* (const VectorType& other) const
+  { return m_coeffs + other; }
+
+  /** \returns the inverse translation (opposite) */
+  Translation inverse() const { return Translation(-m_coeffs); }
+
+  Translation& operator=(const Translation& other)
+  {
+    m_coeffs = other.m_coeffs;
+    return *this;
+  }
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type cast() const
+  { return typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type(*this); }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit Translation(const Translation<OtherScalarType,Dim>& other)
+  { m_coeffs = other.vector().template cast<Scalar>(); }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const Translation& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
+  { return m_coeffs.isApprox(other.m_coeffs, prec); }
+
+};
+
+/** \addtogroup Geometry_Module */
+//@{
+typedef Translation<float, 2> Translation2f;
+typedef Translation<double,2> Translation2d;
+typedef Translation<float, 3> Translation3f;
+typedef Translation<double,3> Translation3d;
+//@}
+
+
+template<typename Scalar, int Dim>
+inline typename Translation<Scalar,Dim>::TransformType
+Translation<Scalar,Dim>::operator* (const ScalingType& other) const
+{
+  TransformType res;
+  res.matrix().setZero();
+  res.linear().diagonal() = other.coeffs();
+  res.translation() = m_coeffs;
+  res(Dim,Dim) = Scalar(1);
+  return res;
+}
+
+template<typename Scalar, int Dim>
+inline typename Translation<Scalar,Dim>::TransformType
+Translation<Scalar,Dim>::operator* (const LinearMatrixType& linear) const
+{
+  TransformType res;
+  res.matrix().setZero();
+  res.linear() = linear;
+  res.translation() = m_coeffs;
+  res.matrix().row(Dim).setZero();
+  res(Dim,Dim) = Scalar(1);
+  return res;
+}
+
+template<typename Scalar, int Dim>
+inline typename Translation<Scalar,Dim>::TransformType
+Translation<Scalar,Dim>::operator* (const TransformType& t) const
+{
+  TransformType res = t;
+  res.pretranslate(m_coeffs);
+  return res;
+}
+
+} // end namespace Eigen
diff --git a/include/eigen3/Eigen/src/Eigen2Support/LU.h b/include/eigen3/Eigen/src/Eigen2Support/LU.h
new file mode 100644
index 0000000..49f19ad
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/LU.h
@@ -0,0 +1,120 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN2_LU_H
+#define EIGEN2_LU_H
+
+namespace Eigen { 
+
+template<typename MatrixType>
+class LU : public FullPivLU<MatrixType>
+{
+  public:
+
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+    typedef Matrix<int, 1, MatrixType::ColsAtCompileTime, MatrixType::Options, 1, MatrixType::MaxColsAtCompileTime> IntRowVectorType;
+    typedef Matrix<int, MatrixType::RowsAtCompileTime, 1, MatrixType::Options, MatrixType::MaxRowsAtCompileTime, 1> IntColVectorType;
+    typedef Matrix<Scalar, 1, MatrixType::ColsAtCompileTime, MatrixType::Options, 1, MatrixType::MaxColsAtCompileTime> RowVectorType;
+    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1, MatrixType::Options, MatrixType::MaxRowsAtCompileTime, 1> ColVectorType;
+
+    typedef Matrix<typename MatrixType::Scalar,
+                  MatrixType::ColsAtCompileTime, // the number of rows in the "kernel matrix" is the number of cols of the original matrix
+                                                 // so that the product "matrix * kernel = zero" makes sense
+                  Dynamic,                       // we don't know at compile-time the dimension of the kernel
+                  MatrixType::Options,
+                  MatrixType::MaxColsAtCompileTime, // see explanation for 2nd template parameter
+                  MatrixType::MaxColsAtCompileTime // the kernel is a subspace of the domain space, whose dimension is the number
+                                                   // of columns of the original matrix
+    > KernelResultType;
+
+    typedef Matrix<typename MatrixType::Scalar,
+                   MatrixType::RowsAtCompileTime, // the image is a subspace of the destination space, whose dimension is the number
+                                                  // of rows of the original matrix
+                   Dynamic,                       // we don't know at compile time the dimension of the image (the rank)
+                   MatrixType::Options,
+                   MatrixType::MaxRowsAtCompileTime, // the image matrix will consist of columns from the original matrix,
+                   MatrixType::MaxColsAtCompileTime  // so it has the same number of rows and at most as many columns.
+    > ImageResultType;
+
+    typedef FullPivLU<MatrixType> Base;
+
+    template<typename T>
+    explicit LU(const T& t) : Base(t), m_originalMatrix(t) {}
+
+    template<typename OtherDerived, typename ResultType>
+    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
+    {
+      *result = static_cast<const Base*>(this)->solve(b);
+      return true;
+    }
+
+    template<typename ResultType>
+    inline void computeInverse(ResultType *result) const
+    {
+      solve(MatrixType::Identity(this->rows(), this->cols()), result);
+    }
+    
+    template<typename KernelMatrixType>
+    void computeKernel(KernelMatrixType *result) const
+    {
+      *result = static_cast<const Base*>(this)->kernel();
+    }
+    
+    template<typename ImageMatrixType>
+    void computeImage(ImageMatrixType *result) const
+    {
+      *result = static_cast<const Base*>(this)->image(m_originalMatrix);
+    }
+    
+    const ImageResultType image() const
+    {
+      return static_cast<const Base*>(this)->image(m_originalMatrix);
+    }
+    
+    const MatrixType& m_originalMatrix;
+};
+
+#if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
+/** \lu_module
+  *
+  * Synonym of partialPivLu().
+  *
+  * \return the partial-pivoting LU decomposition of \c *this.
+  *
+  * \sa class PartialPivLU
+  */
+template<typename Derived>
+inline const LU<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::lu() const
+{
+  return LU<PlainObject>(eval());
+}
+#endif
+
+#ifdef EIGEN2_SUPPORT
+/** \lu_module
+  *
+  * Synonym of partialPivLu().
+  *
+  * \return the partial-pivoting LU decomposition of \c *this.
+  *
+  * \sa class PartialPivLU
+  */
+template<typename Derived>
+inline const LU<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::eigen2_lu() const
+{
+  return LU<PlainObject>(eval());
+}
+#endif
+
+} // end namespace Eigen
+
+#endif // EIGEN2_LU_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Lazy.h b/include/eigen3/Eigen/src/Eigen2Support/Lazy.h
new file mode 100644
index 0000000..593fc78
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Lazy.h
@@ -0,0 +1,71 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LAZY_H
+#define EIGEN_LAZY_H
+
+namespace Eigen { 
+
+/** \deprecated it is only used by lazy() which is deprecated
+  *
+  * \returns an expression of *this with added flags
+  *
+  * Example: \include MatrixBase_marked.cpp
+  * Output: \verbinclude MatrixBase_marked.out
+  *
+  * \sa class Flagged, extract(), part()
+  */
+template<typename Derived>
+template<unsigned int Added>
+inline const Flagged<Derived, Added, 0>
+MatrixBase<Derived>::marked() const
+{
+  return derived();
+}
+
+/** \deprecated use MatrixBase::noalias()
+  *
+  * \returns an expression of *this with the EvalBeforeAssigningBit flag removed.
+  *
+  * Example: \include MatrixBase_lazy.cpp
+  * Output: \verbinclude MatrixBase_lazy.out
+  *
+  * \sa class Flagged, marked()
+  */
+template<typename Derived>
+inline const Flagged<Derived, 0, EvalBeforeAssigningBit>
+MatrixBase<Derived>::lazy() const
+{
+  return derived();
+}
+
+
+/** \internal
+  * Overloaded to perform an efficient C += (A*B).lazy() */
+template<typename Derived>
+template<typename ProductDerived, typename Lhs, typename Rhs>
+Derived& MatrixBase<Derived>::operator+=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
+                                                       EvalBeforeAssigningBit>& other)
+{
+  other._expression().derived().addTo(derived()); return derived();
+}
+
+/** \internal
+  * Overloaded to perform an efficient C -= (A*B).lazy() */
+template<typename Derived>
+template<typename ProductDerived, typename Lhs, typename Rhs>
+Derived& MatrixBase<Derived>::operator-=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
+                                                       EvalBeforeAssigningBit>& other)
+{
+  other._expression().derived().subTo(derived()); return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_LAZY_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/LeastSquares.h b/include/eigen3/Eigen/src/Eigen2Support/LeastSquares.h
new file mode 100644
index 0000000..7992d49
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/LeastSquares.h
@@ -0,0 +1,169 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN2_LEASTSQUARES_H
+#define EIGEN2_LEASTSQUARES_H
+
+namespace Eigen { 
+
+/** \ingroup LeastSquares_Module
+  *
+  * \leastsquares_module
+  *
+  * For a set of points, this function tries to express
+  * one of the coords as a linear (affine) function of the other coords.
+  *
+  * This is best explained by an example. This function works in full
+  * generality, for points in a space of arbitrary dimension, and also over
+  * the complex numbers, but for this example we will work in dimension 3
+  * over the real numbers (doubles).
+  *
+  * So let us work with the following set of 5 points given by their
+  * \f$(x,y,z)\f$ coordinates:
+  * @code
+    Vector3d points[5];
+    points[0] = Vector3d( 3.02, 6.89, -4.32 );
+    points[1] = Vector3d( 2.01, 5.39, -3.79 );
+    points[2] = Vector3d( 2.41, 6.01, -4.01 );
+    points[3] = Vector3d( 2.09, 5.55, -3.86 );
+    points[4] = Vector3d( 2.58, 6.32, -4.10 );
+  * @endcode
+  * Suppose that we want to express the second coordinate (\f$y\f$) as a linear
+  * expression in \f$x\f$ and \f$z\f$, that is,
+  * \f[ y=ax+bz+c \f]
+  * for some constants \f$a,b,c\f$. Thus, we want to find the best possible
+  * constants \f$a,b,c\f$ so that the plane of equation \f$y=ax+bz+c\f$ fits
+  * best the five above points. To do that, call this function as follows:
+  * @code
+    Vector3d coeffs; // will store the coefficients a, b, c
+    linearRegression(
+      5,
+      &points,
+      &coeffs,
+      1 // the coord to express as a function of
+        // the other ones. 0 means x, 1 means y, 2 means z.
+    );
+  * @endcode
+  * Now the vector \a coeffs is approximately
+  * \f$( 0.495 ,  -1.927 ,  -2.906 )\f$.
+  * Thus, we get \f$a=0.495, b = -1.927, c = -2.906\f$. Let us check for
+  * instance how near points[0] is from the plane of equation \f$y=ax+bz+c\f$.
+  * Looking at the coords of points[0], we see that:
+  * \f[ax+bz+c = 0.495 * 3.02 + (-1.927) * (-4.32) + (-2.906) = 6.91.\f]
+  * On the other hand, we have \f$y=6.89\f$. We see that the values
+  * \f$6.91\f$ and \f$6.89\f$
+  * are near, so points[0] is very near the plane of equation \f$y=ax+bz+c\f$.
+  *
+  * Let's now describe precisely the parameters:
+  * @param numPoints the number of points
+  * @param points the array of pointers to the points on which to perform the linear regression
+  * @param result pointer to the vector in which to store the result.
+                  This vector must be of the same type and size as the
+                  data points. The meaning of its coords is as follows.
+                  For brevity, let \f$n=Size\f$,
+                  \f$r_i=result[i]\f$,
+                  and \f$f=funcOfOthers\f$. Denote by
+                  \f$x_0,\ldots,x_{n-1}\f$
+                  the n coordinates in the n-dimensional space.
+                  Then the resulting equation is:
+                  \f[ x_f = r_0 x_0 + \cdots + r_{f-1}x_{f-1}
+                   + r_{f+1}x_{f+1} + \cdots + r_{n-1}x_{n-1} + r_n. \f]
+  * @param funcOfOthers Determines which coord to express as a function of the
+                        others. Coords are numbered starting from 0, so that a
+                        value of 0 means \f$x\f$, 1 means \f$y\f$,
+                        2 means \f$z\f$, ...
+  *
+  * \sa fitHyperplane()
+  */
+template<typename VectorType>
+void linearRegression(int numPoints,
+                      VectorType **points,
+                      VectorType *result,
+                      int funcOfOthers )
+{
+  typedef typename VectorType::Scalar Scalar;
+  typedef Hyperplane<Scalar, VectorType::SizeAtCompileTime> HyperplaneType;
+  const int size = points[0]->size();
+  result->resize(size);
+  HyperplaneType h(size);
+  fitHyperplane(numPoints, points, &h);
+  for(int i = 0; i < funcOfOthers; i++)
+    result->coeffRef(i) = - h.coeffs()[i] / h.coeffs()[funcOfOthers];
+  for(int i = funcOfOthers; i < size; i++)
+    result->coeffRef(i) = - h.coeffs()[i+1] / h.coeffs()[funcOfOthers];
+}
+
+/** \ingroup LeastSquares_Module
+  *
+  * \leastsquares_module
+  *
+  * This function is quite similar to linearRegression(), so we refer to the
+  * documentation of this function and only list here the differences.
+  *
+  * The main difference from linearRegression() is that this function doesn't
+  * take a \a funcOfOthers argument. Instead, it finds a general equation
+  * of the form
+  * \f[ r_0 x_0 + \cdots + r_{n-1}x_{n-1} + r_n = 0, \f]
+  * where \f$n=Size\f$, \f$r_i=retCoefficients[i]\f$, and we denote by
+  * \f$x_0,\ldots,x_{n-1}\f$ the n coordinates in the n-dimensional space.
+  *
+  * Thus, the vector \a retCoefficients has size \f$n+1\f$, which is another
+  * difference from linearRegression().
+  *
+  * In practice, this function performs an hyper-plane fit in a total least square sense
+  * via the following steps:
+  *  1 - center the data to the mean
+  *  2 - compute the covariance matrix
+  *  3 - pick the eigenvector corresponding to the smallest eigenvalue of the covariance matrix
+  * The ratio of the smallest eigenvalue and the second one gives us a hint about the relevance
+  * of the solution. This value is optionally returned in \a soundness.
+  *
+  * \sa linearRegression()
+  */
+template<typename VectorType, typename HyperplaneType>
+void fitHyperplane(int numPoints,
+                   VectorType **points,
+                   HyperplaneType *result,
+                   typename NumTraits<typename VectorType::Scalar>::Real* soundness = 0)
+{
+  typedef typename VectorType::Scalar Scalar;
+  typedef Matrix<Scalar,VectorType::SizeAtCompileTime,VectorType::SizeAtCompileTime> CovMatrixType;
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType)
+  ei_assert(numPoints >= 1);
+  int size = points[0]->size();
+  ei_assert(size+1 == result->coeffs().size());
+
+  // compute the mean of the data
+  VectorType mean = VectorType::Zero(size);
+  for(int i = 0; i < numPoints; ++i)
+    mean += *(points[i]);
+  mean /= numPoints;
+
+  // compute the covariance matrix
+  CovMatrixType covMat = CovMatrixType::Zero(size, size);
+  for(int i = 0; i < numPoints; ++i)
+  {
+    VectorType diff = (*(points[i]) - mean).conjugate();
+    covMat += diff * diff.adjoint();
+  }
+
+  // now we just have to pick the eigen vector with smallest eigen value
+  SelfAdjointEigenSolver<CovMatrixType> eig(covMat);
+  result->normal() = eig.eigenvectors().col(0);
+  if (soundness)
+    *soundness = eig.eigenvalues().coeff(0)/eig.eigenvalues().coeff(1);
+
+  // let's compute the constant coefficient such that the
+  // plane pass trough the mean point:
+  result->offset() = - (result->normal().cwise()* mean).sum();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN2_LEASTSQUARES_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Macros.h b/include/eigen3/Eigen/src/Eigen2Support/Macros.h
new file mode 100644
index 0000000..351c32a
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Macros.h
@@ -0,0 +1,20 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN2_MACROS_H
+#define EIGEN2_MACROS_H
+
+#define ei_assert eigen_assert
+#define ei_internal_assert eigen_internal_assert
+
+#define EIGEN_ALIGN_128 EIGEN_ALIGN16
+
+#define EIGEN_ARCH_WANTS_ALIGNMENT EIGEN_ALIGN_STATICALLY
+
+#endif // EIGEN2_MACROS_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/MathFunctions.h b/include/eigen3/Eigen/src/Eigen2Support/MathFunctions.h
new file mode 100644
index 0000000..3544af2
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/MathFunctions.h
@@ -0,0 +1,57 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN2_MATH_FUNCTIONS_H
+#define EIGEN2_MATH_FUNCTIONS_H
+
+namespace Eigen { 
+
+template<typename T> inline typename NumTraits<T>::Real ei_real(const T& x) { return numext::real(x); }
+template<typename T> inline typename NumTraits<T>::Real ei_imag(const T& x) { return numext::imag(x); }
+template<typename T> inline T ei_conj(const T& x) { return numext::conj(x); }
+template<typename T> inline typename NumTraits<T>::Real ei_abs (const T& x) { using std::abs; return abs(x); }
+template<typename T> inline typename NumTraits<T>::Real ei_abs2(const T& x) { return numext::abs2(x); }
+template<typename T> inline T ei_sqrt(const T& x) { using std::sqrt; return sqrt(x); }
+template<typename T> inline T ei_exp (const T& x) { using std::exp;  return exp(x); }
+template<typename T> inline T ei_log (const T& x) { using std::log;  return log(x); }
+template<typename T> inline T ei_sin (const T& x) { using std::sin;  return sin(x); }
+template<typename T> inline T ei_cos (const T& x) { using std::cos;  return cos(x); }
+template<typename T> inline T ei_atan2(const T& x,const T& y) { using std::atan2; return atan2(x,y); }
+template<typename T> inline T ei_pow (const T& x,const T& y) { return numext::pow(x,y); }
+template<typename T> inline T ei_random () { return internal::random<T>(); }
+template<typename T> inline T ei_random (const T& x, const T& y) { return internal::random(x, y); }
+
+template<typename T> inline T precision () { return NumTraits<T>::dummy_precision(); }
+template<typename T> inline T machine_epsilon () { return NumTraits<T>::epsilon(); }
+
+
+template<typename Scalar, typename OtherScalar>
+inline bool ei_isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
+                                   typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+{
+  return internal::isMuchSmallerThan(x, y, precision);
+}
+
+template<typename Scalar>
+inline bool ei_isApprox(const Scalar& x, const Scalar& y,
+                          typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+{
+  return internal::isApprox(x, y, precision);
+}
+
+template<typename Scalar>
+inline bool ei_isApproxOrLessThan(const Scalar& x, const Scalar& y,
+                                    typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+{
+  return internal::isApproxOrLessThan(x, y, precision);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN2_MATH_FUNCTIONS_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Memory.h b/include/eigen3/Eigen/src/Eigen2Support/Memory.h
new file mode 100644
index 0000000..f86372b
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Memory.h
@@ -0,0 +1,45 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN2_MEMORY_H
+#define EIGEN2_MEMORY_H
+
+namespace Eigen { 
+
+inline void* ei_aligned_malloc(size_t size) { return internal::aligned_malloc(size); }
+inline void  ei_aligned_free(void *ptr) { internal::aligned_free(ptr); }
+inline void* ei_aligned_realloc(void *ptr, size_t new_size, size_t old_size) { return internal::aligned_realloc(ptr, new_size, old_size); }
+inline void* ei_handmade_aligned_malloc(size_t size) { return internal::handmade_aligned_malloc(size); }
+inline void  ei_handmade_aligned_free(void *ptr) { internal::handmade_aligned_free(ptr); }
+
+template<bool Align> inline void* ei_conditional_aligned_malloc(size_t size)
+{
+  return internal::conditional_aligned_malloc<Align>(size);
+}
+template<bool Align> inline void ei_conditional_aligned_free(void *ptr)
+{
+  internal::conditional_aligned_free<Align>(ptr);
+}
+template<bool Align> inline void* ei_conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size)
+{
+  return internal::conditional_aligned_realloc<Align>(ptr, new_size, old_size);
+}
+
+template<typename T> inline T* ei_aligned_new(size_t size)
+{
+  return internal::aligned_new<T>(size);
+}
+template<typename T> inline void ei_aligned_delete(T *ptr, size_t size)
+{
+  return internal::aligned_delete(ptr, size);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN2_MACROS_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Meta.h b/include/eigen3/Eigen/src/Eigen2Support/Meta.h
new file mode 100644
index 0000000..fa37cfc
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Meta.h
@@ -0,0 +1,75 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN2_META_H
+#define EIGEN2_META_H
+
+namespace Eigen { 
+
+template<typename T>
+struct ei_traits : internal::traits<T>
+{};
+
+struct ei_meta_true {  enum { ret = 1 }; };
+struct ei_meta_false { enum { ret = 0 }; };
+
+template<bool Condition, typename Then, typename Else>
+struct ei_meta_if { typedef Then ret; };
+
+template<typename Then, typename Else>
+struct ei_meta_if <false, Then, Else> { typedef Else ret; };
+
+template<typename T, typename U> struct ei_is_same_type { enum { ret = 0 }; };
+template<typename T> struct ei_is_same_type<T,T> { enum { ret = 1 }; };
+
+template<typename T> struct ei_unref { typedef T type; };
+template<typename T> struct ei_unref<T&> { typedef T type; };
+
+template<typename T> struct ei_unpointer { typedef T type; };
+template<typename T> struct ei_unpointer<T*> { typedef T type; };
+template<typename T> struct ei_unpointer<T*const> { typedef T type; };
+
+template<typename T> struct ei_unconst { typedef T type; };
+template<typename T> struct ei_unconst<const T> { typedef T type; };
+template<typename T> struct ei_unconst<T const &> { typedef T & type; };
+template<typename T> struct ei_unconst<T const *> { typedef T * type; };
+
+template<typename T> struct ei_cleantype { typedef T type; };
+template<typename T> struct ei_cleantype<const T>   { typedef typename ei_cleantype<T>::type type; };
+template<typename T> struct ei_cleantype<const T&>  { typedef typename ei_cleantype<T>::type type; };
+template<typename T> struct ei_cleantype<T&>        { typedef typename ei_cleantype<T>::type type; };
+template<typename T> struct ei_cleantype<const T*>  { typedef typename ei_cleantype<T>::type type; };
+template<typename T> struct ei_cleantype<T*>        { typedef typename ei_cleantype<T>::type type; };
+
+/** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
+  * Usage example: \code ei_meta_sqrt<1023>::ret \endcode
+  */
+template<int Y,
+         int InfX = 0,
+         int SupX = ((Y==1) ? 1 : Y/2),
+         bool Done = ((SupX-InfX)<=1 ? true : ((SupX*SupX <= Y) && ((SupX+1)*(SupX+1) > Y))) >
+                                // use ?: instead of || just to shut up a stupid gcc 4.3 warning
+class ei_meta_sqrt
+{
+    enum {
+      MidX = (InfX+SupX)/2,
+      TakeInf = MidX*MidX > Y ? 1 : 0,
+      NewInf = int(TakeInf) ? InfX : int(MidX),
+      NewSup = int(TakeInf) ? int(MidX) : SupX
+    };
+  public:
+    enum { ret = ei_meta_sqrt<Y,NewInf,NewSup>::ret };
+};
+
+template<int Y, int InfX, int SupX>
+class ei_meta_sqrt<Y, InfX, SupX, true> { public:  enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; };
+
+} // end namespace Eigen
+
+#endif // EIGEN2_META_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/Minor.h b/include/eigen3/Eigen/src/Eigen2Support/Minor.h
new file mode 100644
index 0000000..4cded57
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/Minor.h
@@ -0,0 +1,117 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MINOR_H
+#define EIGEN_MINOR_H
+
+namespace Eigen { 
+
+/**
+  * \class Minor
+  *
+  * \brief Expression of a minor
+  *
+  * \param MatrixType the type of the object in which we are taking a minor
+  *
+  * This class represents an expression of a minor. It is the return
+  * type of MatrixBase::minor() and most of the time this is the only way it
+  * is used.
+  *
+  * \sa MatrixBase::minor()
+  */
+
+namespace internal {
+template<typename MatrixType>
+struct traits<Minor<MatrixType> >
+ : traits<MatrixType>
+{
+  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
+  typedef typename MatrixType::StorageKind StorageKind;
+  enum {
+    RowsAtCompileTime = (MatrixType::RowsAtCompileTime != Dynamic) ?
+                          int(MatrixType::RowsAtCompileTime) - 1 : Dynamic,
+    ColsAtCompileTime = (MatrixType::ColsAtCompileTime != Dynamic) ?
+                          int(MatrixType::ColsAtCompileTime) - 1 : Dynamic,
+    MaxRowsAtCompileTime = (MatrixType::MaxRowsAtCompileTime != Dynamic) ?
+                             int(MatrixType::MaxRowsAtCompileTime) - 1 : Dynamic,
+    MaxColsAtCompileTime = (MatrixType::MaxColsAtCompileTime != Dynamic) ?
+                             int(MatrixType::MaxColsAtCompileTime) - 1 : Dynamic,
+    Flags = _MatrixTypeNested::Flags & (HereditaryBits | LvalueBit),
+    CoeffReadCost = _MatrixTypeNested::CoeffReadCost // minor is used typically on tiny matrices,
+      // where loops are unrolled and the 'if' evaluates at compile time
+  };
+};
+}
+
+template<typename MatrixType> class Minor
+  : public MatrixBase<Minor<MatrixType> >
+{
+  public:
+
+    typedef MatrixBase<Minor> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Minor)
+
+    inline Minor(const MatrixType& matrix,
+                       Index row, Index col)
+      : m_matrix(matrix), m_row(row), m_col(col)
+    {
+      eigen_assert(row >= 0 && row < matrix.rows()
+          && col >= 0 && col < matrix.cols());
+    }
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Minor)
+
+    inline Index rows() const { return m_matrix.rows() - 1; }
+    inline Index cols() const { return m_matrix.cols() - 1; }
+
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      return m_matrix.const_cast_derived().coeffRef(row + (row >= m_row), col + (col >= m_col));
+    }
+
+    inline const Scalar coeff(Index row, Index col) const
+    {
+      return m_matrix.coeff(row + (row >= m_row), col + (col >= m_col));
+    }
+
+  protected:
+    const typename MatrixType::Nested m_matrix;
+    const Index m_row, m_col;
+};
+
+/**
+  * \return an expression of the (\a row, \a col)-minor of *this,
+  * i.e. an expression constructed from *this by removing the specified
+  * row and column.
+  *
+  * Example: \include MatrixBase_minor.cpp
+  * Output: \verbinclude MatrixBase_minor.out
+  *
+  * \sa class Minor
+  */
+template<typename Derived>
+inline Minor<Derived>
+MatrixBase<Derived>::minor(Index row, Index col)
+{
+  return Minor<Derived>(derived(), row, col);
+}
+
+/**
+  * This is the const version of minor(). */
+template<typename Derived>
+inline const Minor<Derived>
+MatrixBase<Derived>::minor(Index row, Index col) const
+{
+  return Minor<Derived>(derived(), row, col);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MINOR_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/QR.h b/include/eigen3/Eigen/src/Eigen2Support/QR.h
new file mode 100644
index 0000000..2042c98
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/QR.h
@@ -0,0 +1,67 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael at free.fr>
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN2_QR_H
+#define EIGEN2_QR_H
+
+namespace Eigen { 
+
+template<typename MatrixType>
+class QR : public HouseholderQR<MatrixType>
+{
+  public:
+
+    typedef HouseholderQR<MatrixType> Base;
+    typedef Block<const MatrixType, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> MatrixRBlockType;
+
+    QR() : Base() {}
+
+    template<typename T>
+    explicit QR(const T& t) : Base(t) {}
+
+    template<typename OtherDerived, typename ResultType>
+    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
+    {
+      *result = static_cast<const Base*>(this)->solve(b);
+      return true;
+    }
+
+    MatrixType matrixQ(void) const {
+      MatrixType ret = MatrixType::Identity(this->rows(), this->cols());
+      ret = this->householderQ() * ret;
+      return ret;
+    }
+
+    bool isFullRank() const {
+      return true;
+    }
+    
+    const TriangularView<MatrixRBlockType, UpperTriangular>
+    matrixR(void) const
+    {
+      int cols = this->cols();
+      return MatrixRBlockType(this->matrixQR(), 0, 0, cols, cols).template triangularView<UpperTriangular>();
+    }
+};
+
+/** \return the QR decomposition of \c *this.
+  *
+  * \sa class QR
+  */
+template<typename Derived>
+const QR<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::qr() const
+{
+  return QR<PlainObject>(eval());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN2_QR_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/SVD.h b/include/eigen3/Eigen/src/Eigen2Support/SVD.h
new file mode 100644
index 0000000..3d03d22
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/SVD.h
@@ -0,0 +1,637 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael at free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN2_SVD_H
+#define EIGEN2_SVD_H
+
+namespace Eigen {
+
+/** \ingroup SVD_Module
+  * \nonstableyet
+  *
+  * \class SVD
+  *
+  * \brief Standard SVD decomposition of a matrix and associated features
+  *
+  * \param MatrixType the type of the matrix of which we are computing the SVD decomposition
+  *
+  * This class performs a standard SVD decomposition of a real matrix A of size \c M x \c N
+  * with \c M \>= \c N.
+  *
+  *
+  * \sa MatrixBase::SVD()
+  */
+template<typename MatrixType> class SVD
+{
+  private:
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+
+    enum {
+      PacketSize = internal::packet_traits<Scalar>::size,
+      AlignmentMask = int(PacketSize)-1,
+      MinSize = EIGEN_SIZE_MIN_PREFER_DYNAMIC(MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime)
+    };
+
+    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> ColVector;
+    typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> RowVector;
+
+    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MinSize> MatrixUType;
+    typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> MatrixVType;
+    typedef Matrix<Scalar, MinSize, 1> SingularValuesType;
+
+  public:
+
+    SVD() {} // a user who relied on compiler-generated default compiler reported problems with MSVC in 2.0.7
+    
+    SVD(const MatrixType& matrix)
+      : m_matU(matrix.rows(), (std::min)(matrix.rows(), matrix.cols())),
+        m_matV(matrix.cols(),matrix.cols()),
+        m_sigma((std::min)(matrix.rows(),matrix.cols()))
+    {
+      compute(matrix);
+    }
+
+    template<typename OtherDerived, typename ResultType>
+    bool solve(const MatrixBase<OtherDerived> &b, ResultType* result) const;
+
+    const MatrixUType& matrixU() const { return m_matU; }
+    const SingularValuesType& singularValues() const { return m_sigma; }
+    const MatrixVType& matrixV() const { return m_matV; }
+
+    void compute(const MatrixType& matrix);
+    SVD& sort();
+
+    template<typename UnitaryType, typename PositiveType>
+    void computeUnitaryPositive(UnitaryType *unitary, PositiveType *positive) const;
+    template<typename PositiveType, typename UnitaryType>
+    void computePositiveUnitary(PositiveType *positive, UnitaryType *unitary) const;
+    template<typename RotationType, typename ScalingType>
+    void computeRotationScaling(RotationType *unitary, ScalingType *positive) const;
+    template<typename ScalingType, typename RotationType>
+    void computeScalingRotation(ScalingType *positive, RotationType *unitary) const;
+
+  protected:
+    /** \internal */
+    MatrixUType m_matU;
+    /** \internal */
+    MatrixVType m_matV;
+    /** \internal */
+    SingularValuesType m_sigma;
+};
+
+/** Computes / recomputes the SVD decomposition A = U S V^* of \a matrix
+  *
+  * \note this code has been adapted from JAMA (public domain)
+  */
+template<typename MatrixType>
+void SVD<MatrixType>::compute(const MatrixType& matrix)
+{
+  const int m = matrix.rows();
+  const int n = matrix.cols();
+  const int nu = (std::min)(m,n);
+  ei_assert(m>=n && "In Eigen 2.0, SVD only works for MxN matrices with M>=N. Sorry!");
+  ei_assert(m>1 && "In Eigen 2.0, SVD doesn't work on 1x1 matrices");
+
+  m_matU.resize(m, nu);
+  m_matU.setZero();
+  m_sigma.resize((std::min)(m,n));
+  m_matV.resize(n,n);
+
+  RowVector e(n);
+  ColVector work(m);
+  MatrixType matA(matrix);
+  const bool wantu = true;
+  const bool wantv = true;
+  int i=0, j=0, k=0;
+
+  // Reduce A to bidiagonal form, storing the diagonal elements
+  // in s and the super-diagonal elements in e.
+  int nct = (std::min)(m-1,n);
+  int nrt = (std::max)(0,(std::min)(n-2,m));
+  for (k = 0; k < (std::max)(nct,nrt); ++k)
+  {
+    if (k < nct)
+    {
+      // Compute the transformation for the k-th column and
+      // place the k-th diagonal in m_sigma[k].
+      m_sigma[k] = matA.col(k).end(m-k).norm();
+      if (m_sigma[k] != 0.0) // FIXME
+      {
+        if (matA(k,k) < 0.0)
+          m_sigma[k] = -m_sigma[k];
+        matA.col(k).end(m-k) /= m_sigma[k];
+        matA(k,k) += 1.0;
+      }
+      m_sigma[k] = -m_sigma[k];
+    }
+
+    for (j = k+1; j < n; ++j)
+    {
+      if ((k < nct) && (m_sigma[k] != 0.0))
+      {
+        // Apply the transformation.
+        Scalar t = matA.col(k).end(m-k).eigen2_dot(matA.col(j).end(m-k)); // FIXME dot product or cwise prod + .sum() ??
+        t = -t/matA(k,k);
+        matA.col(j).end(m-k) += t * matA.col(k).end(m-k);
+      }
+
+      // Place the k-th row of A into e for the
+      // subsequent calculation of the row transformation.
+      e[j] = matA(k,j);
+    }
+
+    // Place the transformation in U for subsequent back multiplication.
+    if (wantu & (k < nct))
+      m_matU.col(k).end(m-k) = matA.col(k).end(m-k);
+
+    if (k < nrt)
+    {
+      // Compute the k-th row transformation and place the
+      // k-th super-diagonal in e[k].
+      e[k] = e.end(n-k-1).norm();
+      if (e[k] != 0.0)
+      {
+          if (e[k+1] < 0.0)
+            e[k] = -e[k];
+          e.end(n-k-1) /= e[k];
+          e[k+1] += 1.0;
+      }
+      e[k] = -e[k];
+      if ((k+1 < m) & (e[k] != 0.0))
+      {
+        // Apply the transformation.
+        work.end(m-k-1) = matA.corner(BottomRight,m-k-1,n-k-1) * e.end(n-k-1);
+        for (j = k+1; j < n; ++j)
+          matA.col(j).end(m-k-1) += (-e[j]/e[k+1]) * work.end(m-k-1);
+      }
+
+      // Place the transformation in V for subsequent back multiplication.
+      if (wantv)
+        m_matV.col(k).end(n-k-1) = e.end(n-k-1);
+    }
+  }
+
+
+  // Set up the final bidiagonal matrix or order p.
+  int p = (std::min)(n,m+1);
+  if (nct < n)
+    m_sigma[nct] = matA(nct,nct);
+  if (m < p)
+    m_sigma[p-1] = 0.0;
+  if (nrt+1 < p)
+    e[nrt] = matA(nrt,p-1);
+  e[p-1] = 0.0;
+
+  // If required, generate U.
+  if (wantu)
+  {
+    for (j = nct; j < nu; ++j)
+    {
+      m_matU.col(j).setZero();
+      m_matU(j,j) = 1.0;
+    }
+    for (k = nct-1; k >= 0; k--)
+    {
+      if (m_sigma[k] != 0.0)
+      {
+        for (j = k+1; j < nu; ++j)
+        {
+          Scalar t = m_matU.col(k).end(m-k).eigen2_dot(m_matU.col(j).end(m-k)); // FIXME is it really a dot product we want ?
+          t = -t/m_matU(k,k);
+          m_matU.col(j).end(m-k) += t * m_matU.col(k).end(m-k);
+        }
+        m_matU.col(k).end(m-k) = - m_matU.col(k).end(m-k);
+        m_matU(k,k) = Scalar(1) + m_matU(k,k);
+        if (k-1>0)
+          m_matU.col(k).start(k-1).setZero();
+      }
+      else
+      {
+        m_matU.col(k).setZero();
+        m_matU(k,k) = 1.0;
+      }
+    }
+  }
+
+  // If required, generate V.
+  if (wantv)
+  {
+    for (k = n-1; k >= 0; k--)
+    {
+      if ((k < nrt) & (e[k] != 0.0))
+      {
+        for (j = k+1; j < nu; ++j)
+        {
+          Scalar t = m_matV.col(k).end(n-k-1).eigen2_dot(m_matV.col(j).end(n-k-1)); // FIXME is it really a dot product we want ?
+          t = -t/m_matV(k+1,k);
+          m_matV.col(j).end(n-k-1) += t * m_matV.col(k).end(n-k-1);
+        }
+      }
+      m_matV.col(k).setZero();
+      m_matV(k,k) = 1.0;
+    }
+  }
+
+  // Main iteration loop for the singular values.
+  int pp = p-1;
+  int iter = 0;
+  Scalar eps = ei_pow(Scalar(2),ei_is_same_type<Scalar,float>::ret ? Scalar(-23) : Scalar(-52));
+  while (p > 0)
+  {
+    int k=0;
+    int kase=0;
+
+    // Here is where a test for too many iterations would go.
+
+    // This section of the program inspects for
+    // negligible elements in the s and e arrays.  On
+    // completion the variables kase and k are set as follows.
+
+    // kase = 1     if s(p) and e[k-1] are negligible and k<p
+    // kase = 2     if s(k) is negligible and k<p
+    // kase = 3     if e[k-1] is negligible, k<p, and
+    //              s(k), ..., s(p) are not negligible (qr step).
+    // kase = 4     if e(p-1) is negligible (convergence).
+
+    for (k = p-2; k >= -1; --k)
+    {
+      if (k == -1)
+          break;
+      if (ei_abs(e[k]) <= eps*(ei_abs(m_sigma[k]) + ei_abs(m_sigma[k+1])))
+      {
+          e[k] = 0.0;
+          break;
+      }
+    }
+    if (k == p-2)
+    {
+      kase = 4;
+    }
+    else
+    {
+      int ks;
+      for (ks = p-1; ks >= k; --ks)
+      {
+        if (ks == k)
+          break;
+        Scalar t = (ks != p ? ei_abs(e[ks]) : Scalar(0)) + (ks != k+1 ? ei_abs(e[ks-1]) : Scalar(0));
+        if (ei_abs(m_sigma[ks]) <= eps*t)
+        {
+          m_sigma[ks] = 0.0;
+          break;
+        }
+      }
+      if (ks == k)
+      {
+        kase = 3;
+      }
+      else if (ks == p-1)
+      {
+        kase = 1;
+      }
+      else
+      {
+        kase = 2;
+        k = ks;
+      }
+    }
+    ++k;
+
+    // Perform the task indicated by kase.
+    switch (kase)
+    {
+
+      // Deflate negligible s(p).
+      case 1:
+      {
+        Scalar f(e[p-2]);
+        e[p-2] = 0.0;
+        for (j = p-2; j >= k; --j)
+        {
+          Scalar t(numext::hypot(m_sigma[j],f));
+          Scalar cs(m_sigma[j]/t);
+          Scalar sn(f/t);
+          m_sigma[j] = t;
+          if (j != k)
+          {
+            f = -sn*e[j-1];
+            e[j-1] = cs*e[j-1];
+          }
+          if (wantv)
+          {
+            for (i = 0; i < n; ++i)
+            {
+              t = cs*m_matV(i,j) + sn*m_matV(i,p-1);
+              m_matV(i,p-1) = -sn*m_matV(i,j) + cs*m_matV(i,p-1);
+              m_matV(i,j) = t;
+            }
+          }
+        }
+      }
+      break;
+
+      // Split at negligible s(k).
+      case 2:
+      {
+        Scalar f(e[k-1]);
+        e[k-1] = 0.0;
+        for (j = k; j < p; ++j)
+        {
+          Scalar t(numext::hypot(m_sigma[j],f));
+          Scalar cs( m_sigma[j]/t);
+          Scalar sn(f/t);
+          m_sigma[j] = t;
+          f = -sn*e[j];
+          e[j] = cs*e[j];
+          if (wantu)
+          {
+            for (i = 0; i < m; ++i)
+            {
+              t = cs*m_matU(i,j) + sn*m_matU(i,k-1);
+              m_matU(i,k-1) = -sn*m_matU(i,j) + cs*m_matU(i,k-1);
+              m_matU(i,j) = t;
+            }
+          }
+        }
+      }
+      break;
+
+      // Perform one qr step.
+      case 3:
+      {
+        // Calculate the shift.
+        Scalar scale = (std::max)((std::max)((std::max)((std::max)(
+                        ei_abs(m_sigma[p-1]),ei_abs(m_sigma[p-2])),ei_abs(e[p-2])),
+                        ei_abs(m_sigma[k])),ei_abs(e[k]));
+        Scalar sp = m_sigma[p-1]/scale;
+        Scalar spm1 = m_sigma[p-2]/scale;
+        Scalar epm1 = e[p-2]/scale;
+        Scalar sk = m_sigma[k]/scale;
+        Scalar ek = e[k]/scale;
+        Scalar b = ((spm1 + sp)*(spm1 - sp) + epm1*epm1)/Scalar(2);
+        Scalar c = (sp*epm1)*(sp*epm1);
+        Scalar shift(0);
+        if ((b != 0.0) || (c != 0.0))
+        {
+          shift = ei_sqrt(b*b + c);
+          if (b < 0.0)
+            shift = -shift;
+          shift = c/(b + shift);
+        }
+        Scalar f = (sk + sp)*(sk - sp) + shift;
+        Scalar g = sk*ek;
+
+        // Chase zeros.
+
+        for (j = k; j < p-1; ++j)
+        {
+          Scalar t = numext::hypot(f,g);
+          Scalar cs = f/t;
+          Scalar sn = g/t;
+          if (j != k)
+            e[j-1] = t;
+          f = cs*m_sigma[j] + sn*e[j];
+          e[j] = cs*e[j] - sn*m_sigma[j];
+          g = sn*m_sigma[j+1];
+          m_sigma[j+1] = cs*m_sigma[j+1];
+          if (wantv)
+          {
+            for (i = 0; i < n; ++i)
+            {
+              t = cs*m_matV(i,j) + sn*m_matV(i,j+1);
+              m_matV(i,j+1) = -sn*m_matV(i,j) + cs*m_matV(i,j+1);
+              m_matV(i,j) = t;
+            }
+          }
+          t = numext::hypot(f,g);
+          cs = f/t;
+          sn = g/t;
+          m_sigma[j] = t;
+          f = cs*e[j] + sn*m_sigma[j+1];
+          m_sigma[j+1] = -sn*e[j] + cs*m_sigma[j+1];
+          g = sn*e[j+1];
+          e[j+1] = cs*e[j+1];
+          if (wantu && (j < m-1))
+          {
+            for (i = 0; i < m; ++i)
+            {
+              t = cs*m_matU(i,j) + sn*m_matU(i,j+1);
+              m_matU(i,j+1) = -sn*m_matU(i,j) + cs*m_matU(i,j+1);
+              m_matU(i,j) = t;
+            }
+          }
+        }
+        e[p-2] = f;
+        iter = iter + 1;
+      }
+      break;
+
+      // Convergence.
+      case 4:
+      {
+        // Make the singular values positive.
+        if (m_sigma[k] <= 0.0)
+        {
+          m_sigma[k] = m_sigma[k] < Scalar(0) ? -m_sigma[k] : Scalar(0);
+          if (wantv)
+            m_matV.col(k).start(pp+1) = -m_matV.col(k).start(pp+1);
+        }
+
+        // Order the singular values.
+        while (k < pp)
+        {
+          if (m_sigma[k] >= m_sigma[k+1])
+            break;
+          Scalar t = m_sigma[k];
+          m_sigma[k] = m_sigma[k+1];
+          m_sigma[k+1] = t;
+          if (wantv && (k < n-1))
+            m_matV.col(k).swap(m_matV.col(k+1));
+          if (wantu && (k < m-1))
+            m_matU.col(k).swap(m_matU.col(k+1));
+          ++k;
+        }
+        iter = 0;
+        p--;
+      }
+      break;
+    } // end big switch
+  } // end iterations
+}
+
+template<typename MatrixType>
+SVD<MatrixType>& SVD<MatrixType>::sort()
+{
+  int mu = m_matU.rows();
+  int mv = m_matV.rows();
+  int n  = m_matU.cols();
+
+  for (int i=0; i<n; ++i)
+  {
+    int  k = i;
+    Scalar p = m_sigma.coeff(i);
+
+    for (int j=i+1; j<n; ++j)
+    {
+      if (m_sigma.coeff(j) > p)
+      {
+        k = j;
+        p = m_sigma.coeff(j);
+      }
+    }
+    if (k != i)
+    {
+      m_sigma.coeffRef(k) = m_sigma.coeff(i);  // i.e.
+      m_sigma.coeffRef(i) = p;                 // swaps the i-th and the k-th elements
+
+      int j = mu;
+      for(int s=0; j!=0; ++s, --j)
+        std::swap(m_matU.coeffRef(s,i), m_matU.coeffRef(s,k));
+
+      j = mv;
+      for (int s=0; j!=0; ++s, --j)
+        std::swap(m_matV.coeffRef(s,i), m_matV.coeffRef(s,k));
+    }
+  }
+  return *this;
+}
+
+/** \returns the solution of \f$ A x = b \f$ using the current SVD decomposition of A.
+  * The parts of the solution corresponding to zero singular values are ignored.
+  *
+  * \sa MatrixBase::svd(), LU::solve(), LLT::solve()
+  */
+template<typename MatrixType>
+template<typename OtherDerived, typename ResultType>
+bool SVD<MatrixType>::solve(const MatrixBase<OtherDerived> &b, ResultType* result) const
+{
+  ei_assert(b.rows() == m_matU.rows());
+
+  Scalar maxVal = m_sigma.cwise().abs().maxCoeff();
+  for (int j=0; j<b.cols(); ++j)
+  {
+    Matrix<Scalar,MatrixUType::RowsAtCompileTime,1> aux = m_matU.transpose() * b.col(j);
+
+    for (int i = 0; i <m_matU.cols(); ++i)
+    {
+      Scalar si = m_sigma.coeff(i);
+      if (ei_isMuchSmallerThan(ei_abs(si),maxVal))
+        aux.coeffRef(i) = 0;
+      else
+        aux.coeffRef(i) /= si;
+    }
+
+    result->col(j) = m_matV * aux;
+  }
+  return true;
+}
+
+/** Computes the polar decomposition of the matrix, as a product unitary x positive.
+  *
+  * If either pointer is zero, the corresponding computation is skipped.
+  *
+  * Only for square matrices.
+  *
+  * \sa computePositiveUnitary(), computeRotationScaling()
+  */
+template<typename MatrixType>
+template<typename UnitaryType, typename PositiveType>
+void SVD<MatrixType>::computeUnitaryPositive(UnitaryType *unitary,
+                                             PositiveType *positive) const
+{
+  ei_assert(m_matU.cols() == m_matV.cols() && "Polar decomposition is only for square matrices");
+  if(unitary) *unitary = m_matU * m_matV.adjoint();
+  if(positive) *positive = m_matV * m_sigma.asDiagonal() * m_matV.adjoint();
+}
+
+/** Computes the polar decomposition of the matrix, as a product positive x unitary.
+  *
+  * If either pointer is zero, the corresponding computation is skipped.
+  *
+  * Only for square matrices.
+  *
+  * \sa computeUnitaryPositive(), computeRotationScaling()
+  */
+template<typename MatrixType>
+template<typename UnitaryType, typename PositiveType>
+void SVD<MatrixType>::computePositiveUnitary(UnitaryType *positive,
+                                             PositiveType *unitary) const
+{
+  ei_assert(m_matU.rows() == m_matV.rows() && "Polar decomposition is only for square matrices");
+  if(unitary) *unitary = m_matU * m_matV.adjoint();
+  if(positive) *positive = m_matU * m_sigma.asDiagonal() * m_matU.adjoint();
+}
+
+/** decomposes the matrix as a product rotation x scaling, the scaling being
+  * not necessarily positive.
+  *
+  * If either pointer is zero, the corresponding computation is skipped.
+  *
+  * This method requires the Geometry module.
+  *
+  * \sa computeScalingRotation(), computeUnitaryPositive()
+  */
+template<typename MatrixType>
+template<typename RotationType, typename ScalingType>
+void SVD<MatrixType>::computeRotationScaling(RotationType *rotation, ScalingType *scaling) const
+{
+  ei_assert(m_matU.rows() == m_matV.rows() && "Polar decomposition is only for square matrices");
+  Scalar x = (m_matU * m_matV.adjoint()).determinant(); // so x has absolute value 1
+  Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> sv(m_sigma);
+  sv.coeffRef(0) *= x;
+  if(scaling) scaling->lazyAssign(m_matV * sv.asDiagonal() * m_matV.adjoint());
+  if(rotation)
+  {
+    MatrixType m(m_matU);
+    m.col(0) /= x;
+    rotation->lazyAssign(m * m_matV.adjoint());
+  }
+}
+
+/** decomposes the matrix as a product scaling x rotation, the scaling being
+  * not necessarily positive.
+  *
+  * If either pointer is zero, the corresponding computation is skipped.
+  *
+  * This method requires the Geometry module.
+  *
+  * \sa computeRotationScaling(), computeUnitaryPositive()
+  */
+template<typename MatrixType>
+template<typename ScalingType, typename RotationType>
+void SVD<MatrixType>::computeScalingRotation(ScalingType *scaling, RotationType *rotation) const
+{
+  ei_assert(m_matU.rows() == m_matV.rows() && "Polar decomposition is only for square matrices");
+  Scalar x = (m_matU * m_matV.adjoint()).determinant(); // so x has absolute value 1
+  Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> sv(m_sigma);
+  sv.coeffRef(0) *= x;
+  if(scaling) scaling->lazyAssign(m_matU * sv.asDiagonal() * m_matU.adjoint());
+  if(rotation)
+  {
+    MatrixType m(m_matU);
+    m.col(0) /= x;
+    rotation->lazyAssign(m * m_matV.adjoint());
+  }
+}
+
+
+/** \svd_module
+  * \returns the SVD decomposition of \c *this
+  */
+template<typename Derived>
+inline SVD<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::svd() const
+{
+  return SVD<PlainObject>(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN2_SVD_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/TriangularSolver.h b/include/eigen3/Eigen/src/Eigen2Support/TriangularSolver.h
new file mode 100644
index 0000000..ebbeb3b
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/TriangularSolver.h
@@ -0,0 +1,42 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_SOLVER2_H
+#define EIGEN_TRIANGULAR_SOLVER2_H
+
+namespace Eigen { 
+
+const unsigned int UnitDiagBit = UnitDiag;
+const unsigned int SelfAdjointBit = SelfAdjoint;
+const unsigned int UpperTriangularBit = Upper;
+const unsigned int LowerTriangularBit = Lower;
+
+const unsigned int UpperTriangular = Upper;
+const unsigned int LowerTriangular = Lower;
+const unsigned int UnitUpperTriangular = UnitUpper;
+const unsigned int UnitLowerTriangular = UnitLower;
+
+template<typename ExpressionType, unsigned int Added, unsigned int Removed>
+template<typename OtherDerived>
+typename ExpressionType::PlainObject
+Flagged<ExpressionType,Added,Removed>::solveTriangular(const MatrixBase<OtherDerived>& other) const
+{
+  return m_matrix.template triangularView<Added>().solve(other.derived());
+}
+
+template<typename ExpressionType, unsigned int Added, unsigned int Removed>
+template<typename OtherDerived>
+void Flagged<ExpressionType,Added,Removed>::solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const
+{
+  m_matrix.template triangularView<Added>().solveInPlace(other.derived());
+}
+
+} // end namespace Eigen
+    
+#endif // EIGEN_TRIANGULAR_SOLVER2_H
diff --git a/include/eigen3/Eigen/src/Eigen2Support/VectorBlock.h b/include/eigen3/Eigen/src/Eigen2Support/VectorBlock.h
new file mode 100644
index 0000000..71a8080
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigen2Support/VectorBlock.h
@@ -0,0 +1,94 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN2_VECTORBLOCK_H
+#define EIGEN2_VECTORBLOCK_H
+
+namespace Eigen { 
+
+/** \deprecated use DenseMase::head(Index) */
+template<typename Derived>
+inline VectorBlock<Derived>
+MatrixBase<Derived>::start(Index size)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return VectorBlock<Derived>(derived(), 0, size);
+}
+
+/** \deprecated use DenseMase::head(Index) */
+template<typename Derived>
+inline const VectorBlock<const Derived>
+MatrixBase<Derived>::start(Index size) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return VectorBlock<const Derived>(derived(), 0, size);
+}
+
+/** \deprecated use DenseMase::tail(Index) */
+template<typename Derived>
+inline VectorBlock<Derived>
+MatrixBase<Derived>::end(Index size)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return VectorBlock<Derived>(derived(), this->size() - size, size);
+}
+
+/** \deprecated use DenseMase::tail(Index) */
+template<typename Derived>
+inline const VectorBlock<const Derived>
+MatrixBase<Derived>::end(Index size) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return VectorBlock<const Derived>(derived(), this->size() - size, size);
+}
+
+/** \deprecated use DenseMase::head() */
+template<typename Derived>
+template<int Size>
+inline VectorBlock<Derived,Size>
+MatrixBase<Derived>::start()
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return VectorBlock<Derived,Size>(derived(), 0);
+}
+
+/** \deprecated use DenseMase::head() */
+template<typename Derived>
+template<int Size>
+inline const VectorBlock<const Derived,Size>
+MatrixBase<Derived>::start() const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return VectorBlock<const Derived,Size>(derived(), 0);
+}
+
+/** \deprecated use DenseMase::tail() */
+template<typename Derived>
+template<int Size>
+inline VectorBlock<Derived,Size>
+MatrixBase<Derived>::end()
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return VectorBlock<Derived, Size>(derived(), size() - Size);
+}
+
+/** \deprecated use DenseMase::tail() */
+template<typename Derived>
+template<int Size>
+inline const VectorBlock<const Derived,Size>
+MatrixBase<Derived>::end() const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return VectorBlock<const Derived, Size>(derived(), size() - Size);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN2_VECTORBLOCK_H
diff --git a/include/eigen3/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/include/eigen3/Eigen/src/Eigenvalues/ComplexEigenSolver.h
new file mode 100644
index 0000000..af434bc
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigenvalues/ComplexEigenSolver.h
@@ -0,0 +1,333 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Claire Maurice
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2010,2012 Jitse Niesen <jitse at maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_EIGEN_SOLVER_H
+#define EIGEN_COMPLEX_EIGEN_SOLVER_H
+
+#include "./ComplexSchur.h"
+
+namespace Eigen { 
+
+/** \eigenvalues_module \ingroup Eigenvalues_Module
+  *
+  *
+  * \class ComplexEigenSolver
+  *
+  * \brief Computes eigenvalues and eigenvectors of general complex matrices
+  *
+  * \tparam _MatrixType the type of the matrix of which we are
+  * computing the eigendecomposition; this is expected to be an
+  * instantiation of the Matrix class template.
+  *
+  * The eigenvalues and eigenvectors of a matrix \f$ A \f$ are scalars
+  * \f$ \lambda \f$ and vectors \f$ v \f$ such that \f$ Av = \lambda v
+  * \f$.  If \f$ D \f$ is a diagonal matrix with the eigenvalues on
+  * the diagonal, and \f$ V \f$ is a matrix with the eigenvectors as
+  * its columns, then \f$ A V = V D \f$. The matrix \f$ V \f$ is
+  * almost always invertible, in which case we have \f$ A = V D V^{-1}
+  * \f$. This is called the eigendecomposition.
+  *
+  * The main function in this class is compute(), which computes the
+  * eigenvalues and eigenvectors of a given function. The
+  * documentation for that function contains an example showing the
+  * main features of the class.
+  *
+  * \sa class EigenSolver, class SelfAdjointEigenSolver
+  */
+template<typename _MatrixType> class ComplexEigenSolver
+{
+  public:
+
+    /** \brief Synonym for the template parameter \p _MatrixType. */
+    typedef _MatrixType MatrixType;
+
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      Options = MatrixType::Options,
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+
+    /** \brief Scalar type for matrices of type #MatrixType. */
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef typename MatrixType::Index Index;
+
+    /** \brief Complex scalar type for #MatrixType.
+      *
+      * This is \c std::complex<Scalar> if #Scalar is real (e.g.,
+      * \c float or \c double) and just \c Scalar if #Scalar is
+      * complex.
+      */
+    typedef std::complex<RealScalar> ComplexScalar;
+
+    /** \brief Type for vector of eigenvalues as returned by eigenvalues().
+      *
+      * This is a column vector with entries of type #ComplexScalar.
+      * The length of the vector is the size of #MatrixType.
+      */
+    typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options&(~RowMajor), MaxColsAtCompileTime, 1> EigenvalueType;
+
+    /** \brief Type for matrix of eigenvectors as returned by eigenvectors().
+      *
+      * This is a square matrix with entries of type #ComplexScalar.
+      * The size is the same as the size of #MatrixType.
+      */
+    typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, Options, MaxRowsAtCompileTime, MaxColsAtCompileTime> EigenvectorType;
+
+    /** \brief Default constructor.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via compute().
+      */
+    ComplexEigenSolver()
+            : m_eivec(),
+              m_eivalues(),
+              m_schur(),
+              m_isInitialized(false),
+              m_eigenvectorsOk(false),
+              m_matX()
+    {}
+
+    /** \brief Default Constructor with memory preallocation
+      *
+      * Like the default constructor but with preallocation of the internal data
+      * according to the specified problem \a size.
+      * \sa ComplexEigenSolver()
+      */
+    ComplexEigenSolver(Index size)
+            : m_eivec(size, size),
+              m_eivalues(size),
+              m_schur(size),
+              m_isInitialized(false),
+              m_eigenvectorsOk(false),
+              m_matX(size, size)
+    {}
+
+    /** \brief Constructor; computes eigendecomposition of given matrix.
+      *
+      * \param[in]  matrix  Square matrix whose eigendecomposition is to be computed.
+      * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
+      *    eigenvalues are computed; if false, only the eigenvalues are
+      *    computed.
+      *
+      * This constructor calls compute() to compute the eigendecomposition.
+      */
+      ComplexEigenSolver(const MatrixType& matrix, bool computeEigenvectors = true)
+            : m_eivec(matrix.rows(),matrix.cols()),
+              m_eivalues(matrix.cols()),
+              m_schur(matrix.rows()),
+              m_isInitialized(false),
+              m_eigenvectorsOk(false),
+              m_matX(matrix.rows(),matrix.cols())
+    {
+      compute(matrix, computeEigenvectors);
+    }
+
+    /** \brief Returns the eigenvectors of given matrix.
+      *
+      * \returns  A const reference to the matrix whose columns are the eigenvectors.
+      *
+      * \pre Either the constructor
+      * ComplexEigenSolver(const MatrixType& matrix, bool) or the member
+      * function compute(const MatrixType& matrix, bool) has been called before
+      * to compute the eigendecomposition of a matrix, and
+      * \p computeEigenvectors was set to true (the default).
+      *
+      * This function returns a matrix whose columns are the eigenvectors. Column
+      * \f$ k \f$ is an eigenvector corresponding to eigenvalue number \f$ k
+      * \f$ as returned by eigenvalues().  The eigenvectors are normalized to
+      * have (Euclidean) norm equal to one. The matrix returned by this
+      * function is the matrix \f$ V \f$ in the eigendecomposition \f$ A = V D
+      * V^{-1} \f$, if it exists.
+      *
+      * Example: \include ComplexEigenSolver_eigenvectors.cpp
+      * Output: \verbinclude ComplexEigenSolver_eigenvectors.out
+      */
+    const EigenvectorType& eigenvectors() const
+    {
+      eigen_assert(m_isInitialized && "ComplexEigenSolver is not initialized.");
+      eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+      return m_eivec;
+    }
+
+    /** \brief Returns the eigenvalues of given matrix.
+      *
+      * \returns A const reference to the column vector containing the eigenvalues.
+      *
+      * \pre Either the constructor
+      * ComplexEigenSolver(const MatrixType& matrix, bool) or the member
+      * function compute(const MatrixType& matrix, bool) has been called before
+      * to compute the eigendecomposition of a matrix.
+      *
+      * This function returns a column vector containing the
+      * eigenvalues. Eigenvalues are repeated according to their
+      * algebraic multiplicity, so there are as many eigenvalues as
+      * rows in the matrix. The eigenvalues are not sorted in any particular
+      * order.
+      *
+      * Example: \include ComplexEigenSolver_eigenvalues.cpp
+      * Output: \verbinclude ComplexEigenSolver_eigenvalues.out
+      */
+    const EigenvalueType& eigenvalues() const
+    {
+      eigen_assert(m_isInitialized && "ComplexEigenSolver is not initialized.");
+      return m_eivalues;
+    }
+
+    /** \brief Computes eigendecomposition of given matrix.
+      *
+      * \param[in]  matrix  Square matrix whose eigendecomposition is to be computed.
+      * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
+      *    eigenvalues are computed; if false, only the eigenvalues are
+      *    computed.
+      * \returns    Reference to \c *this
+      *
+      * This function computes the eigenvalues of the complex matrix \p matrix.
+      * The eigenvalues() function can be used to retrieve them.  If
+      * \p computeEigenvectors is true, then the eigenvectors are also computed
+      * and can be retrieved by calling eigenvectors().
+      *
+      * The matrix is first reduced to Schur form using the
+      * ComplexSchur class. The Schur decomposition is then used to
+      * compute the eigenvalues and eigenvectors.
+      *
+      * The cost of the computation is dominated by the cost of the
+      * Schur decomposition, which is \f$ O(n^3) \f$ where \f$ n \f$
+      * is the size of the matrix.
+      *
+      * Example: \include ComplexEigenSolver_compute.cpp
+      * Output: \verbinclude ComplexEigenSolver_compute.out
+      */
+    ComplexEigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true);
+
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "ComplexEigenSolver is not initialized.");
+      return m_schur.info();
+    }
+
+    /** \brief Sets the maximum number of iterations allowed. */
+    ComplexEigenSolver& setMaxIterations(Index maxIters)
+    {
+      m_schur.setMaxIterations(maxIters);
+      return *this;
+    }
+
+    /** \brief Returns the maximum number of iterations. */
+    Index getMaxIterations()
+    {
+      return m_schur.getMaxIterations();
+    }
+
+  protected:
+    EigenvectorType m_eivec;
+    EigenvalueType m_eivalues;
+    ComplexSchur<MatrixType> m_schur;
+    bool m_isInitialized;
+    bool m_eigenvectorsOk;
+    EigenvectorType m_matX;
+
+  private:
+    void doComputeEigenvectors(const RealScalar& matrixnorm);
+    void sortEigenvalues(bool computeEigenvectors);
+};
+
+
+template<typename MatrixType>
+ComplexEigenSolver<MatrixType>& 
+ComplexEigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvectors)
+{
+  // this code is inspired from Jampack
+  eigen_assert(matrix.cols() == matrix.rows());
+
+  // Do a complex Schur decomposition, A = U T U^*
+  // The eigenvalues are on the diagonal of T.
+  m_schur.compute(matrix, computeEigenvectors);
+
+  if(m_schur.info() == Success)
+  {
+    m_eivalues = m_schur.matrixT().diagonal();
+    if(computeEigenvectors)
+      doComputeEigenvectors(matrix.norm());
+    sortEigenvalues(computeEigenvectors);
+  }
+
+  m_isInitialized = true;
+  m_eigenvectorsOk = computeEigenvectors;
+  return *this;
+}
+
+
+template<typename MatrixType>
+void ComplexEigenSolver<MatrixType>::doComputeEigenvectors(const RealScalar& matrixnorm)
+{
+  const Index n = m_eivalues.size();
+
+  // Compute X such that T = X D X^(-1), where D is the diagonal of T.
+  // The matrix X is unit triangular.
+  m_matX = EigenvectorType::Zero(n, n);
+  for(Index k=n-1 ; k>=0 ; k--)
+  {
+    m_matX.coeffRef(k,k) = ComplexScalar(1.0,0.0);
+    // Compute X(i,k) using the (i,k) entry of the equation X T = D X
+    for(Index i=k-1 ; i>=0 ; i--)
+    {
+      m_matX.coeffRef(i,k) = -m_schur.matrixT().coeff(i,k);
+      if(k-i-1>0)
+        m_matX.coeffRef(i,k) -= (m_schur.matrixT().row(i).segment(i+1,k-i-1) * m_matX.col(k).segment(i+1,k-i-1)).value();
+      ComplexScalar z = m_schur.matrixT().coeff(i,i) - m_schur.matrixT().coeff(k,k);
+      if(z==ComplexScalar(0))
+      {
+        // If the i-th and k-th eigenvalue are equal, then z equals 0.
+        // Use a small value instead, to prevent division by zero.
+        numext::real_ref(z) = NumTraits<RealScalar>::epsilon() * matrixnorm;
+      }
+      m_matX.coeffRef(i,k) = m_matX.coeff(i,k) / z;
+    }
+  }
+
+  // Compute V as V = U X; now A = U T U^* = U X D X^(-1) U^* = V D V^(-1)
+  m_eivec.noalias() = m_schur.matrixU() * m_matX;
+  // .. and normalize the eigenvectors
+  for(Index k=0 ; k<n ; k++)
+  {
+    m_eivec.col(k).normalize();
+  }
+}
+
+
+template<typename MatrixType>
+void ComplexEigenSolver<MatrixType>::sortEigenvalues(bool computeEigenvectors)
+{
+  const Index n =  m_eivalues.size();
+  for (Index i=0; i<n; i++)
+  {
+    Index k;
+    m_eivalues.cwiseAbs().tail(n-i).minCoeff(&k);
+    if (k != 0)
+    {
+      k += i;
+      std::swap(m_eivalues[k],m_eivalues[i]);
+      if(computeEigenvectors)
+	m_eivec.col(i).swap(m_eivec.col(k));
+    }
+  }
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_EIGEN_SOLVER_H
diff --git a/include/eigen3/Eigen/src/Eigenvalues/ComplexSchur.h b/include/eigen3/Eigen/src/Eigenvalues/ComplexSchur.h
new file mode 100644
index 0000000..89e6cad
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigenvalues/ComplexSchur.h
@@ -0,0 +1,456 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Claire Maurice
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2010,2012 Jitse Niesen <jitse at maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_SCHUR_H
+#define EIGEN_COMPLEX_SCHUR_H
+
+#include "./HessenbergDecomposition.h"
+
+namespace Eigen { 
+
+namespace internal {
+template<typename MatrixType, bool IsComplex> struct complex_schur_reduce_to_hessenberg;
+}
+
+/** \eigenvalues_module \ingroup Eigenvalues_Module
+  *
+  *
+  * \class ComplexSchur
+  *
+  * \brief Performs a complex Schur decomposition of a real or complex square matrix
+  *
+  * \tparam _MatrixType the type of the matrix of which we are
+  * computing the Schur decomposition; this is expected to be an
+  * instantiation of the Matrix class template.
+  *
+  * Given a real or complex square matrix A, this class computes the
+  * Schur decomposition: \f$ A = U T U^*\f$ where U is a unitary
+  * complex matrix, and T is a complex upper triangular matrix.  The
+  * diagonal of the matrix T corresponds to the eigenvalues of the
+  * matrix A.
+  *
+  * Call the function compute() to compute the Schur decomposition of
+  * a given matrix. Alternatively, you can use the 
+  * ComplexSchur(const MatrixType&, bool) constructor which computes
+  * the Schur decomposition at construction time. Once the
+  * decomposition is computed, you can use the matrixU() and matrixT()
+  * functions to retrieve the matrices U and V in the decomposition.
+  *
+  * \note This code is inspired from Jampack
+  *
+  * \sa class RealSchur, class EigenSolver, class ComplexEigenSolver
+  */
+template<typename _MatrixType> class ComplexSchur
+{
+  public:
+    typedef _MatrixType MatrixType;
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      Options = MatrixType::Options,
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+
+    /** \brief Scalar type for matrices of type \p _MatrixType. */
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef typename MatrixType::Index Index;
+
+    /** \brief Complex scalar type for \p _MatrixType. 
+      *
+      * This is \c std::complex<Scalar> if #Scalar is real (e.g.,
+      * \c float or \c double) and just \c Scalar if #Scalar is
+      * complex.
+      */
+    typedef std::complex<RealScalar> ComplexScalar;
+
+    /** \brief Type for the matrices in the Schur decomposition.
+      *
+      * This is a square matrix with entries of type #ComplexScalar. 
+      * The size is the same as the size of \p _MatrixType.
+      */
+    typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, Options, MaxRowsAtCompileTime, MaxColsAtCompileTime> ComplexMatrixType;
+
+    /** \brief Default constructor.
+      *
+      * \param [in] size  Positive integer, size of the matrix whose Schur decomposition will be computed.
+      *
+      * The default constructor is useful in cases in which the user
+      * intends to perform decompositions via compute().  The \p size
+      * parameter is only used as a hint. It is not an error to give a
+      * wrong \p size, but it may impair performance.
+      *
+      * \sa compute() for an example.
+      */
+    ComplexSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
+      : m_matT(size,size),
+        m_matU(size,size),
+        m_hess(size),
+        m_isInitialized(false),
+        m_matUisUptodate(false),
+        m_maxIters(-1)
+    {}
+
+    /** \brief Constructor; computes Schur decomposition of given matrix. 
+      * 
+      * \param[in]  matrix    Square matrix whose Schur decomposition is to be computed.
+      * \param[in]  computeU  If true, both T and U are computed; if false, only T is computed.
+      *
+      * This constructor calls compute() to compute the Schur decomposition.
+      *
+      * \sa matrixT() and matrixU() for examples.
+      */
+    ComplexSchur(const MatrixType& matrix, bool computeU = true)
+      : m_matT(matrix.rows(),matrix.cols()),
+        m_matU(matrix.rows(),matrix.cols()),
+        m_hess(matrix.rows()),
+        m_isInitialized(false),
+        m_matUisUptodate(false),
+        m_maxIters(-1)
+    {
+      compute(matrix, computeU);
+    }
+
+    /** \brief Returns the unitary matrix in the Schur decomposition. 
+      *
+      * \returns A const reference to the matrix U.
+      *
+      * It is assumed that either the constructor
+      * ComplexSchur(const MatrixType& matrix, bool computeU) or the
+      * member function compute(const MatrixType& matrix, bool computeU)
+      * has been called before to compute the Schur decomposition of a
+      * matrix, and that \p computeU was set to true (the default
+      * value).
+      *
+      * Example: \include ComplexSchur_matrixU.cpp
+      * Output: \verbinclude ComplexSchur_matrixU.out
+      */
+    const ComplexMatrixType& matrixU() const
+    {
+      eigen_assert(m_isInitialized && "ComplexSchur is not initialized.");
+      eigen_assert(m_matUisUptodate && "The matrix U has not been computed during the ComplexSchur decomposition.");
+      return m_matU;
+    }
+
+    /** \brief Returns the triangular matrix in the Schur decomposition. 
+      *
+      * \returns A const reference to the matrix T.
+      *
+      * It is assumed that either the constructor
+      * ComplexSchur(const MatrixType& matrix, bool computeU) or the
+      * member function compute(const MatrixType& matrix, bool computeU)
+      * has been called before to compute the Schur decomposition of a
+      * matrix.
+      *
+      * Note that this function returns a plain square matrix. If you want to reference
+      * only the upper triangular part, use:
+      * \code schur.matrixT().triangularView<Upper>() \endcode 
+      *
+      * Example: \include ComplexSchur_matrixT.cpp
+      * Output: \verbinclude ComplexSchur_matrixT.out
+      */
+    const ComplexMatrixType& matrixT() const
+    {
+      eigen_assert(m_isInitialized && "ComplexSchur is not initialized.");
+      return m_matT;
+    }
+
+    /** \brief Computes Schur decomposition of given matrix. 
+      * 
+      * \param[in]  matrix  Square matrix whose Schur decomposition is to be computed.
+      * \param[in]  computeU  If true, both T and U are computed; if false, only T is computed.
+
+      * \returns    Reference to \c *this
+      *
+      * The Schur decomposition is computed by first reducing the
+      * matrix to Hessenberg form using the class
+      * HessenbergDecomposition. The Hessenberg matrix is then reduced
+      * to triangular form by performing QR iterations with a single
+      * shift. The cost of computing the Schur decomposition depends
+      * on the number of iterations; as a rough guide, it may be taken
+      * on the number of iterations; as a rough guide, it may be taken
+      * to be \f$25n^3\f$ complex flops, or \f$10n^3\f$ complex flops
+      * if \a computeU is false.
+      *
+      * Example: \include ComplexSchur_compute.cpp
+      * Output: \verbinclude ComplexSchur_compute.out
+      *
+      * \sa compute(const MatrixType&, bool, Index)
+      */
+    ComplexSchur& compute(const MatrixType& matrix, bool computeU = true);
+    
+    /** \brief Compute Schur decomposition from a given Hessenberg matrix
+     *  \param[in] matrixH Matrix in Hessenberg form H
+     *  \param[in] matrixQ orthogonal matrix Q that transform a matrix A to H : A = Q H Q^T
+     *  \param computeU Computes the matriX U of the Schur vectors
+     * \return Reference to \c *this
+     * 
+     *  This routine assumes that the matrix is already reduced in Hessenberg form matrixH
+     *  using either the class HessenbergDecomposition or another mean. 
+     *  It computes the upper quasi-triangular matrix T of the Schur decomposition of H
+     *  When computeU is true, this routine computes the matrix U such that 
+     *  A = U T U^T =  (QZ) T (QZ)^T = Q H Q^T where A is the initial matrix
+     * 
+     * NOTE Q is referenced if computeU is true; so, if the initial orthogonal matrix
+     * is not available, the user should give an identity matrix (Q.setIdentity())
+     * 
+     * \sa compute(const MatrixType&, bool)
+     */
+    template<typename HessMatrixType, typename OrthMatrixType>
+    ComplexSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ,  bool computeU=true);
+
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "ComplexSchur is not initialized.");
+      return m_info;
+    }
+
+    /** \brief Sets the maximum number of iterations allowed. 
+      *
+      * If not specified by the user, the maximum number of iterations is m_maxIterationsPerRow times the size
+      * of the matrix.
+      */
+    ComplexSchur& setMaxIterations(Index maxIters)
+    {
+      m_maxIters = maxIters;
+      return *this;
+    }
+
+    /** \brief Returns the maximum number of iterations. */
+    Index getMaxIterations()
+    {
+      return m_maxIters;
+    }
+
+    /** \brief Maximum number of iterations per row.
+      *
+      * If not otherwise specified, the maximum number of iterations is this number times the size of the
+      * matrix. It is currently set to 30.
+      */
+    static const int m_maxIterationsPerRow = 30;
+
+  protected:
+    ComplexMatrixType m_matT, m_matU;
+    HessenbergDecomposition<MatrixType> m_hess;
+    ComputationInfo m_info;
+    bool m_isInitialized;
+    bool m_matUisUptodate;
+    Index m_maxIters;
+
+  private:  
+    bool subdiagonalEntryIsNeglegible(Index i);
+    ComplexScalar computeShift(Index iu, Index iter);
+    void reduceToTriangularForm(bool computeU);
+    friend struct internal::complex_schur_reduce_to_hessenberg<MatrixType, NumTraits<Scalar>::IsComplex>;
+};
+
+/** If m_matT(i+1,i) is neglegible in floating point arithmetic
+  * compared to m_matT(i,i) and m_matT(j,j), then set it to zero and
+  * return true, else return false. */
+template<typename MatrixType>
+inline bool ComplexSchur<MatrixType>::subdiagonalEntryIsNeglegible(Index i)
+{
+  RealScalar d = numext::norm1(m_matT.coeff(i,i)) + numext::norm1(m_matT.coeff(i+1,i+1));
+  RealScalar sd = numext::norm1(m_matT.coeff(i+1,i));
+  if (internal::isMuchSmallerThan(sd, d, NumTraits<RealScalar>::epsilon()))
+  {
+    m_matT.coeffRef(i+1,i) = ComplexScalar(0);
+    return true;
+  }
+  return false;
+}
+
+
+/** Compute the shift in the current QR iteration. */
+template<typename MatrixType>
+typename ComplexSchur<MatrixType>::ComplexScalar ComplexSchur<MatrixType>::computeShift(Index iu, Index iter)
+{
+  using std::abs;
+  if (iter == 10 || iter == 20) 
+  {
+    // exceptional shift, taken from http://www.netlib.org/eispack/comqr.f
+    return abs(numext::real(m_matT.coeff(iu,iu-1))) + abs(numext::real(m_matT.coeff(iu-1,iu-2)));
+  }
+
+  // compute the shift as one of the eigenvalues of t, the 2x2
+  // diagonal block on the bottom of the active submatrix
+  Matrix<ComplexScalar,2,2> t = m_matT.template block<2,2>(iu-1,iu-1);
+  RealScalar normt = t.cwiseAbs().sum();
+  t /= normt;     // the normalization by sf is to avoid under/overflow
+
+  ComplexScalar b = t.coeff(0,1) * t.coeff(1,0);
+  ComplexScalar c = t.coeff(0,0) - t.coeff(1,1);
+  ComplexScalar disc = sqrt(c*c + RealScalar(4)*b);
+  ComplexScalar det = t.coeff(0,0) * t.coeff(1,1) - b;
+  ComplexScalar trace = t.coeff(0,0) + t.coeff(1,1);
+  ComplexScalar eival1 = (trace + disc) / RealScalar(2);
+  ComplexScalar eival2 = (trace - disc) / RealScalar(2);
+
+  if(numext::norm1(eival1) > numext::norm1(eival2))
+    eival2 = det / eival1;
+  else
+    eival1 = det / eival2;
+
+  // choose the eigenvalue closest to the bottom entry of the diagonal
+  if(numext::norm1(eival1-t.coeff(1,1)) < numext::norm1(eival2-t.coeff(1,1)))
+    return normt * eival1;
+  else
+    return normt * eival2;
+}
+
+
+template<typename MatrixType>
+ComplexSchur<MatrixType>& ComplexSchur<MatrixType>::compute(const MatrixType& matrix, bool computeU)
+{
+  m_matUisUptodate = false;
+  eigen_assert(matrix.cols() == matrix.rows());
+
+  if(matrix.cols() == 1)
+  {
+    m_matT = matrix.template cast<ComplexScalar>();
+    if(computeU)  m_matU = ComplexMatrixType::Identity(1,1);
+    m_info = Success;
+    m_isInitialized = true;
+    m_matUisUptodate = computeU;
+    return *this;
+  }
+
+  internal::complex_schur_reduce_to_hessenberg<MatrixType, NumTraits<Scalar>::IsComplex>::run(*this, matrix, computeU);
+  computeFromHessenberg(m_matT, m_matU, computeU);
+  return *this;
+}
+
+template<typename MatrixType>
+template<typename HessMatrixType, typename OrthMatrixType>
+ComplexSchur<MatrixType>& ComplexSchur<MatrixType>::computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ, bool computeU)
+{
+  m_matT = matrixH;
+  if(computeU)
+    m_matU = matrixQ;
+  reduceToTriangularForm(computeU);
+  return *this;
+}
+namespace internal {
+
+/* Reduce given matrix to Hessenberg form */
+template<typename MatrixType, bool IsComplex>
+struct complex_schur_reduce_to_hessenberg
+{
+  // this is the implementation for the case IsComplex = true
+  static void run(ComplexSchur<MatrixType>& _this, const MatrixType& matrix, bool computeU)
+  {
+    _this.m_hess.compute(matrix);
+    _this.m_matT = _this.m_hess.matrixH();
+    if(computeU)  _this.m_matU = _this.m_hess.matrixQ();
+  }
+};
+
+template<typename MatrixType>
+struct complex_schur_reduce_to_hessenberg<MatrixType, false>
+{
+  static void run(ComplexSchur<MatrixType>& _this, const MatrixType& matrix, bool computeU)
+  {
+    typedef typename ComplexSchur<MatrixType>::ComplexScalar ComplexScalar;
+
+    // Note: m_hess is over RealScalar; m_matT and m_matU is over ComplexScalar
+    _this.m_hess.compute(matrix);
+    _this.m_matT = _this.m_hess.matrixH().template cast<ComplexScalar>();
+    if(computeU)  
+    {
+      // This may cause an allocation which seems to be avoidable
+      MatrixType Q = _this.m_hess.matrixQ(); 
+      _this.m_matU = Q.template cast<ComplexScalar>();
+    }
+  }
+};
+
+} // end namespace internal
+
+// Reduce the Hessenberg matrix m_matT to triangular form by QR iteration.
+template<typename MatrixType>
+void ComplexSchur<MatrixType>::reduceToTriangularForm(bool computeU)
+{  
+  Index maxIters = m_maxIters;
+  if (maxIters == -1)
+    maxIters = m_maxIterationsPerRow * m_matT.rows();
+
+  // The matrix m_matT is divided in three parts. 
+  // Rows 0,...,il-1 are decoupled from the rest because m_matT(il,il-1) is zero. 
+  // Rows il,...,iu is the part we are working on (the active submatrix).
+  // Rows iu+1,...,end are already brought in triangular form.
+  Index iu = m_matT.cols() - 1;
+  Index il;
+  Index iter = 0; // number of iterations we are working on the (iu,iu) element
+  Index totalIter = 0; // number of iterations for whole matrix
+
+  while(true)
+  {
+    // find iu, the bottom row of the active submatrix
+    while(iu > 0)
+    {
+      if(!subdiagonalEntryIsNeglegible(iu-1)) break;
+      iter = 0;
+      --iu;
+    }
+
+    // if iu is zero then we are done; the whole matrix is triangularized
+    if(iu==0) break;
+
+    // if we spent too many iterations, we give up
+    iter++;
+    totalIter++;
+    if(totalIter > maxIters) break;
+
+    // find il, the top row of the active submatrix
+    il = iu-1;
+    while(il > 0 && !subdiagonalEntryIsNeglegible(il-1))
+    {
+      --il;
+    }
+
+    /* perform the QR step using Givens rotations. The first rotation
+       creates a bulge; the (il+2,il) element becomes nonzero. This
+       bulge is chased down to the bottom of the active submatrix. */
+
+    ComplexScalar shift = computeShift(iu, iter);
+    JacobiRotation<ComplexScalar> rot;
+    rot.makeGivens(m_matT.coeff(il,il) - shift, m_matT.coeff(il+1,il));
+    m_matT.rightCols(m_matT.cols()-il).applyOnTheLeft(il, il+1, rot.adjoint());
+    m_matT.topRows((std::min)(il+2,iu)+1).applyOnTheRight(il, il+1, rot);
+    if(computeU) m_matU.applyOnTheRight(il, il+1, rot);
+
+    for(Index i=il+1 ; i<iu ; i++)
+    {
+      rot.makeGivens(m_matT.coeffRef(i,i-1), m_matT.coeffRef(i+1,i-1), &m_matT.coeffRef(i,i-1));
+      m_matT.coeffRef(i+1,i-1) = ComplexScalar(0);
+      m_matT.rightCols(m_matT.cols()-i).applyOnTheLeft(i, i+1, rot.adjoint());
+      m_matT.topRows((std::min)(i+2,iu)+1).applyOnTheRight(i, i+1, rot);
+      if(computeU) m_matU.applyOnTheRight(i, i+1, rot);
+    }
+  }
+
+  if(totalIter <= maxIters)
+    m_info = Success;
+  else
+    m_info = NoConvergence;
+
+  m_isInitialized = true;
+  m_matUisUptodate = computeU;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_SCHUR_H
diff --git a/include/eigen3/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/include/eigen3/Eigen/src/Eigenvalues/ComplexSchur_MKL.h
new file mode 100644
index 0000000..91496ae
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigenvalues/ComplexSchur_MKL.h
@@ -0,0 +1,94 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *    Complex Schur needed to complex unsymmetrical eigenvalues/eigenvectors.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_COMPLEX_SCHUR_MKL_H
+#define EIGEN_COMPLEX_SCHUR_MKL_H
+
+#include "Eigen/src/Core/util/MKL_support.h"
+
+namespace Eigen { 
+
+/** \internal Specialization for the data types supported by MKL */
+
+#define EIGEN_MKL_SCHUR_COMPLEX(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \
+template<> inline \
+ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
+ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW>& matrix, bool computeU) \
+{ \
+  typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> MatrixType; \
+  typedef MatrixType::Scalar Scalar; \
+  typedef MatrixType::RealScalar RealScalar; \
+  typedef std::complex<RealScalar> ComplexScalar; \
+\
+  eigen_assert(matrix.cols() == matrix.rows()); \
+\
+  m_matUisUptodate = false; \
+  if(matrix.cols() == 1) \
+  { \
+    m_matT = matrix.cast<ComplexScalar>(); \
+    if(computeU)  m_matU = ComplexMatrixType::Identity(1,1); \
+      m_info = Success; \
+      m_isInitialized = true; \
+      m_matUisUptodate = computeU; \
+      return *this; \
+  } \
+  lapack_int n = matrix.cols(), sdim, info; \
+  lapack_int lda = matrix.outerStride(); \
+  lapack_int matrix_order = MKLCOLROW; \
+  char jobvs, sort='N'; \
+  LAPACK_##MKLPREFIX_U##_SELECT1 select = 0; \
+  jobvs = (computeU) ? 'V' : 'N'; \
+  m_matU.resize(n, n); \
+  lapack_int ldvs  = m_matU.outerStride(); \
+  m_matT = matrix; \
+  Matrix<EIGTYPE, Dynamic, Dynamic> w; \
+  w.resize(n, 1);\
+  info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)w.data(), (MKLTYPE*)m_matU.data(), ldvs ); \
+  if(info == 0) \
+    m_info = Success; \
+  else \
+    m_info = NoConvergence; \
+\
+  m_isInitialized = true; \
+  m_matUisUptodate = computeU; \
+  return *this; \
+\
+}
+
+EIGEN_MKL_SCHUR_COMPLEX(dcomplex, MKL_Complex16, z, Z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_MKL_SCHUR_COMPLEX(scomplex, MKL_Complex8,  c, C, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_MKL_SCHUR_COMPLEX(dcomplex, MKL_Complex16, z, Z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_MKL_SCHUR_COMPLEX(scomplex, MKL_Complex8,  c, C, RowMajor, LAPACK_ROW_MAJOR)
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_SCHUR_MKL_H
diff --git a/include/eigen3/Eigen/src/Eigenvalues/EigenSolver.h b/include/eigen3/Eigen/src/Eigenvalues/EigenSolver.h
new file mode 100644
index 0000000..6e71506
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigenvalues/EigenSolver.h
@@ -0,0 +1,598 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2010,2012 Jitse Niesen <jitse at maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EIGENSOLVER_H
+#define EIGEN_EIGENSOLVER_H
+
+#include "./RealSchur.h"
+
+namespace Eigen { 
+
+/** \eigenvalues_module \ingroup Eigenvalues_Module
+  *
+  *
+  * \class EigenSolver
+  *
+  * \brief Computes eigenvalues and eigenvectors of general matrices
+  *
+  * \tparam _MatrixType the type of the matrix of which we are computing the
+  * eigendecomposition; this is expected to be an instantiation of the Matrix
+  * class template. Currently, only real matrices are supported.
+  *
+  * The eigenvalues and eigenvectors of a matrix \f$ A \f$ are scalars
+  * \f$ \lambda \f$ and vectors \f$ v \f$ such that \f$ Av = \lambda v \f$.  If
+  * \f$ D \f$ is a diagonal matrix with the eigenvalues on the diagonal, and
+  * \f$ V \f$ is a matrix with the eigenvectors as its columns, then \f$ A V =
+  * V D \f$. The matrix \f$ V \f$ is almost always invertible, in which case we
+  * have \f$ A = V D V^{-1} \f$. This is called the eigendecomposition.
+  *
+  * The eigenvalues and eigenvectors of a matrix may be complex, even when the
+  * matrix is real. However, we can choose real matrices \f$ V \f$ and \f$ D
+  * \f$ satisfying \f$ A V = V D \f$, just like the eigendecomposition, if the
+  * matrix \f$ D \f$ is not required to be diagonal, but if it is allowed to
+  * have blocks of the form
+  * \f[ \begin{bmatrix} u & v \\ -v & u \end{bmatrix} \f]
+  * (where \f$ u \f$ and \f$ v \f$ are real numbers) on the diagonal.  These
+  * blocks correspond to complex eigenvalue pairs \f$ u \pm iv \f$. We call
+  * this variant of the eigendecomposition the pseudo-eigendecomposition.
+  *
+  * Call the function compute() to compute the eigenvalues and eigenvectors of
+  * a given matrix. Alternatively, you can use the 
+  * EigenSolver(const MatrixType&, bool) constructor which computes the
+  * eigenvalues and eigenvectors at construction time. Once the eigenvalue and
+  * eigenvectors are computed, they can be retrieved with the eigenvalues() and
+  * eigenvectors() functions. The pseudoEigenvalueMatrix() and
+  * pseudoEigenvectors() methods allow the construction of the
+  * pseudo-eigendecomposition.
+  *
+  * The documentation for EigenSolver(const MatrixType&, bool) contains an
+  * example of the typical use of this class.
+  *
+  * \note The implementation is adapted from
+  * <a href="http://math.nist.gov/javanumerics/jama/">JAMA</a> (public domain).
+  * Their code is based on EISPACK.
+  *
+  * \sa MatrixBase::eigenvalues(), class ComplexEigenSolver, class SelfAdjointEigenSolver
+  */
+template<typename _MatrixType> class EigenSolver
+{
+  public:
+
+    /** \brief Synonym for the template parameter \p _MatrixType. */
+    typedef _MatrixType MatrixType;
+
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      Options = MatrixType::Options,
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+
+    /** \brief Scalar type for matrices of type #MatrixType. */
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef typename MatrixType::Index Index;
+
+    /** \brief Complex scalar type for #MatrixType. 
+      *
+      * This is \c std::complex<Scalar> if #Scalar is real (e.g.,
+      * \c float or \c double) and just \c Scalar if #Scalar is
+      * complex.
+      */
+    typedef std::complex<RealScalar> ComplexScalar;
+
+    /** \brief Type for vector of eigenvalues as returned by eigenvalues(). 
+      *
+      * This is a column vector with entries of type #ComplexScalar.
+      * The length of the vector is the size of #MatrixType.
+      */
+    typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
+
+    /** \brief Type for matrix of eigenvectors as returned by eigenvectors(). 
+      *
+      * This is a square matrix with entries of type #ComplexScalar. 
+      * The size is the same as the size of #MatrixType.
+      */
+    typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, Options, MaxRowsAtCompileTime, MaxColsAtCompileTime> EigenvectorsType;
+
+    /** \brief Default constructor.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via EigenSolver::compute(const MatrixType&, bool).
+      *
+      * \sa compute() for an example.
+      */
+ EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_realSchur(), m_matT(), m_tmp() {}
+
+    /** \brief Default constructor with memory preallocation
+      *
+      * Like the default constructor but with preallocation of the internal data
+      * according to the specified problem \a size.
+      * \sa EigenSolver()
+      */
+    EigenSolver(Index size)
+      : m_eivec(size, size),
+        m_eivalues(size),
+        m_isInitialized(false),
+        m_eigenvectorsOk(false),
+        m_realSchur(size),
+        m_matT(size, size), 
+        m_tmp(size)
+    {}
+
+    /** \brief Constructor; computes eigendecomposition of given matrix. 
+      * 
+      * \param[in]  matrix  Square matrix whose eigendecomposition is to be computed.
+      * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
+      *    eigenvalues are computed; if false, only the eigenvalues are
+      *    computed. 
+      *
+      * This constructor calls compute() to compute the eigenvalues
+      * and eigenvectors.
+      *
+      * Example: \include EigenSolver_EigenSolver_MatrixType.cpp
+      * Output: \verbinclude EigenSolver_EigenSolver_MatrixType.out
+      *
+      * \sa compute()
+      */
+    EigenSolver(const MatrixType& matrix, bool computeEigenvectors = true)
+      : m_eivec(matrix.rows(), matrix.cols()),
+        m_eivalues(matrix.cols()),
+        m_isInitialized(false),
+        m_eigenvectorsOk(false),
+        m_realSchur(matrix.cols()),
+        m_matT(matrix.rows(), matrix.cols()), 
+        m_tmp(matrix.cols())
+    {
+      compute(matrix, computeEigenvectors);
+    }
+
+    /** \brief Returns the eigenvectors of given matrix. 
+      *
+      * \returns  %Matrix whose columns are the (possibly complex) eigenvectors.
+      *
+      * \pre Either the constructor 
+      * EigenSolver(const MatrixType&,bool) or the member function
+      * compute(const MatrixType&, bool) has been called before, and
+      * \p computeEigenvectors was set to true (the default).
+      *
+      * Column \f$ k \f$ of the returned matrix is an eigenvector corresponding
+      * to eigenvalue number \f$ k \f$ as returned by eigenvalues().  The
+      * eigenvectors are normalized to have (Euclidean) norm equal to one. The
+      * matrix returned by this function is the matrix \f$ V \f$ in the
+      * eigendecomposition \f$ A = V D V^{-1} \f$, if it exists.
+      *
+      * Example: \include EigenSolver_eigenvectors.cpp
+      * Output: \verbinclude EigenSolver_eigenvectors.out
+      *
+      * \sa eigenvalues(), pseudoEigenvectors()
+      */
+    EigenvectorsType eigenvectors() const;
+
+    /** \brief Returns the pseudo-eigenvectors of given matrix. 
+      *
+      * \returns  Const reference to matrix whose columns are the pseudo-eigenvectors.
+      *
+      * \pre Either the constructor 
+      * EigenSolver(const MatrixType&,bool) or the member function
+      * compute(const MatrixType&, bool) has been called before, and
+      * \p computeEigenvectors was set to true (the default).
+      *
+      * The real matrix \f$ V \f$ returned by this function and the
+      * block-diagonal matrix \f$ D \f$ returned by pseudoEigenvalueMatrix()
+      * satisfy \f$ AV = VD \f$.
+      *
+      * Example: \include EigenSolver_pseudoEigenvectors.cpp
+      * Output: \verbinclude EigenSolver_pseudoEigenvectors.out
+      *
+      * \sa pseudoEigenvalueMatrix(), eigenvectors()
+      */
+    const MatrixType& pseudoEigenvectors() const
+    {
+      eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+      eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+      return m_eivec;
+    }
+
+    /** \brief Returns the block-diagonal matrix in the pseudo-eigendecomposition.
+      *
+      * \returns  A block-diagonal matrix.
+      *
+      * \pre Either the constructor 
+      * EigenSolver(const MatrixType&,bool) or the member function
+      * compute(const MatrixType&, bool) has been called before.
+      *
+      * The matrix \f$ D \f$ returned by this function is real and
+      * block-diagonal. The blocks on the diagonal are either 1-by-1 or 2-by-2
+      * blocks of the form
+      * \f$ \begin{bmatrix} u & v \\ -v & u \end{bmatrix} \f$.
+      * These blocks are not sorted in any particular order.
+      * The matrix \f$ D \f$ and the matrix \f$ V \f$ returned by
+      * pseudoEigenvectors() satisfy \f$ AV = VD \f$.
+      *
+      * \sa pseudoEigenvectors() for an example, eigenvalues()
+      */
+    MatrixType pseudoEigenvalueMatrix() const;
+
+    /** \brief Returns the eigenvalues of given matrix. 
+      *
+      * \returns A const reference to the column vector containing the eigenvalues.
+      *
+      * \pre Either the constructor 
+      * EigenSolver(const MatrixType&,bool) or the member function
+      * compute(const MatrixType&, bool) has been called before.
+      *
+      * The eigenvalues are repeated according to their algebraic multiplicity,
+      * so there are as many eigenvalues as rows in the matrix. The eigenvalues 
+      * are not sorted in any particular order.
+      *
+      * Example: \include EigenSolver_eigenvalues.cpp
+      * Output: \verbinclude EigenSolver_eigenvalues.out
+      *
+      * \sa eigenvectors(), pseudoEigenvalueMatrix(),
+      *     MatrixBase::eigenvalues()
+      */
+    const EigenvalueType& eigenvalues() const
+    {
+      eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+      return m_eivalues;
+    }
+
+    /** \brief Computes eigendecomposition of given matrix. 
+      * 
+      * \param[in]  matrix  Square matrix whose eigendecomposition is to be computed.
+      * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
+      *    eigenvalues are computed; if false, only the eigenvalues are
+      *    computed. 
+      * \returns    Reference to \c *this
+      *
+      * This function computes the eigenvalues of the real matrix \p matrix.
+      * The eigenvalues() function can be used to retrieve them.  If 
+      * \p computeEigenvectors is true, then the eigenvectors are also computed
+      * and can be retrieved by calling eigenvectors().
+      *
+      * The matrix is first reduced to real Schur form using the RealSchur
+      * class. The Schur decomposition is then used to compute the eigenvalues
+      * and eigenvectors.
+      *
+      * The cost of the computation is dominated by the cost of the
+      * Schur decomposition, which is very approximately \f$ 25n^3 \f$
+      * (where \f$ n \f$ is the size of the matrix) if \p computeEigenvectors 
+      * is true, and \f$ 10n^3 \f$ if \p computeEigenvectors is false.
+      *
+      * This method reuses of the allocated data in the EigenSolver object.
+      *
+      * Example: \include EigenSolver_compute.cpp
+      * Output: \verbinclude EigenSolver_compute.out
+      */
+    EigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true);
+
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+      return m_realSchur.info();
+    }
+
+    /** \brief Sets the maximum number of iterations allowed. */
+    EigenSolver& setMaxIterations(Index maxIters)
+    {
+      m_realSchur.setMaxIterations(maxIters);
+      return *this;
+    }
+
+    /** \brief Returns the maximum number of iterations. */
+    Index getMaxIterations()
+    {
+      return m_realSchur.getMaxIterations();
+    }
+
+  private:
+    void doComputeEigenvectors();
+
+  protected:
+    MatrixType m_eivec;
+    EigenvalueType m_eivalues;
+    bool m_isInitialized;
+    bool m_eigenvectorsOk;
+    RealSchur<MatrixType> m_realSchur;
+    MatrixType m_matT;
+
+    typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
+    ColumnVectorType m_tmp;
+};
+
+template<typename MatrixType>
+MatrixType EigenSolver<MatrixType>::pseudoEigenvalueMatrix() const
+{
+  eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+  Index n = m_eivalues.rows();
+  MatrixType matD = MatrixType::Zero(n,n);
+  for (Index i=0; i<n; ++i)
+  {
+    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(i)), numext::real(m_eivalues.coeff(i))))
+      matD.coeffRef(i,i) = numext::real(m_eivalues.coeff(i));
+    else
+    {
+      matD.template block<2,2>(i,i) <<  numext::real(m_eivalues.coeff(i)), numext::imag(m_eivalues.coeff(i)),
+                                       -numext::imag(m_eivalues.coeff(i)), numext::real(m_eivalues.coeff(i));
+      ++i;
+    }
+  }
+  return matD;
+}
+
+template<typename MatrixType>
+typename EigenSolver<MatrixType>::EigenvectorsType EigenSolver<MatrixType>::eigenvectors() const
+{
+  eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+  eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+  Index n = m_eivec.cols();
+  EigenvectorsType matV(n,n);
+  for (Index j=0; j<n; ++j)
+  {
+    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(j)), numext::real(m_eivalues.coeff(j))) || j+1==n)
+    {
+      // we have a real eigen value
+      matV.col(j) = m_eivec.col(j).template cast<ComplexScalar>();
+      matV.col(j).normalize();
+    }
+    else
+    {
+      // we have a pair of complex eigen values
+      for (Index i=0; i<n; ++i)
+      {
+        matV.coeffRef(i,j)   = ComplexScalar(m_eivec.coeff(i,j),  m_eivec.coeff(i,j+1));
+        matV.coeffRef(i,j+1) = ComplexScalar(m_eivec.coeff(i,j), -m_eivec.coeff(i,j+1));
+      }
+      matV.col(j).normalize();
+      matV.col(j+1).normalize();
+      ++j;
+    }
+  }
+  return matV;
+}
+
+template<typename MatrixType>
+EigenSolver<MatrixType>& 
+EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvectors)
+{
+  using std::sqrt;
+  using std::abs;
+  eigen_assert(matrix.cols() == matrix.rows());
+
+  // Reduce to real Schur form.
+  m_realSchur.compute(matrix, computeEigenvectors);
+
+  if (m_realSchur.info() == Success)
+  {
+    m_matT = m_realSchur.matrixT();
+    if (computeEigenvectors)
+      m_eivec = m_realSchur.matrixU();
+  
+    // Compute eigenvalues from matT
+    m_eivalues.resize(matrix.cols());
+    Index i = 0;
+    while (i < matrix.cols()) 
+    {
+      if (i == matrix.cols() - 1 || m_matT.coeff(i+1, i) == Scalar(0)) 
+      {
+        m_eivalues.coeffRef(i) = m_matT.coeff(i, i);
+        ++i;
+      }
+      else
+      {
+        Scalar p = Scalar(0.5) * (m_matT.coeff(i, i) - m_matT.coeff(i+1, i+1));
+        Scalar z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1)));
+        m_eivalues.coeffRef(i)   = ComplexScalar(m_matT.coeff(i+1, i+1) + p, z);
+        m_eivalues.coeffRef(i+1) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, -z);
+        i += 2;
+      }
+    }
+    
+    // Compute eigenvectors.
+    if (computeEigenvectors)
+      doComputeEigenvectors();
+  }
+
+  m_isInitialized = true;
+  m_eigenvectorsOk = computeEigenvectors;
+
+  return *this;
+}
+
+// Complex scalar division.
+template<typename Scalar>
+std::complex<Scalar> cdiv(const Scalar& xr, const Scalar& xi, const Scalar& yr, const Scalar& yi)
+{
+  using std::abs;
+  Scalar r,d;
+  if (abs(yr) > abs(yi))
+  {
+      r = yi/yr;
+      d = yr + r*yi;
+      return std::complex<Scalar>((xr + r*xi)/d, (xi - r*xr)/d);
+  }
+  else
+  {
+      r = yr/yi;
+      d = yi + r*yr;
+      return std::complex<Scalar>((r*xr + xi)/d, (r*xi - xr)/d);
+  }
+}
+
+
+template<typename MatrixType>
+void EigenSolver<MatrixType>::doComputeEigenvectors()
+{
+  using std::abs;
+  const Index size = m_eivec.cols();
+  const Scalar eps = NumTraits<Scalar>::epsilon();
+
+  // inefficient! this is already computed in RealSchur
+  Scalar norm(0);
+  for (Index j = 0; j < size; ++j)
+  {
+    norm += m_matT.row(j).segment((std::max)(j-1,Index(0)), size-(std::max)(j-1,Index(0))).cwiseAbs().sum();
+  }
+  
+  // Backsubstitute to find vectors of upper triangular form
+  if (norm == 0.0)
+  {
+    return;
+  }
+
+  for (Index n = size-1; n >= 0; n--)
+  {
+    Scalar p = m_eivalues.coeff(n).real();
+    Scalar q = m_eivalues.coeff(n).imag();
+
+    // Scalar vector
+    if (q == Scalar(0))
+    {
+      Scalar lastr(0), lastw(0);
+      Index l = n;
+
+      m_matT.coeffRef(n,n) = 1.0;
+      for (Index i = n-1; i >= 0; i--)
+      {
+        Scalar w = m_matT.coeff(i,i) - p;
+        Scalar r = m_matT.row(i).segment(l,n-l+1).dot(m_matT.col(n).segment(l, n-l+1));
+
+        if (m_eivalues.coeff(i).imag() < 0.0)
+        {
+          lastw = w;
+          lastr = r;
+        }
+        else
+        {
+          l = i;
+          if (m_eivalues.coeff(i).imag() == 0.0)
+          {
+            if (w != 0.0)
+              m_matT.coeffRef(i,n) = -r / w;
+            else
+              m_matT.coeffRef(i,n) = -r / (eps * norm);
+          }
+          else // Solve real equations
+          {
+            Scalar x = m_matT.coeff(i,i+1);
+            Scalar y = m_matT.coeff(i+1,i);
+            Scalar denom = (m_eivalues.coeff(i).real() - p) * (m_eivalues.coeff(i).real() - p) + m_eivalues.coeff(i).imag() * m_eivalues.coeff(i).imag();
+            Scalar t = (x * lastr - lastw * r) / denom;
+            m_matT.coeffRef(i,n) = t;
+            if (abs(x) > abs(lastw))
+              m_matT.coeffRef(i+1,n) = (-r - w * t) / x;
+            else
+              m_matT.coeffRef(i+1,n) = (-lastr - y * t) / lastw;
+          }
+
+          // Overflow control
+          Scalar t = abs(m_matT.coeff(i,n));
+          if ((eps * t) * t > Scalar(1))
+            m_matT.col(n).tail(size-i) /= t;
+        }
+      }
+    }
+    else if (q < Scalar(0) && n > 0) // Complex vector
+    {
+      Scalar lastra(0), lastsa(0), lastw(0);
+      Index l = n-1;
+
+      // Last vector component imaginary so matrix is triangular
+      if (abs(m_matT.coeff(n,n-1)) > abs(m_matT.coeff(n-1,n)))
+      {
+        m_matT.coeffRef(n-1,n-1) = q / m_matT.coeff(n,n-1);
+        m_matT.coeffRef(n-1,n) = -(m_matT.coeff(n,n) - p) / m_matT.coeff(n,n-1);
+      }
+      else
+      {
+        std::complex<Scalar> cc = cdiv<Scalar>(0.0,-m_matT.coeff(n-1,n),m_matT.coeff(n-1,n-1)-p,q);
+        m_matT.coeffRef(n-1,n-1) = numext::real(cc);
+        m_matT.coeffRef(n-1,n) = numext::imag(cc);
+      }
+      m_matT.coeffRef(n,n-1) = 0.0;
+      m_matT.coeffRef(n,n) = 1.0;
+      for (Index i = n-2; i >= 0; i--)
+      {
+        Scalar ra = m_matT.row(i).segment(l, n-l+1).dot(m_matT.col(n-1).segment(l, n-l+1));
+        Scalar sa = m_matT.row(i).segment(l, n-l+1).dot(m_matT.col(n).segment(l, n-l+1));
+        Scalar w = m_matT.coeff(i,i) - p;
+
+        if (m_eivalues.coeff(i).imag() < 0.0)
+        {
+          lastw = w;
+          lastra = ra;
+          lastsa = sa;
+        }
+        else
+        {
+          l = i;
+          if (m_eivalues.coeff(i).imag() == RealScalar(0))
+          {
+            std::complex<Scalar> cc = cdiv(-ra,-sa,w,q);
+            m_matT.coeffRef(i,n-1) = numext::real(cc);
+            m_matT.coeffRef(i,n) = numext::imag(cc);
+          }
+          else
+          {
+            // Solve complex equations
+            Scalar x = m_matT.coeff(i,i+1);
+            Scalar y = m_matT.coeff(i+1,i);
+            Scalar vr = (m_eivalues.coeff(i).real() - p) * (m_eivalues.coeff(i).real() - p) + m_eivalues.coeff(i).imag() * m_eivalues.coeff(i).imag() - q * q;
+            Scalar vi = (m_eivalues.coeff(i).real() - p) * Scalar(2) * q;
+            if ((vr == 0.0) && (vi == 0.0))
+              vr = eps * norm * (abs(w) + abs(q) + abs(x) + abs(y) + abs(lastw));
+
+            std::complex<Scalar> cc = cdiv(x*lastra-lastw*ra+q*sa,x*lastsa-lastw*sa-q*ra,vr,vi);
+            m_matT.coeffRef(i,n-1) = numext::real(cc);
+            m_matT.coeffRef(i,n) = numext::imag(cc);
+            if (abs(x) > (abs(lastw) + abs(q)))
+            {
+              m_matT.coeffRef(i+1,n-1) = (-ra - w * m_matT.coeff(i,n-1) + q * m_matT.coeff(i,n)) / x;
+              m_matT.coeffRef(i+1,n) = (-sa - w * m_matT.coeff(i,n) - q * m_matT.coeff(i,n-1)) / x;
+            }
+            else
+            {
+              cc = cdiv(-lastra-y*m_matT.coeff(i,n-1),-lastsa-y*m_matT.coeff(i,n),lastw,q);
+              m_matT.coeffRef(i+1,n-1) = numext::real(cc);
+              m_matT.coeffRef(i+1,n) = numext::imag(cc);
+            }
+          }
+
+          // Overflow control
+          using std::max;
+          Scalar t = (max)(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n)));
+          if ((eps * t) * t > Scalar(1))
+            m_matT.block(i, n-1, size-i, 2) /= t;
+
+        }
+      }
+      
+      // We handled a pair of complex conjugate eigenvalues, so need to skip them both
+      n--;
+    }
+    else
+    {
+      eigen_assert(0 && "Internal bug in EigenSolver"); // this should not happen
+    }
+  }
+
+  // Back transformation to get eigenvectors of original matrix
+  for (Index j = size-1; j >= 0; j--)
+  {
+    m_tmp.noalias() = m_eivec.leftCols(j+1) * m_matT.col(j).segment(0, j+1);
+    m_eivec.col(j) = m_tmp;
+  }
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_EIGENSOLVER_H
diff --git a/include/eigen3/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/include/eigen3/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
new file mode 100644
index 0000000..dc240e1
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
@@ -0,0 +1,341 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2010,2012 Jitse Niesen <jitse at maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERALIZEDEIGENSOLVER_H
+#define EIGEN_GENERALIZEDEIGENSOLVER_H
+
+#include "./RealQZ.h"
+
+namespace Eigen { 
+
+/** \eigenvalues_module \ingroup Eigenvalues_Module
+  *
+  *
+  * \class GeneralizedEigenSolver
+  *
+  * \brief Computes the generalized eigenvalues and eigenvectors of a pair of general matrices
+  *
+  * \tparam _MatrixType the type of the matrices of which we are computing the
+  * eigen-decomposition; this is expected to be an instantiation of the Matrix
+  * class template. Currently, only real matrices are supported.
+  *
+  * The generalized eigenvalues and eigenvectors of a matrix pair \f$ A \f$ and \f$ B \f$ are scalars
+  * \f$ \lambda \f$ and vectors \f$ v \f$ such that \f$ Av = \lambda Bv \f$.  If
+  * \f$ D \f$ is a diagonal matrix with the eigenvalues on the diagonal, and
+  * \f$ V \f$ is a matrix with the eigenvectors as its columns, then \f$ A V =
+  * B V D \f$. The matrix \f$ V \f$ is almost always invertible, in which case we
+  * have \f$ A = B V D V^{-1} \f$. This is called the generalized eigen-decomposition.
+  *
+  * The generalized eigenvalues and eigenvectors of a matrix pair may be complex, even when the
+  * matrices are real. Moreover, the generalized eigenvalue might be infinite if the matrix B is
+  * singular. To workaround this difficulty, the eigenvalues are provided as a pair of complex \f$ \alpha \f$
+  * and real \f$ \beta \f$ such that: \f$ \lambda_i = \alpha_i / \beta_i \f$. If \f$ \beta_i \f$ is (nearly) zero,
+  * then one can consider the well defined left eigenvalue \f$ \mu = \beta_i / \alpha_i\f$ such that:
+  * \f$ \mu_i A v_i = B v_i \f$, or even \f$ \mu_i u_i^T A  = u_i^T B \f$ where \f$ u_i \f$ is
+  * called the left eigenvector.
+  *
+  * Call the function compute() to compute the generalized eigenvalues and eigenvectors of
+  * a given matrix pair. Alternatively, you can use the
+  * GeneralizedEigenSolver(const MatrixType&, const MatrixType&, bool) constructor which computes the
+  * eigenvalues and eigenvectors at construction time. Once the eigenvalue and
+  * eigenvectors are computed, they can be retrieved with the eigenvalues() and
+  * eigenvectors() functions.
+  *
+  * Here is an usage example of this class:
+  * Example: \include GeneralizedEigenSolver.cpp
+  * Output: \verbinclude GeneralizedEigenSolver.out
+  *
+  * \sa MatrixBase::eigenvalues(), class ComplexEigenSolver, class SelfAdjointEigenSolver
+  */
+template<typename _MatrixType> class GeneralizedEigenSolver
+{
+  public:
+
+    /** \brief Synonym for the template parameter \p _MatrixType. */
+    typedef _MatrixType MatrixType;
+
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      Options = MatrixType::Options,
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+
+    /** \brief Scalar type for matrices of type #MatrixType. */
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef typename MatrixType::Index Index;
+
+    /** \brief Complex scalar type for #MatrixType. 
+      *
+      * This is \c std::complex<Scalar> if #Scalar is real (e.g.,
+      * \c float or \c double) and just \c Scalar if #Scalar is
+      * complex.
+      */
+    typedef std::complex<RealScalar> ComplexScalar;
+
+    /** \brief Type for vector of real scalar values eigenvalues as returned by betas().
+      *
+      * This is a column vector with entries of type #Scalar.
+      * The length of the vector is the size of #MatrixType.
+      */
+    typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> VectorType;
+
+    /** \brief Type for vector of complex scalar values eigenvalues as returned by betas().
+      *
+      * This is a column vector with entries of type #ComplexScalar.
+      * The length of the vector is the size of #MatrixType.
+      */
+    typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ComplexVectorType;
+
+    /** \brief Expression type for the eigenvalues as returned by eigenvalues().
+      */
+    typedef CwiseBinaryOp<internal::scalar_quotient_op<ComplexScalar,Scalar>,ComplexVectorType,VectorType> EigenvalueType;
+
+    /** \brief Type for matrix of eigenvectors as returned by eigenvectors(). 
+      *
+      * This is a square matrix with entries of type #ComplexScalar. 
+      * The size is the same as the size of #MatrixType.
+      */
+    typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, Options, MaxRowsAtCompileTime, MaxColsAtCompileTime> EigenvectorsType;
+
+    /** \brief Default constructor.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via EigenSolver::compute(const MatrixType&, bool).
+      *
+      * \sa compute() for an example.
+      */
+    GeneralizedEigenSolver() : m_eivec(), m_alphas(), m_betas(), m_isInitialized(false), m_realQZ(), m_matS(), m_tmp() {}
+
+    /** \brief Default constructor with memory preallocation
+      *
+      * Like the default constructor but with preallocation of the internal data
+      * according to the specified problem \a size.
+      * \sa GeneralizedEigenSolver()
+      */
+    GeneralizedEigenSolver(Index size)
+      : m_eivec(size, size),
+        m_alphas(size),
+        m_betas(size),
+        m_isInitialized(false),
+        m_eigenvectorsOk(false),
+        m_realQZ(size),
+        m_matS(size, size),
+        m_tmp(size)
+    {}
+
+    /** \brief Constructor; computes the generalized eigendecomposition of given matrix pair.
+      * 
+      * \param[in]  A  Square matrix whose eigendecomposition is to be computed.
+      * \param[in]  B  Square matrix whose eigendecomposition is to be computed.
+      * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
+      *    eigenvalues are computed; if false, only the eigenvalues are computed.
+      *
+      * This constructor calls compute() to compute the generalized eigenvalues
+      * and eigenvectors.
+      *
+      * \sa compute()
+      */
+    GeneralizedEigenSolver(const MatrixType& A, const MatrixType& B, bool computeEigenvectors = true)
+      : m_eivec(A.rows(), A.cols()),
+        m_alphas(A.cols()),
+        m_betas(A.cols()),
+        m_isInitialized(false),
+        m_eigenvectorsOk(false),
+        m_realQZ(A.cols()),
+        m_matS(A.rows(), A.cols()),
+        m_tmp(A.cols())
+    {
+      compute(A, B, computeEigenvectors);
+    }
+
+    /* \brief Returns the computed generalized eigenvectors.
+      *
+      * \returns  %Matrix whose columns are the (possibly complex) eigenvectors.
+      *
+      * \pre Either the constructor 
+      * GeneralizedEigenSolver(const MatrixType&,const MatrixType&, bool) or the member function
+      * compute(const MatrixType&, const MatrixType& bool) has been called before, and
+      * \p computeEigenvectors was set to true (the default).
+      *
+      * Column \f$ k \f$ of the returned matrix is an eigenvector corresponding
+      * to eigenvalue number \f$ k \f$ as returned by eigenvalues().  The
+      * eigenvectors are normalized to have (Euclidean) norm equal to one. The
+      * matrix returned by this function is the matrix \f$ V \f$ in the
+      * generalized eigendecomposition \f$ A = B V D V^{-1} \f$, if it exists.
+      *
+      * \sa eigenvalues()
+      */
+//    EigenvectorsType eigenvectors() const;
+
+    /** \brief Returns an expression of the computed generalized eigenvalues.
+      *
+      * \returns An expression of the column vector containing the eigenvalues.
+      *
+      * It is a shortcut for \code this->alphas().cwiseQuotient(this->betas()); \endcode
+      * Not that betas might contain zeros. It is therefore not recommended to use this function,
+      * but rather directly deal with the alphas and betas vectors.
+      *
+      * \pre Either the constructor 
+      * GeneralizedEigenSolver(const MatrixType&,const MatrixType&,bool) or the member function
+      * compute(const MatrixType&,const MatrixType&,bool) has been called before.
+      *
+      * The eigenvalues are repeated according to their algebraic multiplicity,
+      * so there are as many eigenvalues as rows in the matrix. The eigenvalues 
+      * are not sorted in any particular order.
+      *
+      * \sa alphas(), betas(), eigenvectors()
+      */
+    EigenvalueType eigenvalues() const
+    {
+      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
+      return EigenvalueType(m_alphas,m_betas);
+    }
+
+    /** \returns A const reference to the vectors containing the alpha values
+      *
+      * This vector permits to reconstruct the j-th eigenvalues as alphas(i)/betas(j).
+      *
+      * \sa betas(), eigenvalues() */
+    ComplexVectorType alphas() const
+    {
+      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
+      return m_alphas;
+    }
+
+    /** \returns A const reference to the vectors containing the beta values
+      *
+      * This vector permits to reconstruct the j-th eigenvalues as alphas(i)/betas(j).
+      *
+      * \sa alphas(), eigenvalues() */
+    VectorType betas() const
+    {
+      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
+      return m_betas;
+    }
+
+    /** \brief Computes generalized eigendecomposition of given matrix.
+      * 
+      * \param[in]  A  Square matrix whose eigendecomposition is to be computed.
+      * \param[in]  B  Square matrix whose eigendecomposition is to be computed.
+      * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
+      *    eigenvalues are computed; if false, only the eigenvalues are
+      *    computed. 
+      * \returns    Reference to \c *this
+      *
+      * This function computes the eigenvalues of the real matrix \p matrix.
+      * The eigenvalues() function can be used to retrieve them.  If 
+      * \p computeEigenvectors is true, then the eigenvectors are also computed
+      * and can be retrieved by calling eigenvectors().
+      *
+      * The matrix is first reduced to real generalized Schur form using the RealQZ
+      * class. The generalized Schur decomposition is then used to compute the eigenvalues
+      * and eigenvectors.
+      *
+      * The cost of the computation is dominated by the cost of the
+      * generalized Schur decomposition.
+      *
+      * This method reuses of the allocated data in the GeneralizedEigenSolver object.
+      */
+    GeneralizedEigenSolver& compute(const MatrixType& A, const MatrixType& B, bool computeEigenvectors = true);
+
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+      return m_realQZ.info();
+    }
+
+    /** Sets the maximal number of iterations allowed.
+    */
+    GeneralizedEigenSolver& setMaxIterations(Index maxIters)
+    {
+      m_realQZ.setMaxIterations(maxIters);
+      return *this;
+    }
+
+  protected:
+    MatrixType m_eivec;
+    ComplexVectorType m_alphas;
+    VectorType m_betas;
+    bool m_isInitialized;
+    bool m_eigenvectorsOk;
+    RealQZ<MatrixType> m_realQZ;
+    MatrixType m_matS;
+
+    typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
+    ColumnVectorType m_tmp;
+};
+
+//template<typename MatrixType>
+//typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType GeneralizedEigenSolver<MatrixType>::eigenvectors() const
+//{
+//  eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+//  eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+//  Index n = m_eivec.cols();
+//  EigenvectorsType matV(n,n);
+//  // TODO
+//  return matV;
+//}
+
+template<typename MatrixType>
+GeneralizedEigenSolver<MatrixType>&
+GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixType& B, bool computeEigenvectors)
+{
+  using std::sqrt;
+  using std::abs;
+  eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows());
+
+  // Reduce to generalized real Schur form:
+  // A = Q S Z and B = Q T Z
+  m_realQZ.compute(A, B, computeEigenvectors);
+
+  if (m_realQZ.info() == Success)
+  {
+    m_matS = m_realQZ.matrixS();
+    if (computeEigenvectors)
+      m_eivec = m_realQZ.matrixZ().transpose();
+  
+    // Compute eigenvalues from matS
+    m_alphas.resize(A.cols());
+    m_betas.resize(A.cols());
+    Index i = 0;
+    while (i < A.cols())
+    {
+      if (i == A.cols() - 1 || m_matS.coeff(i+1, i) == Scalar(0))
+      {
+        m_alphas.coeffRef(i) = m_matS.coeff(i, i);
+        m_betas.coeffRef(i)  = m_realQZ.matrixT().coeff(i,i);
+        ++i;
+      }
+      else
+      {
+        Scalar p = Scalar(0.5) * (m_matS.coeff(i, i) - m_matS.coeff(i+1, i+1));
+        Scalar z = sqrt(abs(p * p + m_matS.coeff(i+1, i) * m_matS.coeff(i, i+1)));
+        m_alphas.coeffRef(i)   = ComplexScalar(m_matS.coeff(i+1, i+1) + p, z);
+        m_alphas.coeffRef(i+1) = ComplexScalar(m_matS.coeff(i+1, i+1) + p, -z);
+
+        m_betas.coeffRef(i)   = m_realQZ.matrixT().coeff(i,i);
+        m_betas.coeffRef(i+1) = m_realQZ.matrixT().coeff(i,i);
+        i += 2;
+      }
+    }
+  }
+
+  m_isInitialized = true;
+  m_eigenvectorsOk = false;//computeEigenvectors;
+
+  return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERALIZEDEIGENSOLVER_H
diff --git a/include/eigen3/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/include/eigen3/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
new file mode 100644
index 0000000..07bf1ea
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
@@ -0,0 +1,227 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2010 Jitse Niesen <jitse at maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERALIZEDSELFADJOINTEIGENSOLVER_H
+#define EIGEN_GENERALIZEDSELFADJOINTEIGENSOLVER_H
+
+#include "./Tridiagonalization.h"
+
+namespace Eigen { 
+
+/** \eigenvalues_module \ingroup Eigenvalues_Module
+  *
+  *
+  * \class GeneralizedSelfAdjointEigenSolver
+  *
+  * \brief Computes eigenvalues and eigenvectors of the generalized selfadjoint eigen problem
+  *
+  * \tparam _MatrixType the type of the matrix of which we are computing the
+  * eigendecomposition; this is expected to be an instantiation of the Matrix
+  * class template.
+  *
+  * This class solves the generalized eigenvalue problem
+  * \f$ Av = \lambda Bv \f$. In this case, the matrix \f$ A \f$ should be
+  * selfadjoint and the matrix \f$ B \f$ should be positive definite.
+  *
+  * Only the \b lower \b triangular \b part of the input matrix is referenced.
+  *
+  * Call the function compute() to compute the eigenvalues and eigenvectors of
+  * a given matrix. Alternatively, you can use the
+  * GeneralizedSelfAdjointEigenSolver(const MatrixType&, const MatrixType&, int)
+  * constructor which computes the eigenvalues and eigenvectors at construction time.
+  * Once the eigenvalue and eigenvectors are computed, they can be retrieved with the eigenvalues()
+  * and eigenvectors() functions.
+  *
+  * The documentation for GeneralizedSelfAdjointEigenSolver(const MatrixType&, const MatrixType&, int)
+  * contains an example of the typical use of this class.
+  *
+  * \sa class SelfAdjointEigenSolver, class EigenSolver, class ComplexEigenSolver
+  */
+template<typename _MatrixType>
+class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixType>
+{
+    typedef SelfAdjointEigenSolver<_MatrixType> Base;
+  public:
+
+    typedef typename Base::Index Index;
+    typedef _MatrixType MatrixType;
+
+    /** \brief Default constructor for fixed-size matrices.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via compute(). This constructor
+      * can only be used if \p _MatrixType is a fixed-size matrix; use
+      * GeneralizedSelfAdjointEigenSolver(Index) for dynamic-size matrices.
+      */
+    GeneralizedSelfAdjointEigenSolver() : Base() {}
+
+    /** \brief Constructor, pre-allocates memory for dynamic-size matrices.
+      *
+      * \param [in]  size  Positive integer, size of the matrix whose
+      * eigenvalues and eigenvectors will be computed.
+      *
+      * This constructor is useful for dynamic-size matrices, when the user
+      * intends to perform decompositions via compute(). The \p size
+      * parameter is only used as a hint. It is not an error to give a wrong
+      * \p size, but it may impair performance.
+      *
+      * \sa compute() for an example
+      */
+    GeneralizedSelfAdjointEigenSolver(Index size)
+        : Base(size)
+    {}
+
+    /** \brief Constructor; computes generalized eigendecomposition of given matrix pencil.
+      *
+      * \param[in]  matA  Selfadjoint matrix in matrix pencil.
+      *                   Only the lower triangular part of the matrix is referenced.
+      * \param[in]  matB  Positive-definite matrix in matrix pencil.
+      *                   Only the lower triangular part of the matrix is referenced.
+      * \param[in]  options A or-ed set of flags {#ComputeEigenvectors,#EigenvaluesOnly} | {#Ax_lBx,#ABx_lx,#BAx_lx}.
+      *                     Default is #ComputeEigenvectors|#Ax_lBx.
+      *
+      * This constructor calls compute(const MatrixType&, const MatrixType&, int)
+      * to compute the eigenvalues and (if requested) the eigenvectors of the
+      * generalized eigenproblem \f$ Ax = \lambda B x \f$ with \a matA the
+      * selfadjoint matrix \f$ A \f$ and \a matB the positive definite matrix
+      * \f$ B \f$. Each eigenvector \f$ x \f$ satisfies the property
+      * \f$ x^* B x = 1 \f$. The eigenvectors are computed if
+      * \a options contains ComputeEigenvectors.
+      *
+      * In addition, the two following variants can be solved via \p options:
+      * - \c ABx_lx: \f$ ABx = \lambda x \f$
+      * - \c BAx_lx: \f$ BAx = \lambda x \f$
+      *
+      * Example: \include SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType2.cpp
+      * Output: \verbinclude SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType2.out
+      *
+      * \sa compute(const MatrixType&, const MatrixType&, int)
+      */
+    GeneralizedSelfAdjointEigenSolver(const MatrixType& matA, const MatrixType& matB,
+                                      int options = ComputeEigenvectors|Ax_lBx)
+      : Base(matA.cols())
+    {
+      compute(matA, matB, options);
+    }
+
+    /** \brief Computes generalized eigendecomposition of given matrix pencil.
+      *
+      * \param[in]  matA  Selfadjoint matrix in matrix pencil.
+      *                   Only the lower triangular part of the matrix is referenced.
+      * \param[in]  matB  Positive-definite matrix in matrix pencil.
+      *                   Only the lower triangular part of the matrix is referenced.
+      * \param[in]  options A or-ed set of flags {#ComputeEigenvectors,#EigenvaluesOnly} | {#Ax_lBx,#ABx_lx,#BAx_lx}.
+      *                     Default is #ComputeEigenvectors|#Ax_lBx.
+      *
+      * \returns    Reference to \c *this
+      *
+      * Accoring to \p options, this function computes eigenvalues and (if requested)
+      * the eigenvectors of one of the following three generalized eigenproblems:
+      * - \c Ax_lBx: \f$ Ax = \lambda B x \f$
+      * - \c ABx_lx: \f$ ABx = \lambda x \f$
+      * - \c BAx_lx: \f$ BAx = \lambda x \f$
+      * with \a matA the selfadjoint matrix \f$ A \f$ and \a matB the positive definite
+      * matrix \f$ B \f$.
+      * In addition, each eigenvector \f$ x \f$ satisfies the property \f$ x^* B x = 1 \f$.
+      *
+      * The eigenvalues() function can be used to retrieve
+      * the eigenvalues. If \p options contains ComputeEigenvectors, then the
+      * eigenvectors are also computed and can be retrieved by calling
+      * eigenvectors().
+      *
+      * The implementation uses LLT to compute the Cholesky decomposition
+      * \f$ B = LL^* \f$ and computes the classical eigendecomposition
+      * of the selfadjoint matrix \f$ L^{-1} A (L^*)^{-1} \f$ if \p options contains Ax_lBx
+      * and of \f$ L^{*} A L \f$ otherwise. This solves the
+      * generalized eigenproblem, because any solution of the generalized
+      * eigenproblem \f$ Ax = \lambda B x \f$ corresponds to a solution
+      * \f$ L^{-1} A (L^*)^{-1} (L^* x) = \lambda (L^* x) \f$ of the
+      * eigenproblem for \f$ L^{-1} A (L^*)^{-1} \f$. Similar statements
+      * can be made for the two other variants.
+      *
+      * Example: \include SelfAdjointEigenSolver_compute_MatrixType2.cpp
+      * Output: \verbinclude SelfAdjointEigenSolver_compute_MatrixType2.out
+      *
+      * \sa GeneralizedSelfAdjointEigenSolver(const MatrixType&, const MatrixType&, int)
+      */
+    GeneralizedSelfAdjointEigenSolver& compute(const MatrixType& matA, const MatrixType& matB,
+                                               int options = ComputeEigenvectors|Ax_lBx);
+
+  protected:
+
+};
+
+
+template<typename MatrixType>
+GeneralizedSelfAdjointEigenSolver<MatrixType>& GeneralizedSelfAdjointEigenSolver<MatrixType>::
+compute(const MatrixType& matA, const MatrixType& matB, int options)
+{
+  eigen_assert(matA.cols()==matA.rows() && matB.rows()==matA.rows() && matB.cols()==matB.rows());
+  eigen_assert((options&~(EigVecMask|GenEigMask))==0
+          && (options&EigVecMask)!=EigVecMask
+          && ((options&GenEigMask)==0 || (options&GenEigMask)==Ax_lBx
+           || (options&GenEigMask)==ABx_lx || (options&GenEigMask)==BAx_lx)
+          && "invalid option parameter");
+
+  bool computeEigVecs = ((options&EigVecMask)==0) || ((options&EigVecMask)==ComputeEigenvectors);
+
+  // Compute the cholesky decomposition of matB = L L' = U'U
+  LLT<MatrixType> cholB(matB);
+
+  int type = (options&GenEigMask);
+  if(type==0)
+    type = Ax_lBx;
+
+  if(type==Ax_lBx)
+  {
+    // compute C = inv(L) A inv(L')
+    MatrixType matC = matA.template selfadjointView<Lower>();
+    cholB.matrixL().template solveInPlace<OnTheLeft>(matC);
+    cholB.matrixU().template solveInPlace<OnTheRight>(matC);
+
+    Base::compute(matC, computeEigVecs ? ComputeEigenvectors : EigenvaluesOnly );
+
+    // transform back the eigen vectors: evecs = inv(U) * evecs
+    if(computeEigVecs)
+      cholB.matrixU().solveInPlace(Base::m_eivec);
+  }
+  else if(type==ABx_lx)
+  {
+    // compute C = L' A L
+    MatrixType matC = matA.template selfadjointView<Lower>();
+    matC = matC * cholB.matrixL();
+    matC = cholB.matrixU() * matC;
+
+    Base::compute(matC, computeEigVecs ? ComputeEigenvectors : EigenvaluesOnly);
+
+    // transform back the eigen vectors: evecs = inv(U) * evecs
+    if(computeEigVecs)
+      cholB.matrixU().solveInPlace(Base::m_eivec);
+  }
+  else if(type==BAx_lx)
+  {
+    // compute C = L' A L
+    MatrixType matC = matA.template selfadjointView<Lower>();
+    matC = matC * cholB.matrixL();
+    matC = cholB.matrixU() * matC;
+
+    Base::compute(matC, computeEigVecs ? ComputeEigenvectors : EigenvaluesOnly);
+
+    // transform back the eigen vectors: evecs = L * evecs
+    if(computeEigVecs)
+      Base::m_eivec = cholB.matrixL() * Base::m_eivec;
+  }
+
+  return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERALIZEDSELFADJOINTEIGENSOLVER_H
diff --git a/include/eigen3/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/include/eigen3/Eigen/src/Eigenvalues/HessenbergDecomposition.h
new file mode 100644
index 0000000..3db0c01
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigenvalues/HessenbergDecomposition.h
@@ -0,0 +1,373 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2010 Jitse Niesen <jitse at maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_HESSENBERGDECOMPOSITION_H
+#define EIGEN_HESSENBERGDECOMPOSITION_H
+
+namespace Eigen { 
+
+namespace internal {
+  
+template<typename MatrixType> struct HessenbergDecompositionMatrixHReturnType;
+template<typename MatrixType>
+struct traits<HessenbergDecompositionMatrixHReturnType<MatrixType> >
+{
+  typedef MatrixType ReturnType;
+};
+
+}
+
+/** \eigenvalues_module \ingroup Eigenvalues_Module
+  *
+  *
+  * \class HessenbergDecomposition
+  *
+  * \brief Reduces a square matrix to Hessenberg form by an orthogonal similarity transformation
+  *
+  * \tparam _MatrixType the type of the matrix of which we are computing the Hessenberg decomposition
+  *
+  * This class performs an Hessenberg decomposition of a matrix \f$ A \f$. In
+  * the real case, the Hessenberg decomposition consists of an orthogonal
+  * matrix \f$ Q \f$ and a Hessenberg matrix \f$ H \f$ such that \f$ A = Q H
+  * Q^T \f$. An orthogonal matrix is a matrix whose inverse equals its
+  * transpose (\f$ Q^{-1} = Q^T \f$). A Hessenberg matrix has zeros below the
+  * subdiagonal, so it is almost upper triangular. The Hessenberg decomposition
+  * of a complex matrix is \f$ A = Q H Q^* \f$ with \f$ Q \f$ unitary (that is,
+  * \f$ Q^{-1} = Q^* \f$).
+  *
+  * Call the function compute() to compute the Hessenberg decomposition of a
+  * given matrix. Alternatively, you can use the
+  * HessenbergDecomposition(const MatrixType&) constructor which computes the
+  * Hessenberg decomposition at construction time. Once the decomposition is
+  * computed, you can use the matrixH() and matrixQ() functions to construct
+  * the matrices H and Q in the decomposition.
+  *
+  * The documentation for matrixH() contains an example of the typical use of
+  * this class.
+  *
+  * \sa class ComplexSchur, class Tridiagonalization, \ref QR_Module "QR Module"
+  */
+template<typename _MatrixType> class HessenbergDecomposition
+{
+  public:
+
+    /** \brief Synonym for the template parameter \p _MatrixType. */
+    typedef _MatrixType MatrixType;
+
+    enum {
+      Size = MatrixType::RowsAtCompileTime,
+      SizeMinusOne = Size == Dynamic ? Dynamic : Size - 1,
+      Options = MatrixType::Options,
+      MaxSize = MatrixType::MaxRowsAtCompileTime,
+      MaxSizeMinusOne = MaxSize == Dynamic ? Dynamic : MaxSize - 1
+    };
+
+    /** \brief Scalar type for matrices of type #MatrixType. */
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::Index Index;
+
+    /** \brief Type for vector of Householder coefficients.
+      *
+      * This is column vector with entries of type #Scalar. The length of the
+      * vector is one less than the size of #MatrixType, if it is a fixed-side
+      * type.
+      */
+    typedef Matrix<Scalar, SizeMinusOne, 1, Options & ~RowMajor, MaxSizeMinusOne, 1> CoeffVectorType;
+
+    /** \brief Return type of matrixQ() */
+    typedef HouseholderSequence<MatrixType,typename internal::remove_all<typename CoeffVectorType::ConjugateReturnType>::type> HouseholderSequenceType;
+    
+    typedef internal::HessenbergDecompositionMatrixHReturnType<MatrixType> MatrixHReturnType;
+
+    /** \brief Default constructor; the decomposition will be computed later.
+      *
+      * \param [in] size  The size of the matrix whose Hessenberg decomposition will be computed.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via compute().  The \p size parameter is only
+      * used as a hint. It is not an error to give a wrong \p size, but it may
+      * impair performance.
+      *
+      * \sa compute() for an example.
+      */
+    HessenbergDecomposition(Index size = Size==Dynamic ? 2 : Size)
+      : m_matrix(size,size),
+        m_temp(size),
+        m_isInitialized(false)
+    {
+      if(size>1)
+        m_hCoeffs.resize(size-1);
+    }
+
+    /** \brief Constructor; computes Hessenberg decomposition of given matrix.
+      *
+      * \param[in]  matrix  Square matrix whose Hessenberg decomposition is to be computed.
+      *
+      * This constructor calls compute() to compute the Hessenberg
+      * decomposition.
+      *
+      * \sa matrixH() for an example.
+      */
+    HessenbergDecomposition(const MatrixType& matrix)
+      : m_matrix(matrix),
+        m_temp(matrix.rows()),
+        m_isInitialized(false)
+    {
+      if(matrix.rows()<2)
+      {
+        m_isInitialized = true;
+        return;
+      }
+      m_hCoeffs.resize(matrix.rows()-1,1);
+      _compute(m_matrix, m_hCoeffs, m_temp);
+      m_isInitialized = true;
+    }
+
+    /** \brief Computes Hessenberg decomposition of given matrix.
+      *
+      * \param[in]  matrix  Square matrix whose Hessenberg decomposition is to be computed.
+      * \returns    Reference to \c *this
+      *
+      * The Hessenberg decomposition is computed by bringing the columns of the
+      * matrix successively in the required form using Householder reflections
+      * (see, e.g., Algorithm 7.4.2 in Golub \& Van Loan, <i>%Matrix
+      * Computations</i>). The cost is \f$ 10n^3/3 \f$ flops, where \f$ n \f$
+      * denotes the size of the given matrix.
+      *
+      * This method reuses of the allocated data in the HessenbergDecomposition
+      * object.
+      *
+      * Example: \include HessenbergDecomposition_compute.cpp
+      * Output: \verbinclude HessenbergDecomposition_compute.out
+      */
+    HessenbergDecomposition& compute(const MatrixType& matrix)
+    {
+      m_matrix = matrix;
+      if(matrix.rows()<2)
+      {
+        m_isInitialized = true;
+        return *this;
+      }
+      m_hCoeffs.resize(matrix.rows()-1,1);
+      _compute(m_matrix, m_hCoeffs, m_temp);
+      m_isInitialized = true;
+      return *this;
+    }
+
+    /** \brief Returns the Householder coefficients.
+      *
+      * \returns a const reference to the vector of Householder coefficients
+      *
+      * \pre Either the constructor HessenbergDecomposition(const MatrixType&)
+      * or the member function compute(const MatrixType&) has been called
+      * before to compute the Hessenberg decomposition of a matrix.
+      *
+      * The Householder coefficients allow the reconstruction of the matrix
+      * \f$ Q \f$ in the Hessenberg decomposition from the packed data.
+      *
+      * \sa packedMatrix(), \ref Householder_Module "Householder module"
+      */
+    const CoeffVectorType& householderCoefficients() const
+    {
+      eigen_assert(m_isInitialized && "HessenbergDecomposition is not initialized.");
+      return m_hCoeffs;
+    }
+
+    /** \brief Returns the internal representation of the decomposition
+      *
+      *	\returns a const reference to a matrix with the internal representation
+      *	         of the decomposition.
+      *
+      * \pre Either the constructor HessenbergDecomposition(const MatrixType&)
+      * or the member function compute(const MatrixType&) has been called
+      * before to compute the Hessenberg decomposition of a matrix.
+      *
+      * The returned matrix contains the following information:
+      *  - the upper part and lower sub-diagonal represent the Hessenberg matrix H
+      *  - the rest of the lower part contains the Householder vectors that, combined with
+      *    Householder coefficients returned by householderCoefficients(),
+      *    allows to reconstruct the matrix Q as
+      *       \f$ Q = H_{N-1} \ldots H_1 H_0 \f$.
+      *    Here, the matrices \f$ H_i \f$ are the Householder transformations
+      *       \f$ H_i = (I - h_i v_i v_i^T) \f$
+      *    where \f$ h_i \f$ is the \f$ i \f$th Householder coefficient and
+      *    \f$ v_i \f$ is the Householder vector defined by
+      *       \f$ v_i = [ 0, \ldots, 0, 1, M(i+2,i), \ldots, M(N-1,i) ]^T \f$
+      *    with M the matrix returned by this function.
+      *
+      * See LAPACK for further details on this packed storage.
+      *
+      * Example: \include HessenbergDecomposition_packedMatrix.cpp
+      * Output: \verbinclude HessenbergDecomposition_packedMatrix.out
+      *
+      * \sa householderCoefficients()
+      */
+    const MatrixType& packedMatrix() const
+    {
+      eigen_assert(m_isInitialized && "HessenbergDecomposition is not initialized.");
+      return m_matrix;
+    }
+
+    /** \brief Reconstructs the orthogonal matrix Q in the decomposition
+      *
+      * \returns object representing the matrix Q
+      *
+      * \pre Either the constructor HessenbergDecomposition(const MatrixType&)
+      * or the member function compute(const MatrixType&) has been called
+      * before to compute the Hessenberg decomposition of a matrix.
+      *
+      * This function returns a light-weight object of template class
+      * HouseholderSequence. You can either apply it directly to a matrix or
+      * you can convert it to a matrix of type #MatrixType.
+      *
+      * \sa matrixH() for an example, class HouseholderSequence
+      */
+    HouseholderSequenceType matrixQ() const
+    {
+      eigen_assert(m_isInitialized && "HessenbergDecomposition is not initialized.");
+      return HouseholderSequenceType(m_matrix, m_hCoeffs.conjugate())
+             .setLength(m_matrix.rows() - 1)
+             .setShift(1);
+    }
+
+    /** \brief Constructs the Hessenberg matrix H in the decomposition
+      *
+      * \returns expression object representing the matrix H
+      *
+      * \pre Either the constructor HessenbergDecomposition(const MatrixType&)
+      * or the member function compute(const MatrixType&) has been called
+      * before to compute the Hessenberg decomposition of a matrix.
+      *
+      * The object returned by this function constructs the Hessenberg matrix H
+      * when it is assigned to a matrix or otherwise evaluated. The matrix H is
+      * constructed from the packed matrix as returned by packedMatrix(): The
+      * upper part (including the subdiagonal) of the packed matrix contains
+      * the matrix H. It may sometimes be better to directly use the packed
+      * matrix instead of constructing the matrix H.
+      *
+      * Example: \include HessenbergDecomposition_matrixH.cpp
+      * Output: \verbinclude HessenbergDecomposition_matrixH.out
+      *
+      * \sa matrixQ(), packedMatrix()
+      */
+    MatrixHReturnType matrixH() const
+    {
+      eigen_assert(m_isInitialized && "HessenbergDecomposition is not initialized.");
+      return MatrixHReturnType(*this);
+    }
+
+  private:
+
+    typedef Matrix<Scalar, 1, Size, Options | RowMajor, 1, MaxSize> VectorType;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    static void _compute(MatrixType& matA, CoeffVectorType& hCoeffs, VectorType& temp);
+
+  protected:
+    MatrixType m_matrix;
+    CoeffVectorType m_hCoeffs;
+    VectorType m_temp;
+    bool m_isInitialized;
+};
+
+/** \internal
+  * Performs a tridiagonal decomposition of \a matA in place.
+  *
+  * \param matA the input selfadjoint matrix
+  * \param hCoeffs returned Householder coefficients
+  *
+  * The result is written in the lower triangular part of \a matA.
+  *
+  * Implemented from Golub's "%Matrix Computations", algorithm 8.3.1.
+  *
+  * \sa packedMatrix()
+  */
+template<typename MatrixType>
+void HessenbergDecomposition<MatrixType>::_compute(MatrixType& matA, CoeffVectorType& hCoeffs, VectorType& temp)
+{
+  eigen_assert(matA.rows()==matA.cols());
+  Index n = matA.rows();
+  temp.resize(n);
+  for (Index i = 0; i<n-1; ++i)
+  {
+    // let's consider the vector v = i-th column starting at position i+1
+    Index remainingSize = n-i-1;
+    RealScalar beta;
+    Scalar h;
+    matA.col(i).tail(remainingSize).makeHouseholderInPlace(h, beta);
+    matA.col(i).coeffRef(i+1) = beta;
+    hCoeffs.coeffRef(i) = h;
+
+    // Apply similarity transformation to remaining columns,
+    // i.e., compute A = H A H'
+
+    // A = H A
+    matA.bottomRightCorner(remainingSize, remainingSize)
+        .applyHouseholderOnTheLeft(matA.col(i).tail(remainingSize-1), h, &temp.coeffRef(0));
+
+    // A = A H'
+    matA.rightCols(remainingSize)
+        .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1).conjugate(), numext::conj(h), &temp.coeffRef(0));
+  }
+}
+
+namespace internal {
+
+/** \eigenvalues_module \ingroup Eigenvalues_Module
+  *
+  *
+  * \brief Expression type for return value of HessenbergDecomposition::matrixH()
+  *
+  * \tparam MatrixType type of matrix in the Hessenberg decomposition
+  *
+  * Objects of this type represent the Hessenberg matrix in the Hessenberg
+  * decomposition of some matrix. The object holds a reference to the
+  * HessenbergDecomposition class until the it is assigned or evaluated for
+  * some other reason (the reference should remain valid during the life time
+  * of this object). This class is the return type of
+  * HessenbergDecomposition::matrixH(); there is probably no other use for this
+  * class.
+  */
+template<typename MatrixType> struct HessenbergDecompositionMatrixHReturnType
+: public ReturnByValue<HessenbergDecompositionMatrixHReturnType<MatrixType> >
+{
+    typedef typename MatrixType::Index Index;
+  public:
+    /** \brief Constructor.
+      *
+      * \param[in] hess  Hessenberg decomposition
+      */
+    HessenbergDecompositionMatrixHReturnType(const HessenbergDecomposition<MatrixType>& hess) : m_hess(hess) { }
+
+    /** \brief Hessenberg matrix in decomposition.
+      *
+      * \param[out] result  Hessenberg matrix in decomposition \p hess which
+      *                     was passed to the constructor
+      */
+    template <typename ResultType>
+    inline void evalTo(ResultType& result) const
+    {
+      result = m_hess.packedMatrix();
+      Index n = result.rows();
+      if (n>2)
+        result.bottomLeftCorner(n-2, n-2).template triangularView<Lower>().setZero();
+    }
+
+    Index rows() const { return m_hess.packedMatrix().rows(); }
+    Index cols() const { return m_hess.packedMatrix().cols(); }
+
+  protected:
+    const HessenbergDecomposition<MatrixType>& m_hess;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_HESSENBERGDECOMPOSITION_H
diff --git a/include/eigen3/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/include/eigen3/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
new file mode 100644
index 0000000..4fec8af
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
@@ -0,0 +1,160 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2010 Jitse Niesen <jitse at maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIXBASEEIGENVALUES_H
+#define EIGEN_MATRIXBASEEIGENVALUES_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Derived, bool IsComplex>
+struct eigenvalues_selector
+{
+  // this is the implementation for the case IsComplex = true
+  static inline typename MatrixBase<Derived>::EigenvaluesReturnType const
+  run(const MatrixBase<Derived>& m)
+  {
+    typedef typename Derived::PlainObject PlainObject;
+    PlainObject m_eval(m);
+    return ComplexEigenSolver<PlainObject>(m_eval, false).eigenvalues();
+  }
+};
+
+template<typename Derived>
+struct eigenvalues_selector<Derived, false>
+{
+  static inline typename MatrixBase<Derived>::EigenvaluesReturnType const
+  run(const MatrixBase<Derived>& m)
+  {
+    typedef typename Derived::PlainObject PlainObject;
+    PlainObject m_eval(m);
+    return EigenSolver<PlainObject>(m_eval, false).eigenvalues();
+  }
+};
+
+} // end namespace internal
+
+/** \brief Computes the eigenvalues of a matrix 
+  * \returns Column vector containing the eigenvalues.
+  *
+  * \eigenvalues_module
+  * This function computes the eigenvalues with the help of the EigenSolver
+  * class (for real matrices) or the ComplexEigenSolver class (for complex
+  * matrices). 
+  *
+  * The eigenvalues are repeated according to their algebraic multiplicity,
+  * so there are as many eigenvalues as rows in the matrix.
+  *
+  * The SelfAdjointView class provides a better algorithm for selfadjoint
+  * matrices.
+  *
+  * Example: \include MatrixBase_eigenvalues.cpp
+  * Output: \verbinclude MatrixBase_eigenvalues.out
+  *
+  * \sa EigenSolver::eigenvalues(), ComplexEigenSolver::eigenvalues(),
+  *     SelfAdjointView::eigenvalues()
+  */
+template<typename Derived>
+inline typename MatrixBase<Derived>::EigenvaluesReturnType
+MatrixBase<Derived>::eigenvalues() const
+{
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+  return internal::eigenvalues_selector<Derived, NumTraits<Scalar>::IsComplex>::run(derived());
+}
+
+/** \brief Computes the eigenvalues of a matrix
+  * \returns Column vector containing the eigenvalues.
+  *
+  * \eigenvalues_module
+  * This function computes the eigenvalues with the help of the
+  * SelfAdjointEigenSolver class.  The eigenvalues are repeated according to
+  * their algebraic multiplicity, so there are as many eigenvalues as rows in
+  * the matrix.
+  *
+  * Example: \include SelfAdjointView_eigenvalues.cpp
+  * Output: \verbinclude SelfAdjointView_eigenvalues.out
+  *
+  * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues()
+  */
+template<typename MatrixType, unsigned int UpLo> 
+inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
+SelfAdjointView<MatrixType, UpLo>::eigenvalues() const
+{
+  typedef typename SelfAdjointView<MatrixType, UpLo>::PlainObject PlainObject;
+  PlainObject thisAsMatrix(*this);
+  return SelfAdjointEigenSolver<PlainObject>(thisAsMatrix, false).eigenvalues();
+}
+
+
+
+/** \brief Computes the L2 operator norm
+  * \returns Operator norm of the matrix.
+  *
+  * \eigenvalues_module
+  * This function computes the L2 operator norm of a matrix, which is also
+  * known as the spectral norm. The norm of a matrix \f$ A \f$ is defined to be
+  * \f[ \|A\|_2 = \max_x \frac{\|Ax\|_2}{\|x\|_2} \f]
+  * where the maximum is over all vectors and the norm on the right is the
+  * Euclidean vector norm. The norm equals the largest singular value, which is
+  * the square root of the largest eigenvalue of the positive semi-definite
+  * matrix \f$ A^*A \f$.
+  *
+  * The current implementation uses the eigenvalues of \f$ A^*A \f$, as computed
+  * by SelfAdjointView::eigenvalues(), to compute the operator norm of a
+  * matrix.  The SelfAdjointView class provides a better algorithm for
+  * selfadjoint matrices.
+  *
+  * Example: \include MatrixBase_operatorNorm.cpp
+  * Output: \verbinclude MatrixBase_operatorNorm.out
+  *
+  * \sa SelfAdjointView::eigenvalues(), SelfAdjointView::operatorNorm()
+  */
+template<typename Derived>
+inline typename MatrixBase<Derived>::RealScalar
+MatrixBase<Derived>::operatorNorm() const
+{
+  using std::sqrt;
+  typename Derived::PlainObject m_eval(derived());
+  // FIXME if it is really guaranteed that the eigenvalues are already sorted,
+  // then we don't need to compute a maxCoeff() here, comparing the 1st and last ones is enough.
+  return sqrt((m_eval*m_eval.adjoint())
+                 .eval()
+		 .template selfadjointView<Lower>()
+		 .eigenvalues()
+		 .maxCoeff()
+		 );
+}
+
+/** \brief Computes the L2 operator norm
+  * \returns Operator norm of the matrix.
+  *
+  * \eigenvalues_module
+  * This function computes the L2 operator norm of a self-adjoint matrix. For a
+  * self-adjoint matrix, the operator norm is the largest eigenvalue.
+  *
+  * The current implementation uses the eigenvalues of the matrix, as computed
+  * by eigenvalues(), to compute the operator norm of the matrix.
+  *
+  * Example: \include SelfAdjointView_operatorNorm.cpp
+  * Output: \verbinclude SelfAdjointView_operatorNorm.out
+  *
+  * \sa eigenvalues(), MatrixBase::operatorNorm()
+  */
+template<typename MatrixType, unsigned int UpLo>
+inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
+SelfAdjointView<MatrixType, UpLo>::operatorNorm() const
+{
+  return eigenvalues().cwiseAbs().maxCoeff();
+}
+
+} // end namespace Eigen
+
+#endif
diff --git a/include/eigen3/Eigen/src/Eigenvalues/RealQZ.h b/include/eigen3/Eigen/src/Eigenvalues/RealQZ.h
new file mode 100644
index 0000000..4f36091
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigenvalues/RealQZ.h
@@ -0,0 +1,624 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Alexey Korepanov <kaikaikai at yandex.ru>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REAL_QZ_H
+#define EIGEN_REAL_QZ_H
+
+namespace Eigen {
+
+  /** \eigenvalues_module \ingroup Eigenvalues_Module
+   *
+   *
+   * \class RealQZ
+   *
+   * \brief Performs a real QZ decomposition of a pair of square matrices
+   *
+   * \tparam _MatrixType the type of the matrix of which we are computing the
+   * real QZ decomposition; this is expected to be an instantiation of the
+   * Matrix class template.
+   *
+   * Given a real square matrices A and B, this class computes the real QZ
+   * decomposition: \f$ A = Q S Z \f$, \f$ B = Q T Z \f$ where Q and Z are
+   * real orthogonal matrixes, T is upper-triangular matrix, and S is upper
+   * quasi-triangular matrix. An orthogonal matrix is a matrix whose
+   * inverse is equal to its transpose, \f$ U^{-1} = U^T \f$. A quasi-triangular
+   * matrix is a block-triangular matrix whose diagonal consists of 1-by-1
+   * blocks and 2-by-2 blocks where further reduction is impossible due to
+   * complex eigenvalues. 
+   *
+   * The eigenvalues of the pencil \f$ A - z B \f$ can be obtained from
+   * 1x1 and 2x2 blocks on the diagonals of S and T.
+   *
+   * Call the function compute() to compute the real QZ decomposition of a
+   * given pair of matrices. Alternatively, you can use the 
+   * RealQZ(const MatrixType& B, const MatrixType& B, bool computeQZ)
+   * constructor which computes the real QZ decomposition at construction
+   * time. Once the decomposition is computed, you can use the matrixS(),
+   * matrixT(), matrixQ() and matrixZ() functions to retrieve the matrices
+   * S, T, Q and Z in the decomposition. If computeQZ==false, some time
+   * is saved by not computing matrices Q and Z.
+   *
+   * Example: \include RealQZ_compute.cpp
+   * Output: \include RealQZ_compute.out
+   *
+   * \note The implementation is based on the algorithm in "Matrix Computations"
+   * by Gene H. Golub and Charles F. Van Loan, and a paper "An algorithm for
+   * generalized eigenvalue problems" by C.B.Moler and G.W.Stewart.
+   *
+   * \sa class RealSchur, class ComplexSchur, class EigenSolver, class ComplexEigenSolver
+   */
+
+  template<typename _MatrixType> class RealQZ
+  {
+    public:
+      typedef _MatrixType MatrixType;
+      enum {
+        RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+        ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+        Options = MatrixType::Options,
+        MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+        MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+      };
+      typedef typename MatrixType::Scalar Scalar;
+      typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
+      typedef typename MatrixType::Index Index;
+
+      typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
+      typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
+
+      /** \brief Default constructor.
+       *
+       * \param [in] size  Positive integer, size of the matrix whose QZ decomposition will be computed.
+       *
+       * The default constructor is useful in cases in which the user intends to
+       * perform decompositions via compute().  The \p size parameter is only
+       * used as a hint. It is not an error to give a wrong \p size, but it may
+       * impair performance.
+       *
+       * \sa compute() for an example.
+       */
+      RealQZ(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime) : 
+        m_S(size, size),
+        m_T(size, size),
+        m_Q(size, size),
+        m_Z(size, size),
+        m_workspace(size*2),
+        m_maxIters(400),
+        m_isInitialized(false)
+        { }
+
+      /** \brief Constructor; computes real QZ decomposition of given matrices
+       * 
+       * \param[in]  A          Matrix A.
+       * \param[in]  B          Matrix B.
+       * \param[in]  computeQZ  If false, A and Z are not computed.
+       *
+       * This constructor calls compute() to compute the QZ decomposition.
+       */
+      RealQZ(const MatrixType& A, const MatrixType& B, bool computeQZ = true) :
+        m_S(A.rows(),A.cols()),
+        m_T(A.rows(),A.cols()),
+        m_Q(A.rows(),A.cols()),
+        m_Z(A.rows(),A.cols()),
+        m_workspace(A.rows()*2),
+        m_maxIters(400),
+        m_isInitialized(false) {
+          compute(A, B, computeQZ);
+        }
+
+      /** \brief Returns matrix Q in the QZ decomposition. 
+       *
+       * \returns A const reference to the matrix Q.
+       */
+      const MatrixType& matrixQ() const {
+        eigen_assert(m_isInitialized && "RealQZ is not initialized.");
+        eigen_assert(m_computeQZ && "The matrices Q and Z have not been computed during the QZ decomposition.");
+        return m_Q;
+      }
+
+      /** \brief Returns matrix Z in the QZ decomposition. 
+       *
+       * \returns A const reference to the matrix Z.
+       */
+      const MatrixType& matrixZ() const {
+        eigen_assert(m_isInitialized && "RealQZ is not initialized.");
+        eigen_assert(m_computeQZ && "The matrices Q and Z have not been computed during the QZ decomposition.");
+        return m_Z;
+      }
+
+      /** \brief Returns matrix S in the QZ decomposition. 
+       *
+       * \returns A const reference to the matrix S.
+       */
+      const MatrixType& matrixS() const {
+        eigen_assert(m_isInitialized && "RealQZ is not initialized.");
+        return m_S;
+      }
+
+      /** \brief Returns matrix S in the QZ decomposition. 
+       *
+       * \returns A const reference to the matrix S.
+       */
+      const MatrixType& matrixT() const {
+        eigen_assert(m_isInitialized && "RealQZ is not initialized.");
+        return m_T;
+      }
+
+      /** \brief Computes QZ decomposition of given matrix. 
+       * 
+       * \param[in]  A          Matrix A.
+       * \param[in]  B          Matrix B.
+       * \param[in]  computeQZ  If false, A and Z are not computed.
+       * \returns    Reference to \c *this
+       */
+      RealQZ& compute(const MatrixType& A, const MatrixType& B, bool computeQZ = true);
+
+      /** \brief Reports whether previous computation was successful.
+       *
+       * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+       */
+      ComputationInfo info() const
+      {
+        eigen_assert(m_isInitialized && "RealQZ is not initialized.");
+        return m_info;
+      }
+
+      /** \brief Returns number of performed QR-like iterations.
+      */
+      Index iterations() const
+      {
+        eigen_assert(m_isInitialized && "RealQZ is not initialized.");
+        return m_global_iter;
+      }
+
+      /** Sets the maximal number of iterations allowed to converge to one eigenvalue
+       * or decouple the problem.
+      */
+      RealQZ& setMaxIterations(Index maxIters)
+      {
+        m_maxIters = maxIters;
+        return *this;
+      }
+
+    private:
+
+      MatrixType m_S, m_T, m_Q, m_Z;
+      Matrix<Scalar,Dynamic,1> m_workspace;
+      ComputationInfo m_info;
+      Index m_maxIters;
+      bool m_isInitialized;
+      bool m_computeQZ;
+      Scalar m_normOfT, m_normOfS;
+      Index m_global_iter;
+
+      typedef Matrix<Scalar,3,1> Vector3s;
+      typedef Matrix<Scalar,2,1> Vector2s;
+      typedef Matrix<Scalar,2,2> Matrix2s;
+      typedef JacobiRotation<Scalar> JRs;
+
+      void hessenbergTriangular();
+      void computeNorms();
+      Index findSmallSubdiagEntry(Index iu);
+      Index findSmallDiagEntry(Index f, Index l);
+      void splitOffTwoRows(Index i);
+      void pushDownZero(Index z, Index f, Index l);
+      void step(Index f, Index l, Index iter);
+
+  }; // RealQZ
+
+  /** \internal Reduces S and T to upper Hessenberg - triangular form */
+  template<typename MatrixType>
+    void RealQZ<MatrixType>::hessenbergTriangular()
+    {
+
+      const Index dim = m_S.cols();
+
+      // perform QR decomposition of T, overwrite T with R, save Q
+      HouseholderQR<MatrixType> qrT(m_T);
+      m_T = qrT.matrixQR();
+      m_T.template triangularView<StrictlyLower>().setZero();
+      m_Q = qrT.householderQ();
+      // overwrite S with Q* S
+      m_S.applyOnTheLeft(m_Q.adjoint());
+      // init Z as Identity
+      if (m_computeQZ)
+        m_Z = MatrixType::Identity(dim,dim);
+      // reduce S to upper Hessenberg with Givens rotations
+      for (Index j=0; j<=dim-3; j++) {
+        for (Index i=dim-1; i>=j+2; i--) {
+          JRs G;
+          // kill S(i,j)
+          if(m_S.coeff(i,j) != 0)
+          {
+            G.makeGivens(m_S.coeff(i-1,j), m_S.coeff(i,j), &m_S.coeffRef(i-1, j));
+            m_S.coeffRef(i,j) = Scalar(0.0);
+            m_S.rightCols(dim-j-1).applyOnTheLeft(i-1,i,G.adjoint());
+            m_T.rightCols(dim-i+1).applyOnTheLeft(i-1,i,G.adjoint());
+          }
+          // update Q
+          if (m_computeQZ)
+            m_Q.applyOnTheRight(i-1,i,G);
+          // kill T(i,i-1)
+          if(m_T.coeff(i,i-1)!=Scalar(0))
+          {
+            G.makeGivens(m_T.coeff(i,i), m_T.coeff(i,i-1), &m_T.coeffRef(i,i));
+            m_T.coeffRef(i,i-1) = Scalar(0.0);
+            m_S.applyOnTheRight(i,i-1,G);
+            m_T.topRows(i).applyOnTheRight(i,i-1,G);
+          }
+          // update Z
+          if (m_computeQZ)
+            m_Z.applyOnTheLeft(i,i-1,G.adjoint());
+        }
+      }
+    }
+
+  /** \internal Computes vector L1 norms of S and T when in Hessenberg-Triangular form already */
+  template<typename MatrixType>
+    inline void RealQZ<MatrixType>::computeNorms()
+    {
+      const Index size = m_S.cols();
+      m_normOfS = Scalar(0.0);
+      m_normOfT = Scalar(0.0);
+      for (Index j = 0; j < size; ++j)
+      {
+        m_normOfS += m_S.col(j).segment(0, (std::min)(size,j+2)).cwiseAbs().sum();
+        m_normOfT += m_T.row(j).segment(j, size - j).cwiseAbs().sum();
+      }
+    }
+
+
+  /** \internal Look for single small sub-diagonal element S(res, res-1) and return res (or 0) */
+  template<typename MatrixType>
+    inline typename MatrixType::Index RealQZ<MatrixType>::findSmallSubdiagEntry(Index iu)
+    {
+      using std::abs;
+      Index res = iu;
+      while (res > 0)
+      {
+        Scalar s = abs(m_S.coeff(res-1,res-1)) + abs(m_S.coeff(res,res));
+        if (s == Scalar(0.0))
+          s = m_normOfS;
+        if (abs(m_S.coeff(res,res-1)) < NumTraits<Scalar>::epsilon() * s)
+          break;
+        res--;
+      }
+      return res;
+    }
+
+  /** \internal Look for single small diagonal element T(res, res) for res between f and l, and return res (or f-1)  */
+  template<typename MatrixType>
+    inline typename MatrixType::Index RealQZ<MatrixType>::findSmallDiagEntry(Index f, Index l)
+    {
+      using std::abs;
+      Index res = l;
+      while (res >= f) {
+        if (abs(m_T.coeff(res,res)) <= NumTraits<Scalar>::epsilon() * m_normOfT)
+          break;
+        res--;
+      }
+      return res;
+    }
+
+  /** \internal decouple 2x2 diagonal block in rows i, i+1 if eigenvalues are real */
+  template<typename MatrixType>
+    inline void RealQZ<MatrixType>::splitOffTwoRows(Index i)
+    {
+      using std::abs;
+      using std::sqrt;
+      const Index dim=m_S.cols();
+      if (abs(m_S.coeff(i+1,i))==Scalar(0))
+        return;
+      Index z = findSmallDiagEntry(i,i+1);
+      if (z==i-1)
+      {
+        // block of (S T^{-1})
+        Matrix2s STi = m_T.template block<2,2>(i,i).template triangularView<Upper>().
+          template solve<OnTheRight>(m_S.template block<2,2>(i,i));
+        Scalar p = Scalar(0.5)*(STi(0,0)-STi(1,1));
+        Scalar q = p*p + STi(1,0)*STi(0,1);
+        if (q>=0) {
+          Scalar z = sqrt(q);
+          // one QR-like iteration for ABi - lambda I
+          // is enough - when we know exact eigenvalue in advance,
+          // convergence is immediate
+          JRs G;
+          if (p>=0)
+            G.makeGivens(p + z, STi(1,0));
+          else
+            G.makeGivens(p - z, STi(1,0));
+          m_S.rightCols(dim-i).applyOnTheLeft(i,i+1,G.adjoint());
+          m_T.rightCols(dim-i).applyOnTheLeft(i,i+1,G.adjoint());
+          // update Q
+          if (m_computeQZ)
+            m_Q.applyOnTheRight(i,i+1,G);
+
+          G.makeGivens(m_T.coeff(i+1,i+1), m_T.coeff(i+1,i));
+          m_S.topRows(i+2).applyOnTheRight(i+1,i,G);
+          m_T.topRows(i+2).applyOnTheRight(i+1,i,G);
+          // update Z
+          if (m_computeQZ)
+            m_Z.applyOnTheLeft(i+1,i,G.adjoint());
+
+          m_S.coeffRef(i+1,i) = Scalar(0.0);
+          m_T.coeffRef(i+1,i) = Scalar(0.0);
+        }
+      }
+      else
+      {
+        pushDownZero(z,i,i+1);
+      }
+    }
+
+  /** \internal use zero in T(z,z) to zero S(l,l-1), working in block f..l */
+  template<typename MatrixType>
+    inline void RealQZ<MatrixType>::pushDownZero(Index z, Index f, Index l)
+    {
+      JRs G;
+      const Index dim = m_S.cols();
+      for (Index zz=z; zz<l; zz++)
+      {
+        // push 0 down
+        Index firstColS = zz>f ? (zz-1) : zz;
+        G.makeGivens(m_T.coeff(zz, zz+1), m_T.coeff(zz+1, zz+1));
+        m_S.rightCols(dim-firstColS).applyOnTheLeft(zz,zz+1,G.adjoint());
+        m_T.rightCols(dim-zz).applyOnTheLeft(zz,zz+1,G.adjoint());
+        m_T.coeffRef(zz+1,zz+1) = Scalar(0.0);
+        // update Q
+        if (m_computeQZ)
+          m_Q.applyOnTheRight(zz,zz+1,G);
+        // kill S(zz+1, zz-1)
+        if (zz>f)
+        {
+          G.makeGivens(m_S.coeff(zz+1, zz), m_S.coeff(zz+1,zz-1));
+          m_S.topRows(zz+2).applyOnTheRight(zz, zz-1,G);
+          m_T.topRows(zz+1).applyOnTheRight(zz, zz-1,G);
+          m_S.coeffRef(zz+1,zz-1) = Scalar(0.0);
+          // update Z
+          if (m_computeQZ)
+            m_Z.applyOnTheLeft(zz,zz-1,G.adjoint());
+        }
+      }
+      // finally kill S(l,l-1)
+      G.makeGivens(m_S.coeff(l,l), m_S.coeff(l,l-1));
+      m_S.applyOnTheRight(l,l-1,G);
+      m_T.applyOnTheRight(l,l-1,G);
+      m_S.coeffRef(l,l-1)=Scalar(0.0);
+      // update Z
+      if (m_computeQZ)
+        m_Z.applyOnTheLeft(l,l-1,G.adjoint());
+    }
+
+  /** \internal QR-like iterative step for block f..l */
+  template<typename MatrixType>
+    inline void RealQZ<MatrixType>::step(Index f, Index l, Index iter)
+    {
+      using std::abs;
+      const Index dim = m_S.cols();
+
+      // x, y, z
+      Scalar x, y, z;
+      if (iter==10)
+      {
+        // Wilkinson ad hoc shift
+        const Scalar
+          a11=m_S.coeff(f+0,f+0), a12=m_S.coeff(f+0,f+1),
+          a21=m_S.coeff(f+1,f+0), a22=m_S.coeff(f+1,f+1), a32=m_S.coeff(f+2,f+1),
+          b12=m_T.coeff(f+0,f+1),
+          b11i=Scalar(1.0)/m_T.coeff(f+0,f+0),
+          b22i=Scalar(1.0)/m_T.coeff(f+1,f+1),
+          a87=m_S.coeff(l-1,l-2),
+          a98=m_S.coeff(l-0,l-1),
+          b77i=Scalar(1.0)/m_T.coeff(l-2,l-2),
+          b88i=Scalar(1.0)/m_T.coeff(l-1,l-1);
+        Scalar ss = abs(a87*b77i) + abs(a98*b88i),
+               lpl = Scalar(1.5)*ss,
+               ll = ss*ss;
+        x = ll + a11*a11*b11i*b11i - lpl*a11*b11i + a12*a21*b11i*b22i
+          - a11*a21*b12*b11i*b11i*b22i;
+        y = a11*a21*b11i*b11i - lpl*a21*b11i + a21*a22*b11i*b22i 
+          - a21*a21*b12*b11i*b11i*b22i;
+        z = a21*a32*b11i*b22i;
+      }
+      else if (iter==16)
+      {
+        // another exceptional shift
+        x = m_S.coeff(f,f)/m_T.coeff(f,f)-m_S.coeff(l,l)/m_T.coeff(l,l) + m_S.coeff(l,l-1)*m_T.coeff(l-1,l) /
+          (m_T.coeff(l-1,l-1)*m_T.coeff(l,l));
+        y = m_S.coeff(f+1,f)/m_T.coeff(f,f);
+        z = 0;
+      }
+      else if (iter>23 && !(iter%8))
+      {
+        // extremely exceptional shift
+        x = internal::random<Scalar>(-1.0,1.0);
+        y = internal::random<Scalar>(-1.0,1.0);
+        z = internal::random<Scalar>(-1.0,1.0);
+      }
+      else
+      {
+        // Compute the shifts: (x,y,z,0...) = (AB^-1 - l1 I) (AB^-1 - l2 I) e1
+        // where l1 and l2 are the eigenvalues of the 2x2 matrix C = U V^-1 where
+        // U and V are 2x2 bottom right sub matrices of A and B. Thus:
+        //  = AB^-1AB^-1 + l1 l2 I - (l1+l2)(AB^-1)
+        //  = AB^-1AB^-1 + det(M) - tr(M)(AB^-1)
+        // Since we are only interested in having x, y, z with a correct ratio, we have:
+        const Scalar
+          a11 = m_S.coeff(f,f),     a12 = m_S.coeff(f,f+1),
+          a21 = m_S.coeff(f+1,f),   a22 = m_S.coeff(f+1,f+1),
+                                    a32 = m_S.coeff(f+2,f+1),
+
+          a88 = m_S.coeff(l-1,l-1), a89 = m_S.coeff(l-1,l),
+          a98 = m_S.coeff(l,l-1),   a99 = m_S.coeff(l,l),
+
+          b11 = m_T.coeff(f,f),     b12 = m_T.coeff(f,f+1),
+                                    b22 = m_T.coeff(f+1,f+1),
+
+          b88 = m_T.coeff(l-1,l-1), b89 = m_T.coeff(l-1,l),
+                                    b99 = m_T.coeff(l,l);
+
+        x = ( (a88/b88 - a11/b11)*(a99/b99 - a11/b11) - (a89/b99)*(a98/b88) + (a98/b88)*(b89/b99)*(a11/b11) ) * (b11/a21)
+          + a12/b22 - (a11/b11)*(b12/b22);
+        y = (a22/b22-a11/b11) - (a21/b11)*(b12/b22) - (a88/b88-a11/b11) - (a99/b99-a11/b11) + (a98/b88)*(b89/b99);
+        z = a32/b22;
+      }
+
+      JRs G;
+
+      for (Index k=f; k<=l-2; k++)
+      {
+        // variables for Householder reflections
+        Vector2s essential2;
+        Scalar tau, beta;
+
+        Vector3s hr(x,y,z);
+
+        // Q_k to annihilate S(k+1,k-1) and S(k+2,k-1)
+        hr.makeHouseholderInPlace(tau, beta);
+        essential2 = hr.template bottomRows<2>();
+        Index fc=(std::max)(k-1,Index(0));  // first col to update
+        m_S.template middleRows<3>(k).rightCols(dim-fc).applyHouseholderOnTheLeft(essential2, tau, m_workspace.data());
+        m_T.template middleRows<3>(k).rightCols(dim-fc).applyHouseholderOnTheLeft(essential2, tau, m_workspace.data());
+        if (m_computeQZ)
+          m_Q.template middleCols<3>(k).applyHouseholderOnTheRight(essential2, tau, m_workspace.data());
+        if (k>f)
+          m_S.coeffRef(k+2,k-1) = m_S.coeffRef(k+1,k-1) = Scalar(0.0);
+
+        // Z_{k1} to annihilate T(k+2,k+1) and T(k+2,k)
+        hr << m_T.coeff(k+2,k+2),m_T.coeff(k+2,k),m_T.coeff(k+2,k+1);
+        hr.makeHouseholderInPlace(tau, beta);
+        essential2 = hr.template bottomRows<2>();
+        {
+          Index lr = (std::min)(k+4,dim); // last row to update
+          Map<Matrix<Scalar,Dynamic,1> > tmp(m_workspace.data(),lr);
+          // S
+          tmp = m_S.template middleCols<2>(k).topRows(lr) * essential2;
+          tmp += m_S.col(k+2).head(lr);
+          m_S.col(k+2).head(lr) -= tau*tmp;
+          m_S.template middleCols<2>(k).topRows(lr) -= (tau*tmp) * essential2.adjoint();
+          // T
+          tmp = m_T.template middleCols<2>(k).topRows(lr) * essential2;
+          tmp += m_T.col(k+2).head(lr);
+          m_T.col(k+2).head(lr) -= tau*tmp;
+          m_T.template middleCols<2>(k).topRows(lr) -= (tau*tmp) * essential2.adjoint();
+        }
+        if (m_computeQZ)
+        {
+          // Z
+          Map<Matrix<Scalar,1,Dynamic> > tmp(m_workspace.data(),dim);
+          tmp = essential2.adjoint()*(m_Z.template middleRows<2>(k));
+          tmp += m_Z.row(k+2);
+          m_Z.row(k+2) -= tau*tmp;
+          m_Z.template middleRows<2>(k) -= essential2 * (tau*tmp);
+        }
+        m_T.coeffRef(k+2,k) = m_T.coeffRef(k+2,k+1) = Scalar(0.0);
+
+        // Z_{k2} to annihilate T(k+1,k)
+        G.makeGivens(m_T.coeff(k+1,k+1), m_T.coeff(k+1,k));
+        m_S.applyOnTheRight(k+1,k,G);
+        m_T.applyOnTheRight(k+1,k,G);
+        // update Z
+        if (m_computeQZ)
+          m_Z.applyOnTheLeft(k+1,k,G.adjoint());
+        m_T.coeffRef(k+1,k) = Scalar(0.0);
+
+        // update x,y,z
+        x = m_S.coeff(k+1,k);
+        y = m_S.coeff(k+2,k);
+        if (k < l-2)
+          z = m_S.coeff(k+3,k);
+      } // loop over k
+
+      // Q_{n-1} to annihilate y = S(l,l-2)
+      G.makeGivens(x,y);
+      m_S.applyOnTheLeft(l-1,l,G.adjoint());
+      m_T.applyOnTheLeft(l-1,l,G.adjoint());
+      if (m_computeQZ)
+        m_Q.applyOnTheRight(l-1,l,G);
+      m_S.coeffRef(l,l-2) = Scalar(0.0);
+
+      // Z_{n-1} to annihilate T(l,l-1)
+      G.makeGivens(m_T.coeff(l,l),m_T.coeff(l,l-1));
+      m_S.applyOnTheRight(l,l-1,G);
+      m_T.applyOnTheRight(l,l-1,G);
+      if (m_computeQZ)
+        m_Z.applyOnTheLeft(l,l-1,G.adjoint());
+      m_T.coeffRef(l,l-1) = Scalar(0.0);
+    }
+
+
+  template<typename MatrixType>
+    RealQZ<MatrixType>& RealQZ<MatrixType>::compute(const MatrixType& A_in, const MatrixType& B_in, bool computeQZ)
+    {
+
+      const Index dim = A_in.cols();
+
+      eigen_assert (A_in.rows()==dim && A_in.cols()==dim 
+          && B_in.rows()==dim && B_in.cols()==dim 
+          && "Need square matrices of the same dimension");
+
+      m_isInitialized = true;
+      m_computeQZ = computeQZ;
+      m_S = A_in; m_T = B_in;
+      m_workspace.resize(dim*2);
+      m_global_iter = 0;
+
+      // entrance point: hessenberg triangular decomposition
+      hessenbergTriangular();
+      // compute L1 vector norms of T, S into m_normOfS, m_normOfT
+      computeNorms();
+
+      Index l = dim-1, 
+            f, 
+            local_iter = 0;
+
+      while (l>0 && local_iter<m_maxIters)
+      {
+        f = findSmallSubdiagEntry(l);
+        // now rows and columns f..l (including) decouple from the rest of the problem
+        if (f>0) m_S.coeffRef(f,f-1) = Scalar(0.0);
+        if (f == l) // One root found
+        {
+          l--;
+          local_iter = 0;
+        }
+        else if (f == l-1) // Two roots found
+        {
+          splitOffTwoRows(f);
+          l -= 2;
+          local_iter = 0;
+        }
+        else // No convergence yet
+        {
+          // if there's zero on diagonal of T, we can isolate an eigenvalue with Givens rotations
+          Index z = findSmallDiagEntry(f,l);
+          if (z>=f)
+          {
+            // zero found
+            pushDownZero(z,f,l);
+          }
+          else
+          {
+            // We are sure now that S.block(f,f, l-f+1,l-f+1) is underuced upper-Hessenberg 
+            // and T.block(f,f, l-f+1,l-f+1) is invertible uper-triangular, which allows to
+            // apply a QR-like iteration to rows and columns f..l.
+            step(f,l, local_iter);
+            local_iter++;
+            m_global_iter++;
+          }
+        }
+      }
+      // check if we converged before reaching iterations limit
+      m_info = (local_iter<m_maxIters) ? Success : NoConvergence;
+      return *this;
+    } // end compute
+
+} // end namespace Eigen
+
+#endif //EIGEN_REAL_QZ
diff --git a/include/eigen3/Eigen/src/Eigenvalues/RealSchur.h b/include/eigen3/Eigen/src/Eigenvalues/RealSchur.h
new file mode 100644
index 0000000..64d1363
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigenvalues/RealSchur.h
@@ -0,0 +1,529 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2010,2012 Jitse Niesen <jitse at maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REAL_SCHUR_H
+#define EIGEN_REAL_SCHUR_H
+
+#include "./HessenbergDecomposition.h"
+
+namespace Eigen { 
+
+/** \eigenvalues_module \ingroup Eigenvalues_Module
+  *
+  *
+  * \class RealSchur
+  *
+  * \brief Performs a real Schur decomposition of a square matrix
+  *
+  * \tparam _MatrixType the type of the matrix of which we are computing the
+  * real Schur decomposition; this is expected to be an instantiation of the
+  * Matrix class template.
+  *
+  * Given a real square matrix A, this class computes the real Schur
+  * decomposition: \f$ A = U T U^T \f$ where U is a real orthogonal matrix and
+  * T is a real quasi-triangular matrix. An orthogonal matrix is a matrix whose
+  * inverse is equal to its transpose, \f$ U^{-1} = U^T \f$. A quasi-triangular
+  * matrix is a block-triangular matrix whose diagonal consists of 1-by-1
+  * blocks and 2-by-2 blocks with complex eigenvalues. The eigenvalues of the
+  * blocks on the diagonal of T are the same as the eigenvalues of the matrix
+  * A, and thus the real Schur decomposition is used in EigenSolver to compute
+  * the eigendecomposition of a matrix.
+  *
+  * Call the function compute() to compute the real Schur decomposition of a
+  * given matrix. Alternatively, you can use the RealSchur(const MatrixType&, bool)
+  * constructor which computes the real Schur decomposition at construction
+  * time. Once the decomposition is computed, you can use the matrixU() and
+  * matrixT() functions to retrieve the matrices U and T in the decomposition.
+  *
+  * The documentation of RealSchur(const MatrixType&, bool) contains an example
+  * of the typical use of this class.
+  *
+  * \note The implementation is adapted from
+  * <a href="http://math.nist.gov/javanumerics/jama/">JAMA</a> (public domain).
+  * Their code is based on EISPACK.
+  *
+  * \sa class ComplexSchur, class EigenSolver, class ComplexEigenSolver
+  */
+template<typename _MatrixType> class RealSchur
+{
+  public:
+    typedef _MatrixType MatrixType;
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      Options = MatrixType::Options,
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
+    typedef typename MatrixType::Index Index;
+
+    typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
+    typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
+
+    /** \brief Default constructor.
+      *
+      * \param [in] size  Positive integer, size of the matrix whose Schur decomposition will be computed.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via compute().  The \p size parameter is only
+      * used as a hint. It is not an error to give a wrong \p size, but it may
+      * impair performance.
+      *
+      * \sa compute() for an example.
+      */
+    RealSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
+            : m_matT(size, size),
+              m_matU(size, size),
+              m_workspaceVector(size),
+              m_hess(size),
+              m_isInitialized(false),
+              m_matUisUptodate(false),
+              m_maxIters(-1)
+    { }
+
+    /** \brief Constructor; computes real Schur decomposition of given matrix. 
+      * 
+      * \param[in]  matrix    Square matrix whose Schur decomposition is to be computed.
+      * \param[in]  computeU  If true, both T and U are computed; if false, only T is computed.
+      *
+      * This constructor calls compute() to compute the Schur decomposition.
+      *
+      * Example: \include RealSchur_RealSchur_MatrixType.cpp
+      * Output: \verbinclude RealSchur_RealSchur_MatrixType.out
+      */
+    RealSchur(const MatrixType& matrix, bool computeU = true)
+            : m_matT(matrix.rows(),matrix.cols()),
+              m_matU(matrix.rows(),matrix.cols()),
+              m_workspaceVector(matrix.rows()),
+              m_hess(matrix.rows()),
+              m_isInitialized(false),
+              m_matUisUptodate(false),
+              m_maxIters(-1)
+    {
+      compute(matrix, computeU);
+    }
+
+    /** \brief Returns the orthogonal matrix in the Schur decomposition. 
+      *
+      * \returns A const reference to the matrix U.
+      *
+      * \pre Either the constructor RealSchur(const MatrixType&, bool) or the
+      * member function compute(const MatrixType&, bool) has been called before
+      * to compute the Schur decomposition of a matrix, and \p computeU was set
+      * to true (the default value).
+      *
+      * \sa RealSchur(const MatrixType&, bool) for an example
+      */
+    const MatrixType& matrixU() const
+    {
+      eigen_assert(m_isInitialized && "RealSchur is not initialized.");
+      eigen_assert(m_matUisUptodate && "The matrix U has not been computed during the RealSchur decomposition.");
+      return m_matU;
+    }
+
+    /** \brief Returns the quasi-triangular matrix in the Schur decomposition. 
+      *
+      * \returns A const reference to the matrix T.
+      *
+      * \pre Either the constructor RealSchur(const MatrixType&, bool) or the
+      * member function compute(const MatrixType&, bool) has been called before
+      * to compute the Schur decomposition of a matrix.
+      *
+      * \sa RealSchur(const MatrixType&, bool) for an example
+      */
+    const MatrixType& matrixT() const
+    {
+      eigen_assert(m_isInitialized && "RealSchur is not initialized.");
+      return m_matT;
+    }
+  
+    /** \brief Computes Schur decomposition of given matrix. 
+      * 
+      * \param[in]  matrix    Square matrix whose Schur decomposition is to be computed.
+      * \param[in]  computeU  If true, both T and U are computed; if false, only T is computed.
+      * \returns    Reference to \c *this
+      *
+      * The Schur decomposition is computed by first reducing the matrix to
+      * Hessenberg form using the class HessenbergDecomposition. The Hessenberg
+      * matrix is then reduced to triangular form by performing Francis QR
+      * iterations with implicit double shift. The cost of computing the Schur
+      * decomposition depends on the number of iterations; as a rough guide, it
+      * may be taken to be \f$25n^3\f$ flops if \a computeU is true and
+      * \f$10n^3\f$ flops if \a computeU is false.
+      *
+      * Example: \include RealSchur_compute.cpp
+      * Output: \verbinclude RealSchur_compute.out
+      *
+      * \sa compute(const MatrixType&, bool, Index)
+      */
+    RealSchur& compute(const MatrixType& matrix, bool computeU = true);
+
+    /** \brief Computes Schur decomposition of a Hessenberg matrix H = Z T Z^T
+     *  \param[in] matrixH Matrix in Hessenberg form H
+     *  \param[in] matrixQ orthogonal matrix Q that transform a matrix A to H : A = Q H Q^T
+     *  \param computeU Computes the matriX U of the Schur vectors
+     * \return Reference to \c *this
+     * 
+     *  This routine assumes that the matrix is already reduced in Hessenberg form matrixH
+     *  using either the class HessenbergDecomposition or another mean. 
+     *  It computes the upper quasi-triangular matrix T of the Schur decomposition of H
+     *  When computeU is true, this routine computes the matrix U such that 
+     *  A = U T U^T =  (QZ) T (QZ)^T = Q H Q^T where A is the initial matrix
+     * 
+     * NOTE Q is referenced if computeU is true; so, if the initial orthogonal matrix
+     * is not available, the user should give an identity matrix (Q.setIdentity())
+     * 
+     * \sa compute(const MatrixType&, bool)
+     */
+    template<typename HessMatrixType, typename OrthMatrixType>
+    RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ,  bool computeU);
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "RealSchur is not initialized.");
+      return m_info;
+    }
+
+    /** \brief Sets the maximum number of iterations allowed. 
+      *
+      * If not specified by the user, the maximum number of iterations is m_maxIterationsPerRow times the size
+      * of the matrix.
+      */
+    RealSchur& setMaxIterations(Index maxIters)
+    {
+      m_maxIters = maxIters;
+      return *this;
+    }
+
+    /** \brief Returns the maximum number of iterations. */
+    Index getMaxIterations()
+    {
+      return m_maxIters;
+    }
+
+    /** \brief Maximum number of iterations per row.
+      *
+      * If not otherwise specified, the maximum number of iterations is this number times the size of the
+      * matrix. It is currently set to 40.
+      */
+    static const int m_maxIterationsPerRow = 40;
+
+  private:
+    
+    MatrixType m_matT;
+    MatrixType m_matU;
+    ColumnVectorType m_workspaceVector;
+    HessenbergDecomposition<MatrixType> m_hess;
+    ComputationInfo m_info;
+    bool m_isInitialized;
+    bool m_matUisUptodate;
+    Index m_maxIters;
+
+    typedef Matrix<Scalar,3,1> Vector3s;
+
+    Scalar computeNormOfT();
+    Index findSmallSubdiagEntry(Index iu, const Scalar& norm);
+    void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift);
+    void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo);
+    void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector);
+    void performFrancisQRStep(Index il, Index im, Index iu, bool computeU, const Vector3s& firstHouseholderVector, Scalar* workspace);
+};
+
+
+template<typename MatrixType>
+RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const MatrixType& matrix, bool computeU)
+{
+  eigen_assert(matrix.cols() == matrix.rows());
+  Index maxIters = m_maxIters;
+  if (maxIters == -1)
+    maxIters = m_maxIterationsPerRow * matrix.rows();
+
+  // Step 1. Reduce to Hessenberg form
+  m_hess.compute(matrix);
+
+  // Step 2. Reduce to real Schur form  
+  computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU);
+  
+  return *this;
+}
+template<typename MatrixType>
+template<typename HessMatrixType, typename OrthMatrixType>
+RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ,  bool computeU)
+{  
+  m_matT = matrixH; 
+  if(computeU)
+    m_matU = matrixQ;
+  
+  Index maxIters = m_maxIters;
+  if (maxIters == -1)
+    maxIters = m_maxIterationsPerRow * matrixH.rows();
+  m_workspaceVector.resize(m_matT.cols());
+  Scalar* workspace = &m_workspaceVector.coeffRef(0);
+
+  // The matrix m_matT is divided in three parts. 
+  // Rows 0,...,il-1 are decoupled from the rest because m_matT(il,il-1) is zero. 
+  // Rows il,...,iu is the part we are working on (the active window).
+  // Rows iu+1,...,end are already brought in triangular form.
+  Index iu = m_matT.cols() - 1;
+  Index iter = 0;      // iteration count for current eigenvalue
+  Index totalIter = 0; // iteration count for whole matrix
+  Scalar exshift(0);   // sum of exceptional shifts
+  Scalar norm = computeNormOfT();
+
+  if(norm!=0)
+  {
+    while (iu >= 0)
+    {
+      Index il = findSmallSubdiagEntry(iu, norm);
+
+      // Check for convergence
+      if (il == iu) // One root found
+      {
+        m_matT.coeffRef(iu,iu) = m_matT.coeff(iu,iu) + exshift;
+        if (iu > 0)
+          m_matT.coeffRef(iu, iu-1) = Scalar(0);
+        iu--;
+        iter = 0;
+      }
+      else if (il == iu-1) // Two roots found
+      {
+        splitOffTwoRows(iu, computeU, exshift);
+        iu -= 2;
+        iter = 0;
+      }
+      else // No convergence yet
+      {
+        // The firstHouseholderVector vector has to be initialized to something to get rid of a silly GCC warning (-O1 -Wall -DNDEBUG )
+        Vector3s firstHouseholderVector(0,0,0), shiftInfo;
+        computeShift(iu, iter, exshift, shiftInfo);
+        iter = iter + 1;
+        totalIter = totalIter + 1;
+        if (totalIter > maxIters) break;
+        Index im;
+        initFrancisQRStep(il, iu, shiftInfo, im, firstHouseholderVector);
+        performFrancisQRStep(il, im, iu, computeU, firstHouseholderVector, workspace);
+      }
+    }
+  }
+  if(totalIter <= maxIters)
+    m_info = Success;
+  else
+    m_info = NoConvergence;
+
+  m_isInitialized = true;
+  m_matUisUptodate = computeU;
+  return *this;
+}
+
+/** \internal Computes and returns vector L1 norm of T */
+template<typename MatrixType>
+inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT()
+{
+  const Index size = m_matT.cols();
+  // FIXME to be efficient the following would requires a triangular reduxion code
+  // Scalar norm = m_matT.upper().cwiseAbs().sum() 
+  //               + m_matT.bottomLeftCorner(size-1,size-1).diagonal().cwiseAbs().sum();
+  Scalar norm(0);
+  for (Index j = 0; j < size; ++j)
+    norm += m_matT.col(j).segment(0, (std::min)(size,j+2)).cwiseAbs().sum();
+  return norm;
+}
+
+/** \internal Look for single small sub-diagonal element and returns its index */
+template<typename MatrixType>
+inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu, const Scalar& norm)
+{
+  using std::abs;
+  Index res = iu;
+  while (res > 0)
+  {
+    Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res));
+    if (s == 0.0)
+      s = norm;
+    if (abs(m_matT.coeff(res,res-1)) < NumTraits<Scalar>::epsilon() * s)
+      break;
+    res--;
+  }
+  return res;
+}
+
+/** \internal Update T given that rows iu-1 and iu decouple from the rest. */
+template<typename MatrixType>
+inline void RealSchur<MatrixType>::splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift)
+{
+  using std::sqrt;
+  using std::abs;
+  const Index size = m_matT.cols();
+
+  // The eigenvalues of the 2x2 matrix [a b; c d] are 
+  // trace +/- sqrt(discr/4) where discr = tr^2 - 4*det, tr = a + d, det = ad - bc
+  Scalar p = Scalar(0.5) * (m_matT.coeff(iu-1,iu-1) - m_matT.coeff(iu,iu));
+  Scalar q = p * p + m_matT.coeff(iu,iu-1) * m_matT.coeff(iu-1,iu);   // q = tr^2 / 4 - det = discr/4
+  m_matT.coeffRef(iu,iu) += exshift;
+  m_matT.coeffRef(iu-1,iu-1) += exshift;
+
+  if (q >= Scalar(0)) // Two real eigenvalues
+  {
+    Scalar z = sqrt(abs(q));
+    JacobiRotation<Scalar> rot;
+    if (p >= Scalar(0))
+      rot.makeGivens(p + z, m_matT.coeff(iu, iu-1));
+    else
+      rot.makeGivens(p - z, m_matT.coeff(iu, iu-1));
+
+    m_matT.rightCols(size-iu+1).applyOnTheLeft(iu-1, iu, rot.adjoint());
+    m_matT.topRows(iu+1).applyOnTheRight(iu-1, iu, rot);
+    m_matT.coeffRef(iu, iu-1) = Scalar(0); 
+    if (computeU)
+      m_matU.applyOnTheRight(iu-1, iu, rot);
+  }
+
+  if (iu > 1) 
+    m_matT.coeffRef(iu-1, iu-2) = Scalar(0);
+}
+
+/** \internal Form shift in shiftInfo, and update exshift if an exceptional shift is performed. */
+template<typename MatrixType>
+inline void RealSchur<MatrixType>::computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo)
+{
+  using std::sqrt;
+  using std::abs;
+  shiftInfo.coeffRef(0) = m_matT.coeff(iu,iu);
+  shiftInfo.coeffRef(1) = m_matT.coeff(iu-1,iu-1);
+  shiftInfo.coeffRef(2) = m_matT.coeff(iu,iu-1) * m_matT.coeff(iu-1,iu);
+
+  // Wilkinson's original ad hoc shift
+  if (iter == 10)
+  {
+    exshift += shiftInfo.coeff(0);
+    for (Index i = 0; i <= iu; ++i)
+      m_matT.coeffRef(i,i) -= shiftInfo.coeff(0);
+    Scalar s = abs(m_matT.coeff(iu,iu-1)) + abs(m_matT.coeff(iu-1,iu-2));
+    shiftInfo.coeffRef(0) = Scalar(0.75) * s;
+    shiftInfo.coeffRef(1) = Scalar(0.75) * s;
+    shiftInfo.coeffRef(2) = Scalar(-0.4375) * s * s;
+  }
+
+  // MATLAB's new ad hoc shift
+  if (iter == 30)
+  {
+    Scalar s = (shiftInfo.coeff(1) - shiftInfo.coeff(0)) / Scalar(2.0);
+    s = s * s + shiftInfo.coeff(2);
+    if (s > Scalar(0))
+    {
+      s = sqrt(s);
+      if (shiftInfo.coeff(1) < shiftInfo.coeff(0))
+        s = -s;
+      s = s + (shiftInfo.coeff(1) - shiftInfo.coeff(0)) / Scalar(2.0);
+      s = shiftInfo.coeff(0) - shiftInfo.coeff(2) / s;
+      exshift += s;
+      for (Index i = 0; i <= iu; ++i)
+        m_matT.coeffRef(i,i) -= s;
+      shiftInfo.setConstant(Scalar(0.964));
+    }
+  }
+}
+
+/** \internal Compute index im at which Francis QR step starts and the first Householder vector. */
+template<typename MatrixType>
+inline void RealSchur<MatrixType>::initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector)
+{
+  using std::abs;
+  Vector3s& v = firstHouseholderVector; // alias to save typing
+
+  for (im = iu-2; im >= il; --im)
+  {
+    const Scalar Tmm = m_matT.coeff(im,im);
+    const Scalar r = shiftInfo.coeff(0) - Tmm;
+    const Scalar s = shiftInfo.coeff(1) - Tmm;
+    v.coeffRef(0) = (r * s - shiftInfo.coeff(2)) / m_matT.coeff(im+1,im) + m_matT.coeff(im,im+1);
+    v.coeffRef(1) = m_matT.coeff(im+1,im+1) - Tmm - r - s;
+    v.coeffRef(2) = m_matT.coeff(im+2,im+1);
+    if (im == il) {
+      break;
+    }
+    const Scalar lhs = m_matT.coeff(im,im-1) * (abs(v.coeff(1)) + abs(v.coeff(2)));
+    const Scalar rhs = v.coeff(0) * (abs(m_matT.coeff(im-1,im-1)) + abs(Tmm) + abs(m_matT.coeff(im+1,im+1)));
+    if (abs(lhs) < NumTraits<Scalar>::epsilon() * rhs)
+    {
+      break;
+    }
+  }
+}
+
+/** \internal Perform a Francis QR step involving rows il:iu and columns im:iu. */
+template<typename MatrixType>
+inline void RealSchur<MatrixType>::performFrancisQRStep(Index il, Index im, Index iu, bool computeU, const Vector3s& firstHouseholderVector, Scalar* workspace)
+{
+  eigen_assert(im >= il);
+  eigen_assert(im <= iu-2);
+
+  const Index size = m_matT.cols();
+
+  for (Index k = im; k <= iu-2; ++k)
+  {
+    bool firstIteration = (k == im);
+
+    Vector3s v;
+    if (firstIteration)
+      v = firstHouseholderVector;
+    else
+      v = m_matT.template block<3,1>(k,k-1);
+
+    Scalar tau, beta;
+    Matrix<Scalar, 2, 1> ess;
+    v.makeHouseholder(ess, tau, beta);
+    
+    if (beta != Scalar(0)) // if v is not zero
+    {
+      if (firstIteration && k > il)
+        m_matT.coeffRef(k,k-1) = -m_matT.coeff(k,k-1);
+      else if (!firstIteration)
+        m_matT.coeffRef(k,k-1) = beta;
+
+      // These Householder transformations form the O(n^3) part of the algorithm
+      m_matT.block(k, k, 3, size-k).applyHouseholderOnTheLeft(ess, tau, workspace);
+      m_matT.block(0, k, (std::min)(iu,k+3) + 1, 3).applyHouseholderOnTheRight(ess, tau, workspace);
+      if (computeU)
+        m_matU.block(0, k, size, 3).applyHouseholderOnTheRight(ess, tau, workspace);
+    }
+  }
+
+  Matrix<Scalar, 2, 1> v = m_matT.template block<2,1>(iu-1, iu-2);
+  Scalar tau, beta;
+  Matrix<Scalar, 1, 1> ess;
+  v.makeHouseholder(ess, tau, beta);
+
+  if (beta != Scalar(0)) // if v is not zero
+  {
+    m_matT.coeffRef(iu-1, iu-2) = beta;
+    m_matT.block(iu-1, iu-1, 2, size-iu+1).applyHouseholderOnTheLeft(ess, tau, workspace);
+    m_matT.block(0, iu-1, iu+1, 2).applyHouseholderOnTheRight(ess, tau, workspace);
+    if (computeU)
+      m_matU.block(0, iu-1, size, 2).applyHouseholderOnTheRight(ess, tau, workspace);
+  }
+
+  // clean up pollution due to round-off errors
+  for (Index i = im+2; i <= iu; ++i)
+  {
+    m_matT.coeffRef(i,i-2) = Scalar(0);
+    if (i > im+2)
+      m_matT.coeffRef(i,i-3) = Scalar(0);
+  }
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_REAL_SCHUR_H
diff --git a/include/eigen3/Eigen/src/Eigenvalues/RealSchur_MKL.h b/include/eigen3/Eigen/src/Eigenvalues/RealSchur_MKL.h
new file mode 100644
index 0000000..ad97364
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigenvalues/RealSchur_MKL.h
@@ -0,0 +1,83 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *    Real Schur needed to real unsymmetrical eigenvalues/eigenvectors.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_REAL_SCHUR_MKL_H
+#define EIGEN_REAL_SCHUR_MKL_H
+
+#include "Eigen/src/Core/util/MKL_support.h"
+
+namespace Eigen { 
+
+/** \internal Specialization for the data types supported by MKL */
+
+#define EIGEN_MKL_SCHUR_REAL(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \
+template<> inline \
+RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
+RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW>& matrix, bool computeU) \
+{ \
+  typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> MatrixType; \
+  typedef MatrixType::Scalar Scalar; \
+  typedef MatrixType::RealScalar RealScalar; \
+\
+  eigen_assert(matrix.cols() == matrix.rows()); \
+\
+  lapack_int n = matrix.cols(), sdim, info; \
+  lapack_int lda = matrix.outerStride(); \
+  lapack_int matrix_order = MKLCOLROW; \
+  char jobvs, sort='N'; \
+  LAPACK_##MKLPREFIX_U##_SELECT2 select = 0; \
+  jobvs = (computeU) ? 'V' : 'N'; \
+  m_matU.resize(n, n); \
+  lapack_int ldvs  = m_matU.outerStride(); \
+  m_matT = matrix; \
+  Matrix<EIGTYPE, Dynamic, Dynamic> wr, wi; \
+  wr.resize(n, 1); wi.resize(n, 1); \
+  info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)wr.data(), (MKLTYPE*)wi.data(), (MKLTYPE*)m_matU.data(), ldvs ); \
+  if(info == 0) \
+    m_info = Success; \
+  else \
+    m_info = NoConvergence; \
+\
+  m_isInitialized = true; \
+  m_matUisUptodate = computeU; \
+  return *this; \
+\
+}
+
+EIGEN_MKL_SCHUR_REAL(double,   double, d, D, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_MKL_SCHUR_REAL(float,    float,  s, S, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_MKL_SCHUR_REAL(double,   double, d, D, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_MKL_SCHUR_REAL(float,    float,  s, S, RowMajor, LAPACK_ROW_MAJOR)
+
+} // end namespace Eigen
+
+#endif // EIGEN_REAL_SCHUR_MKL_H
diff --git a/include/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/include/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
new file mode 100644
index 0000000..be89de4
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -0,0 +1,812 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2010 Jitse Niesen <jitse at maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINTEIGENSOLVER_H
+#define EIGEN_SELFADJOINTEIGENSOLVER_H
+
+#include "./Tridiagonalization.h"
+
+namespace Eigen { 
+
+template<typename _MatrixType>
+class GeneralizedSelfAdjointEigenSolver;
+
+namespace internal {
+template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues;
+}
+
+/** \eigenvalues_module \ingroup Eigenvalues_Module
+  *
+  *
+  * \class SelfAdjointEigenSolver
+  *
+  * \brief Computes eigenvalues and eigenvectors of selfadjoint matrices
+  *
+  * \tparam _MatrixType the type of the matrix of which we are computing the
+  * eigendecomposition; this is expected to be an instantiation of the Matrix
+  * class template.
+  *
+  * A matrix \f$ A \f$ is selfadjoint if it equals its adjoint. For real
+  * matrices, this means that the matrix is symmetric: it equals its
+  * transpose. This class computes the eigenvalues and eigenvectors of a
+  * selfadjoint matrix. These are the scalars \f$ \lambda \f$ and vectors
+  * \f$ v \f$ such that \f$ Av = \lambda v \f$.  The eigenvalues of a
+  * selfadjoint matrix are always real. If \f$ D \f$ is a diagonal matrix with
+  * the eigenvalues on the diagonal, and \f$ V \f$ is a matrix with the
+  * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$ (for selfadjoint
+  * matrices, the matrix \f$ V \f$ is always invertible). This is called the
+  * eigendecomposition.
+  *
+  * The algorithm exploits the fact that the matrix is selfadjoint, making it
+  * faster and more accurate than the general purpose eigenvalue algorithms
+  * implemented in EigenSolver and ComplexEigenSolver.
+  *
+  * Only the \b lower \b triangular \b part of the input matrix is referenced.
+  *
+  * Call the function compute() to compute the eigenvalues and eigenvectors of
+  * a given matrix. Alternatively, you can use the
+  * SelfAdjointEigenSolver(const MatrixType&, int) constructor which computes
+  * the eigenvalues and eigenvectors at construction time. Once the eigenvalue
+  * and eigenvectors are computed, they can be retrieved with the eigenvalues()
+  * and eigenvectors() functions.
+  *
+  * The documentation for SelfAdjointEigenSolver(const MatrixType&, int)
+  * contains an example of the typical use of this class.
+  *
+  * To solve the \em generalized eigenvalue problem \f$ Av = \lambda Bv \f$ and
+  * the likes, see the class GeneralizedSelfAdjointEigenSolver.
+  *
+  * \sa MatrixBase::eigenvalues(), class EigenSolver, class ComplexEigenSolver
+  */
+template<typename _MatrixType> class SelfAdjointEigenSolver
+{
+  public:
+
+    typedef _MatrixType MatrixType;
+    enum {
+      Size = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      Options = MatrixType::Options,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+    
+    /** \brief Scalar type for matrices of type \p _MatrixType. */
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::Index Index;
+
+    /** \brief Real scalar type for \p _MatrixType.
+      *
+      * This is just \c Scalar if #Scalar is real (e.g., \c float or
+      * \c double), and the type of the real part of \c Scalar if #Scalar is
+      * complex.
+      */
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    
+    friend struct internal::direct_selfadjoint_eigenvalues<SelfAdjointEigenSolver,Size,NumTraits<Scalar>::IsComplex>;
+
+    /** \brief Type for vector of eigenvalues as returned by eigenvalues().
+      *
+      * This is a column vector with entries of type #RealScalar.
+      * The length of the vector is the size of \p _MatrixType.
+      */
+    typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVectorType;
+    typedef Tridiagonalization<MatrixType> TridiagonalizationType;
+
+    /** \brief Default constructor for fixed-size matrices.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via compute(). This constructor
+      * can only be used if \p _MatrixType is a fixed-size matrix; use
+      * SelfAdjointEigenSolver(Index) for dynamic-size matrices.
+      *
+      * Example: \include SelfAdjointEigenSolver_SelfAdjointEigenSolver.cpp
+      * Output: \verbinclude SelfAdjointEigenSolver_SelfAdjointEigenSolver.out
+      */
+    SelfAdjointEigenSolver()
+        : m_eivec(),
+          m_eivalues(),
+          m_subdiag(),
+          m_isInitialized(false)
+    { }
+
+    /** \brief Constructor, pre-allocates memory for dynamic-size matrices.
+      *
+      * \param [in]  size  Positive integer, size of the matrix whose
+      * eigenvalues and eigenvectors will be computed.
+      *
+      * This constructor is useful for dynamic-size matrices, when the user
+      * intends to perform decompositions via compute(). The \p size
+      * parameter is only used as a hint. It is not an error to give a wrong
+      * \p size, but it may impair performance.
+      *
+      * \sa compute() for an example
+      */
+    SelfAdjointEigenSolver(Index size)
+        : m_eivec(size, size),
+          m_eivalues(size),
+          m_subdiag(size > 1 ? size - 1 : 1),
+          m_isInitialized(false)
+    {}
+
+    /** \brief Constructor; computes eigendecomposition of given matrix.
+      *
+      * \param[in]  matrix  Selfadjoint matrix whose eigendecomposition is to
+      *    be computed. Only the lower triangular part of the matrix is referenced.
+      * \param[in]  options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
+      *
+      * This constructor calls compute(const MatrixType&, int) to compute the
+      * eigenvalues of the matrix \p matrix. The eigenvectors are computed if
+      * \p options equals #ComputeEigenvectors.
+      *
+      * Example: \include SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType.cpp
+      * Output: \verbinclude SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType.out
+      *
+      * \sa compute(const MatrixType&, int)
+      */
+    SelfAdjointEigenSolver(const MatrixType& matrix, int options = ComputeEigenvectors)
+      : m_eivec(matrix.rows(), matrix.cols()),
+        m_eivalues(matrix.cols()),
+        m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1),
+        m_isInitialized(false)
+    {
+      compute(matrix, options);
+    }
+
+    /** \brief Computes eigendecomposition of given matrix.
+      *
+      * \param[in]  matrix  Selfadjoint matrix whose eigendecomposition is to
+      *    be computed. Only the lower triangular part of the matrix is referenced.
+      * \param[in]  options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
+      * \returns    Reference to \c *this
+      *
+      * This function computes the eigenvalues of \p matrix.  The eigenvalues()
+      * function can be used to retrieve them.  If \p options equals #ComputeEigenvectors,
+      * then the eigenvectors are also computed and can be retrieved by
+      * calling eigenvectors().
+      *
+      * This implementation uses a symmetric QR algorithm. The matrix is first
+      * reduced to tridiagonal form using the Tridiagonalization class. The
+      * tridiagonal matrix is then brought to diagonal form with implicit
+      * symmetric QR steps with Wilkinson shift. Details can be found in
+      * Section 8.3 of Golub \& Van Loan, <i>%Matrix Computations</i>.
+      *
+      * The cost of the computation is about \f$ 9n^3 \f$ if the eigenvectors
+      * are required and \f$ 4n^3/3 \f$ if they are not required.
+      *
+      * This method reuses the memory in the SelfAdjointEigenSolver object that
+      * was allocated when the object was constructed, if the size of the
+      * matrix does not change.
+      *
+      * Example: \include SelfAdjointEigenSolver_compute_MatrixType.cpp
+      * Output: \verbinclude SelfAdjointEigenSolver_compute_MatrixType.out
+      *
+      * \sa SelfAdjointEigenSolver(const MatrixType&, int)
+      */
+    SelfAdjointEigenSolver& compute(const MatrixType& matrix, int options = ComputeEigenvectors);
+    
+    /** \brief Computes eigendecomposition of given matrix using a direct algorithm
+      *
+      * This is a variant of compute(const MatrixType&, int options) which
+      * directly solves the underlying polynomial equation.
+      * 
+      * Currently only 3x3 matrices for which the sizes are known at compile time are supported (e.g., Matrix3d).
+      * 
+      * This method is usually significantly faster than the QR algorithm
+      * but it might also be less accurate. It is also worth noting that
+      * for 3x3 matrices it involves trigonometric operations which are
+      * not necessarily available for all scalar types.
+      *
+      * \sa compute(const MatrixType&, int options)
+      */
+    SelfAdjointEigenSolver& computeDirect(const MatrixType& matrix, int options = ComputeEigenvectors);
+
+    /** \brief Returns the eigenvectors of given matrix.
+      *
+      * \returns  A const reference to the matrix whose columns are the eigenvectors.
+      *
+      * \pre The eigenvectors have been computed before.
+      *
+      * Column \f$ k \f$ of the returned matrix is an eigenvector corresponding
+      * to eigenvalue number \f$ k \f$ as returned by eigenvalues().  The
+      * eigenvectors are normalized to have (Euclidean) norm equal to one. If
+      * this object was used to solve the eigenproblem for the selfadjoint
+      * matrix \f$ A \f$, then the matrix returned by this function is the
+      * matrix \f$ V \f$ in the eigendecomposition \f$ A = V D V^{-1} \f$.
+      *
+      * Example: \include SelfAdjointEigenSolver_eigenvectors.cpp
+      * Output: \verbinclude SelfAdjointEigenSolver_eigenvectors.out
+      *
+      * \sa eigenvalues()
+      */
+    const MatrixType& eigenvectors() const
+    {
+      eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+      eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+      return m_eivec;
+    }
+
+    /** \brief Returns the eigenvalues of given matrix.
+      *
+      * \returns A const reference to the column vector containing the eigenvalues.
+      *
+      * \pre The eigenvalues have been computed before.
+      *
+      * The eigenvalues are repeated according to their algebraic multiplicity,
+      * so there are as many eigenvalues as rows in the matrix. The eigenvalues
+      * are sorted in increasing order.
+      *
+      * Example: \include SelfAdjointEigenSolver_eigenvalues.cpp
+      * Output: \verbinclude SelfAdjointEigenSolver_eigenvalues.out
+      *
+      * \sa eigenvectors(), MatrixBase::eigenvalues()
+      */
+    const RealVectorType& eigenvalues() const
+    {
+      eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+      return m_eivalues;
+    }
+
+    /** \brief Computes the positive-definite square root of the matrix.
+      *
+      * \returns the positive-definite square root of the matrix
+      *
+      * \pre The eigenvalues and eigenvectors of a positive-definite matrix
+      * have been computed before.
+      *
+      * The square root of a positive-definite matrix \f$ A \f$ is the
+      * positive-definite matrix whose square equals \f$ A \f$. This function
+      * uses the eigendecomposition \f$ A = V D V^{-1} \f$ to compute the
+      * square root as \f$ A^{1/2} = V D^{1/2} V^{-1} \f$.
+      *
+      * Example: \include SelfAdjointEigenSolver_operatorSqrt.cpp
+      * Output: \verbinclude SelfAdjointEigenSolver_operatorSqrt.out
+      *
+      * \sa operatorInverseSqrt(),
+      *     \ref MatrixFunctions_Module "MatrixFunctions Module"
+      */
+    MatrixType operatorSqrt() const
+    {
+      eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+      eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+      return m_eivec * m_eivalues.cwiseSqrt().asDiagonal() * m_eivec.adjoint();
+    }
+
+    /** \brief Computes the inverse square root of the matrix.
+      *
+      * \returns the inverse positive-definite square root of the matrix
+      *
+      * \pre The eigenvalues and eigenvectors of a positive-definite matrix
+      * have been computed before.
+      *
+      * This function uses the eigendecomposition \f$ A = V D V^{-1} \f$ to
+      * compute the inverse square root as \f$ V D^{-1/2} V^{-1} \f$. This is
+      * cheaper than first computing the square root with operatorSqrt() and
+      * then its inverse with MatrixBase::inverse().
+      *
+      * Example: \include SelfAdjointEigenSolver_operatorInverseSqrt.cpp
+      * Output: \verbinclude SelfAdjointEigenSolver_operatorInverseSqrt.out
+      *
+      * \sa operatorSqrt(), MatrixBase::inverse(),
+      *     \ref MatrixFunctions_Module "MatrixFunctions Module"
+      */
+    MatrixType operatorInverseSqrt() const
+    {
+      eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+      eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+      return m_eivec * m_eivalues.cwiseInverse().cwiseSqrt().asDiagonal() * m_eivec.adjoint();
+    }
+
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+      return m_info;
+    }
+
+    /** \brief Maximum number of iterations.
+      *
+      * The algorithm terminates if it does not converge within m_maxIterations * n iterations, where n
+      * denotes the size of the matrix. This value is currently set to 30 (copied from LAPACK).
+      */
+    static const int m_maxIterations = 30;
+
+    #ifdef EIGEN2_SUPPORT
+    SelfAdjointEigenSolver(const MatrixType& matrix, bool computeEigenvectors)
+      : m_eivec(matrix.rows(), matrix.cols()),
+        m_eivalues(matrix.cols()),
+        m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1),
+        m_isInitialized(false)
+    {
+      compute(matrix, computeEigenvectors);
+    }
+    
+    SelfAdjointEigenSolver(const MatrixType& matA, const MatrixType& matB, bool computeEigenvectors = true)
+        : m_eivec(matA.cols(), matA.cols()),
+          m_eivalues(matA.cols()),
+          m_subdiag(matA.cols() > 1 ? matA.cols() - 1 : 1),
+          m_isInitialized(false)
+    {
+      static_cast<GeneralizedSelfAdjointEigenSolver<MatrixType>*>(this)->compute(matA, matB, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
+    }
+    
+    void compute(const MatrixType& matrix, bool computeEigenvectors)
+    {
+      compute(matrix, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
+    }
+
+    void compute(const MatrixType& matA, const MatrixType& matB, bool computeEigenvectors = true)
+    {
+      compute(matA, matB, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
+    }
+    #endif // EIGEN2_SUPPORT
+
+  protected:
+    MatrixType m_eivec;
+    RealVectorType m_eivalues;
+    typename TridiagonalizationType::SubDiagonalType m_subdiag;
+    ComputationInfo m_info;
+    bool m_isInitialized;
+    bool m_eigenvectorsOk;
+};
+
+/** \internal
+  *
+  * \eigenvalues_module \ingroup Eigenvalues_Module
+  *
+  * Performs a QR step on a tridiagonal symmetric matrix represented as a
+  * pair of two vectors \a diag and \a subdiag.
+  *
+  * \param matA the input selfadjoint matrix
+  * \param hCoeffs returned Householder coefficients
+  *
+  * For compilation efficiency reasons, this procedure does not use eigen expression
+  * for its arguments.
+  *
+  * Implemented from Golub's "Matrix Computations", algorithm 8.3.2:
+  * "implicit symmetric QR step with Wilkinson shift"
+  */
+namespace internal {
+template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
+static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n);
+}
+
+template<typename MatrixType>
+SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
+::compute(const MatrixType& matrix, int options)
+{
+  using std::abs;
+  eigen_assert(matrix.cols() == matrix.rows());
+  eigen_assert((options&~(EigVecMask|GenEigMask))==0
+          && (options&EigVecMask)!=EigVecMask
+          && "invalid option parameter");
+  bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors;
+  Index n = matrix.cols();
+  m_eivalues.resize(n,1);
+
+  if(n==1)
+  {
+    m_eivalues.coeffRef(0,0) = numext::real(matrix.coeff(0,0));
+    if(computeEigenvectors)
+      m_eivec.setOnes(n,n);
+    m_info = Success;
+    m_isInitialized = true;
+    m_eigenvectorsOk = computeEigenvectors;
+    return *this;
+  }
+
+  // declare some aliases
+  RealVectorType& diag = m_eivalues;
+  MatrixType& mat = m_eivec;
+
+  // map the matrix coefficients to [-1:1] to avoid over- and underflow.
+  mat = matrix.template triangularView<Lower>();
+  RealScalar scale = mat.cwiseAbs().maxCoeff();
+  if(scale==RealScalar(0)) scale = RealScalar(1);
+  mat.template triangularView<Lower>() /= scale;
+  m_subdiag.resize(n-1);
+  internal::tridiagonalization_inplace(mat, diag, m_subdiag, computeEigenvectors);
+  
+  Index end = n-1;
+  Index start = 0;
+  Index iter = 0; // total number of iterations
+
+  while (end>0)
+  {
+    for (Index i = start; i<end; ++i)
+      if (internal::isMuchSmallerThan(abs(m_subdiag[i]),(abs(diag[i])+abs(diag[i+1]))))
+        m_subdiag[i] = 0;
+
+    // find the largest unreduced block
+    while (end>0 && m_subdiag[end-1]==0)
+    {
+      end--;
+    }
+    if (end<=0)
+      break;
+
+    // if we spent too many iterations, we give up
+    iter++;
+    if(iter > m_maxIterations * n) break;
+
+    start = end - 1;
+    while (start>0 && m_subdiag[start-1]!=0)
+      start--;
+
+    internal::tridiagonal_qr_step<MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor>(diag.data(), m_subdiag.data(), start, end, computeEigenvectors ? m_eivec.data() : (Scalar*)0, n);
+  }
+
+  if (iter <= m_maxIterations * n)
+    m_info = Success;
+  else
+    m_info = NoConvergence;
+
+  // Sort eigenvalues and corresponding vectors.
+  // TODO make the sort optional ?
+  // TODO use a better sort algorithm !!
+  if (m_info == Success)
+  {
+    for (Index i = 0; i < n-1; ++i)
+    {
+      Index k;
+      m_eivalues.segment(i,n-i).minCoeff(&k);
+      if (k > 0)
+      {
+        std::swap(m_eivalues[i], m_eivalues[k+i]);
+        if(computeEigenvectors)
+          m_eivec.col(i).swap(m_eivec.col(k+i));
+      }
+    }
+  }
+  
+  // scale back the eigen values
+  m_eivalues *= scale;
+
+  m_isInitialized = true;
+  m_eigenvectorsOk = computeEigenvectors;
+  return *this;
+}
+
+
+namespace internal {
+  
+template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues
+{
+  static inline void run(SolverType& eig, const typename SolverType::MatrixType& A, int options)
+  { eig.compute(A,options); }
+};
+
+template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3,false>
+{
+  typedef typename SolverType::MatrixType MatrixType;
+  typedef typename SolverType::RealVectorType VectorType;
+  typedef typename SolverType::Scalar Scalar;
+  
+  static inline void computeRoots(const MatrixType& m, VectorType& roots)
+  {
+    using std::sqrt;
+    using std::atan2;
+    using std::cos;
+    using std::sin;
+    const Scalar s_inv3 = Scalar(1.0)/Scalar(3.0);
+    const Scalar s_sqrt3 = sqrt(Scalar(3.0));
+
+    // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0.  The
+    // eigenvalues are the roots to this equation, all guaranteed to be
+    // real-valued, because the matrix is symmetric.
+    Scalar c0 = m(0,0)*m(1,1)*m(2,2) + Scalar(2)*m(1,0)*m(2,0)*m(2,1) - m(0,0)*m(2,1)*m(2,1) - m(1,1)*m(2,0)*m(2,0) - m(2,2)*m(1,0)*m(1,0);
+    Scalar c1 = m(0,0)*m(1,1) - m(1,0)*m(1,0) + m(0,0)*m(2,2) - m(2,0)*m(2,0) + m(1,1)*m(2,2) - m(2,1)*m(2,1);
+    Scalar c2 = m(0,0) + m(1,1) + m(2,2);
+
+    // Construct the parameters used in classifying the roots of the equation
+    // and in solving the equation for the roots in closed form.
+    Scalar c2_over_3 = c2*s_inv3;
+    Scalar a_over_3 = (c1 - c2*c2_over_3)*s_inv3;
+    if (a_over_3 > Scalar(0))
+      a_over_3 = Scalar(0);
+
+    Scalar half_b = Scalar(0.5)*(c0 + c2_over_3*(Scalar(2)*c2_over_3*c2_over_3 - c1));
+
+    Scalar q = half_b*half_b + a_over_3*a_over_3*a_over_3;
+    if (q > Scalar(0))
+      q = Scalar(0);
+
+    // Compute the eigenvalues by solving for the roots of the polynomial.
+    Scalar rho = sqrt(-a_over_3);
+    Scalar theta = atan2(sqrt(-q),half_b)*s_inv3;
+    Scalar cos_theta = cos(theta);
+    Scalar sin_theta = sin(theta);
+    roots(0) = c2_over_3 + Scalar(2)*rho*cos_theta;
+    roots(1) = c2_over_3 - rho*(cos_theta + s_sqrt3*sin_theta);
+    roots(2) = c2_over_3 - rho*(cos_theta - s_sqrt3*sin_theta);
+
+    // Sort in increasing order.
+    if (roots(0) >= roots(1))
+      std::swap(roots(0),roots(1));
+    if (roots(1) >= roots(2))
+    {
+      std::swap(roots(1),roots(2));
+      if (roots(0) >= roots(1))
+        std::swap(roots(0),roots(1));
+    }
+  }
+  
+  static inline void run(SolverType& solver, const MatrixType& mat, int options)
+  {
+    using std::sqrt;
+    eigen_assert(mat.cols() == 3 && mat.cols() == mat.rows());
+    eigen_assert((options&~(EigVecMask|GenEigMask))==0
+            && (options&EigVecMask)!=EigVecMask
+            && "invalid option parameter");
+    bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors;
+    
+    MatrixType& eivecs = solver.m_eivec;
+    VectorType& eivals = solver.m_eivalues;
+  
+    // map the matrix coefficients to [-1:1] to avoid over- and underflow.
+    Scalar scale = mat.cwiseAbs().maxCoeff();
+    MatrixType scaledMat = mat / scale;
+
+    // compute the eigenvalues
+    computeRoots(scaledMat,eivals);
+
+    // compute the eigen vectors
+    if(computeEigenvectors)
+    {
+      Scalar safeNorm2 = Eigen::NumTraits<Scalar>::epsilon();
+      if((eivals(2)-eivals(0))<=Eigen::NumTraits<Scalar>::epsilon())
+      {
+        eivecs.setIdentity();
+      }
+      else
+      {
+        scaledMat = scaledMat.template selfadjointView<Lower>();
+        MatrixType tmp;
+        tmp = scaledMat;
+
+        Scalar d0 = eivals(2) - eivals(1);
+        Scalar d1 = eivals(1) - eivals(0);
+        int k =  d0 > d1 ? 2 : 0;
+        d0 = d0 > d1 ? d0 : d1;
+
+        tmp.diagonal().array () -= eivals(k);
+        VectorType cross;
+        Scalar n;
+        n = (cross = tmp.row(0).cross(tmp.row(1))).squaredNorm();
+
+        if(n>safeNorm2)
+        {
+          eivecs.col(k) = cross / sqrt(n);
+        }
+        else
+        {
+          n = (cross = tmp.row(0).cross(tmp.row(2))).squaredNorm();
+
+          if(n>safeNorm2)
+          {
+            eivecs.col(k) = cross / sqrt(n);
+          }
+          else
+          {
+            n = (cross = tmp.row(1).cross(tmp.row(2))).squaredNorm();
+
+            if(n>safeNorm2)
+            {
+              eivecs.col(k) = cross / sqrt(n);
+            }
+            else
+            {
+              // the input matrix and/or the eigenvaues probably contains some inf/NaN,
+              // => exit
+              // scale back to the original size.
+              eivals *= scale;
+
+              solver.m_info = NumericalIssue;
+              solver.m_isInitialized = true;
+              solver.m_eigenvectorsOk = computeEigenvectors;
+              return;
+            }
+          }
+        }
+
+        tmp = scaledMat;
+        tmp.diagonal().array() -= eivals(1);
+
+        if(d0<=Eigen::NumTraits<Scalar>::epsilon())
+        {
+          eivecs.col(1) = eivecs.col(k).unitOrthogonal();
+        }
+        else
+        {
+          n = (cross = eivecs.col(k).cross(tmp.row(0))).squaredNorm();
+          if(n>safeNorm2)
+          {
+            eivecs.col(1) = cross / sqrt(n);
+          }
+          else
+          {
+            n = (cross = eivecs.col(k).cross(tmp.row(1))).squaredNorm();
+            if(n>safeNorm2)
+              eivecs.col(1) = cross / sqrt(n);
+            else
+            {
+              n = (cross = eivecs.col(k).cross(tmp.row(2))).squaredNorm();
+              if(n>safeNorm2)
+                eivecs.col(1) = cross / sqrt(n);
+              else
+              {
+                // we should never reach this point,
+                // if so the last two eigenvalues are likely to be very close to each other
+                eivecs.col(1) = eivecs.col(k).unitOrthogonal();
+              }
+            }
+          }
+
+          // make sure that eivecs[1] is orthogonal to eivecs[2]
+          // FIXME: this step should not be needed
+          Scalar d = eivecs.col(1).dot(eivecs.col(k));
+          eivecs.col(1) = (eivecs.col(1) - d * eivecs.col(k)).normalized();
+        }
+
+        eivecs.col(k==2 ? 0 : 2) = eivecs.col(k).cross(eivecs.col(1)).normalized();
+      }
+    }
+    // Rescale back to the original size.
+    eivals *= scale;
+    
+    solver.m_info = Success;
+    solver.m_isInitialized = true;
+    solver.m_eigenvectorsOk = computeEigenvectors;
+  }
+};
+
+// 2x2 direct eigenvalues decomposition, code from Hauke Heibel
+template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,2,false>
+{
+  typedef typename SolverType::MatrixType MatrixType;
+  typedef typename SolverType::RealVectorType VectorType;
+  typedef typename SolverType::Scalar Scalar;
+  
+  static inline void computeRoots(const MatrixType& m, VectorType& roots)
+  {
+    using std::sqrt;
+    const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*m(1,0)*m(1,0));
+    const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1));
+    roots(0) = t1 - t0;
+    roots(1) = t1 + t0;
+  }
+  
+  static inline void run(SolverType& solver, const MatrixType& mat, int options)
+  {
+    using std::sqrt;
+    eigen_assert(mat.cols() == 2 && mat.cols() == mat.rows());
+    eigen_assert((options&~(EigVecMask|GenEigMask))==0
+            && (options&EigVecMask)!=EigVecMask
+            && "invalid option parameter");
+    bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors;
+    
+    MatrixType& eivecs = solver.m_eivec;
+    VectorType& eivals = solver.m_eivalues;
+  
+    // map the matrix coefficients to [-1:1] to avoid over- and underflow.
+    Scalar scale = mat.cwiseAbs().maxCoeff();
+    scale = (std::max)(scale,Scalar(1));
+    MatrixType scaledMat = mat / scale;
+    
+    // Compute the eigenvalues
+    computeRoots(scaledMat,eivals);
+    
+    // compute the eigen vectors
+    if(computeEigenvectors)
+    {
+      scaledMat.diagonal().array () -= eivals(1);
+      Scalar a2 = numext::abs2(scaledMat(0,0));
+      Scalar c2 = numext::abs2(scaledMat(1,1));
+      Scalar b2 = numext::abs2(scaledMat(1,0));
+      if(a2>c2)
+      {
+        eivecs.col(1) << -scaledMat(1,0), scaledMat(0,0);
+        eivecs.col(1) /= sqrt(a2+b2);
+      }
+      else
+      {
+        eivecs.col(1) << -scaledMat(1,1), scaledMat(1,0);
+        eivecs.col(1) /= sqrt(c2+b2);
+      }
+
+      eivecs.col(0) << eivecs.col(1).unitOrthogonal();
+    }
+    
+    // Rescale back to the original size.
+    eivals *= scale;
+    
+    solver.m_info = Success;
+    solver.m_isInitialized = true;
+    solver.m_eigenvectorsOk = computeEigenvectors;
+  }
+};
+
+}
+
+template<typename MatrixType>
+SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
+::computeDirect(const MatrixType& matrix, int options)
+{
+  internal::direct_selfadjoint_eigenvalues<SelfAdjointEigenSolver,Size,NumTraits<Scalar>::IsComplex>::run(*this,matrix,options);
+  return *this;
+}
+
+namespace internal {
+template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
+static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n)
+{
+  using std::abs;
+  RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5);
+  RealScalar e = subdiag[end-1];
+  // Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still
+  // underflow thus leading to inf/NaN values when using the following commented code:
+//   RealScalar e2 = numext::abs2(subdiag[end-1]);
+//   RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2));
+  // This explain the following, somewhat more complicated, version:
+  RealScalar mu = diag[end];
+  if(td==0)
+    mu -= abs(e);
+  else
+  {
+    RealScalar e2 = numext::abs2(subdiag[end-1]);
+    RealScalar h = numext::hypot(td,e);
+    if(e2==0)  mu -= (e / (td + (td>0 ? 1 : -1))) * (e / h);
+    else       mu -= e2 / (td + (td>0 ? h : -h));
+  }
+  
+  RealScalar x = diag[start] - mu;
+  RealScalar z = subdiag[start];
+  for (Index k = start; k < end; ++k)
+  {
+    JacobiRotation<RealScalar> rot;
+    rot.makeGivens(x, z);
+
+    // do T = G' T G
+    RealScalar sdk = rot.s() * diag[k] + rot.c() * subdiag[k];
+    RealScalar dkp1 = rot.s() * subdiag[k] + rot.c() * diag[k+1];
+
+    diag[k] = rot.c() * (rot.c() * diag[k] - rot.s() * subdiag[k]) - rot.s() * (rot.c() * subdiag[k] - rot.s() * diag[k+1]);
+    diag[k+1] = rot.s() * sdk + rot.c() * dkp1;
+    subdiag[k] = rot.c() * sdk - rot.s() * dkp1;
+    
+
+    if (k > start)
+      subdiag[k - 1] = rot.c() * subdiag[k-1] - rot.s() * z;
+
+    x = subdiag[k];
+
+    if (k < end - 1)
+    {
+      z = -rot.s() * subdiag[k+1];
+      subdiag[k + 1] = rot.c() * subdiag[k+1];
+    }
+    
+    // apply the givens rotation to the unit matrix Q = Q * G
+    if (matrixQ)
+    {
+      // FIXME if StorageOrder == RowMajor this operation is not very efficient
+      Map<Matrix<Scalar,Dynamic,Dynamic,StorageOrder> > q(matrixQ,n,n);
+      q.applyOnTheRight(k,k+1,rot);
+    }
+  }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINTEIGENSOLVER_H
diff --git a/include/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/include/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
new file mode 100644
index 0000000..17c0dad
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
@@ -0,0 +1,92 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *    Self-adjoint eigenvalues/eigenvectors.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SAEIGENSOLVER_MKL_H
+#define EIGEN_SAEIGENSOLVER_MKL_H
+
+#include "Eigen/src/Core/util/MKL_support.h"
+
+namespace Eigen { 
+
+/** \internal Specialization for the data types supported by MKL */
+
+#define EIGEN_MKL_EIG_SELFADJ(EIGTYPE, MKLTYPE, MKLRTYPE, MKLNAME, EIGCOLROW, MKLCOLROW ) \
+template<> inline \
+SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
+SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW>& matrix, int options) \
+{ \
+  eigen_assert(matrix.cols() == matrix.rows()); \
+  eigen_assert((options&~(EigVecMask|GenEigMask))==0 \
+          && (options&EigVecMask)!=EigVecMask \
+          && "invalid option parameter"); \
+  bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors; \
+  lapack_int n = matrix.cols(), lda, matrix_order, info; \
+  m_eivalues.resize(n,1); \
+  m_subdiag.resize(n-1); \
+  m_eivec = matrix; \
+\
+  if(n==1) \
+  { \
+    m_eivalues.coeffRef(0,0) = numext::real(matrix.coeff(0,0)); \
+    if(computeEigenvectors) m_eivec.setOnes(n,n); \
+    m_info = Success; \
+    m_isInitialized = true; \
+    m_eigenvectorsOk = computeEigenvectors; \
+    return *this; \
+  } \
+\
+  lda = matrix.outerStride(); \
+  matrix_order=MKLCOLROW; \
+  char jobz, uplo='L'/*, range='A'*/; \
+  jobz = computeEigenvectors ? 'V' : 'N'; \
+\
+  info = LAPACKE_##MKLNAME( matrix_order, jobz, uplo, n, (MKLTYPE*)m_eivec.data(), lda, (MKLRTYPE*)m_eivalues.data() ); \
+  m_info = (info==0) ? Success : NoConvergence; \
+  m_isInitialized = true; \
+  m_eigenvectorsOk = computeEigenvectors; \
+  return *this; \
+}
+
+
+EIGEN_MKL_EIG_SELFADJ(double,   double,        double, dsyev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_MKL_EIG_SELFADJ(float,    float,         float,  ssyev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_MKL_EIG_SELFADJ(dcomplex, MKL_Complex16, double, zheev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_MKL_EIG_SELFADJ(scomplex, MKL_Complex8,  float,  cheev, ColMajor, LAPACK_COL_MAJOR)
+
+EIGEN_MKL_EIG_SELFADJ(double,   double,        double, dsyev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_MKL_EIG_SELFADJ(float,    float,         float,  ssyev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_MKL_EIG_SELFADJ(dcomplex, MKL_Complex16, double, zheev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_MKL_EIG_SELFADJ(scomplex, MKL_Complex8,  float,  cheev, RowMajor, LAPACK_ROW_MAJOR)
+
+} // end namespace Eigen
+
+#endif // EIGEN_SAEIGENSOLVER_H
diff --git a/include/eigen3/Eigen/src/Eigenvalues/Tridiagonalization.h b/include/eigen3/Eigen/src/Eigenvalues/Tridiagonalization.h
new file mode 100644
index 0000000..192278d
--- /dev/null
+++ b/include/eigen3/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -0,0 +1,557 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2010 Jitse Niesen <jitse at maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIDIAGONALIZATION_H
+#define EIGEN_TRIDIAGONALIZATION_H
+
+namespace Eigen { 
+
+namespace internal {
+  
+template<typename MatrixType> struct TridiagonalizationMatrixTReturnType;
+template<typename MatrixType>
+struct traits<TridiagonalizationMatrixTReturnType<MatrixType> >
+{
+  typedef typename MatrixType::PlainObject ReturnType;
+};
+
+template<typename MatrixType, typename CoeffVectorType>
+void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs);
+}
+
+/** \eigenvalues_module \ingroup Eigenvalues_Module
+  *
+  *
+  * \class Tridiagonalization
+  *
+  * \brief Tridiagonal decomposition of a selfadjoint matrix
+  *
+  * \tparam _MatrixType the type of the matrix of which we are computing the
+  * tridiagonal decomposition; this is expected to be an instantiation of the
+  * Matrix class template.
+  *
+  * This class performs a tridiagonal decomposition of a selfadjoint matrix \f$ A \f$ such that:
+  * \f$ A = Q T Q^* \f$ where \f$ Q \f$ is unitary and \f$ T \f$ a real symmetric tridiagonal matrix.
+  *
+  * A tridiagonal matrix is a matrix which has nonzero elements only on the
+  * main diagonal and the first diagonal below and above it. The Hessenberg
+  * decomposition of a selfadjoint matrix is in fact a tridiagonal
+  * decomposition. This class is used in SelfAdjointEigenSolver to compute the
+  * eigenvalues and eigenvectors of a selfadjoint matrix.
+  *
+  * Call the function compute() to compute the tridiagonal decomposition of a
+  * given matrix. Alternatively, you can use the Tridiagonalization(const MatrixType&)
+  * constructor which computes the tridiagonal Schur decomposition at
+  * construction time. Once the decomposition is computed, you can use the
+  * matrixQ() and matrixT() functions to retrieve the matrices Q and T in the
+  * decomposition.
+  *
+  * The documentation of Tridiagonalization(const MatrixType&) contains an
+  * example of the typical use of this class.
+  *
+  * \sa class HessenbergDecomposition, class SelfAdjointEigenSolver
+  */
+template<typename _MatrixType> class Tridiagonalization
+{
+  public:
+
+    /** \brief Synonym for the template parameter \p _MatrixType. */
+    typedef _MatrixType MatrixType;
+
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef typename MatrixType::Index Index;
+
+    enum {
+      Size = MatrixType::RowsAtCompileTime,
+      SizeMinusOne = Size == Dynamic ? Dynamic : (Size > 1 ? Size - 1 : 1),
+      Options = MatrixType::Options,
+      MaxSize = MatrixType::MaxRowsAtCompileTime,
+      MaxSizeMinusOne = MaxSize == Dynamic ? Dynamic : (MaxSize > 1 ? MaxSize - 1 : 1)
+    };
+
+    typedef Matrix<Scalar, SizeMinusOne, 1, Options & ~RowMajor, MaxSizeMinusOne, 1> CoeffVectorType;
+    typedef typename internal::plain_col_type<MatrixType, RealScalar>::type DiagonalType;
+    typedef Matrix<RealScalar, SizeMinusOne, 1, Options & ~RowMajor, MaxSizeMinusOne, 1> SubDiagonalType;
+    typedef typename internal::remove_all<typename MatrixType::RealReturnType>::type MatrixTypeRealView;
+    typedef internal::TridiagonalizationMatrixTReturnType<MatrixTypeRealView> MatrixTReturnType;
+
+    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+              typename internal::add_const_on_value_type<typename Diagonal<const MatrixType>::RealReturnType>::type,
+              const Diagonal<const MatrixType>
+            >::type DiagonalReturnType;
+
+    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+              typename internal::add_const_on_value_type<typename Diagonal<
+                Block<const MatrixType,SizeMinusOne,SizeMinusOne> >::RealReturnType>::type,
+              const Diagonal<
+                Block<const MatrixType,SizeMinusOne,SizeMinusOne> >
+            >::type SubDiagonalReturnType;
+
+    /** \brief Return type of matrixQ() */
+    typedef HouseholderSequence<MatrixType,typename internal::remove_all<typename CoeffVectorType::ConjugateReturnType>::type> HouseholderSequenceType;
+
+    /** \brief Default constructor.
+      *
+      * \param [in]  size  Positive integer, size of the matrix whose tridiagonal
+      * decomposition will be computed.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via compute().  The \p size parameter is only
+      * used as a hint. It is not an error to give a wrong \p size, but it may
+      * impair performance.
+      *
+      * \sa compute() for an example.
+      */
+    Tridiagonalization(Index size = Size==Dynamic ? 2 : Size)
+      : m_matrix(size,size),
+        m_hCoeffs(size > 1 ? size-1 : 1),
+        m_isInitialized(false)
+    {}
+
+    /** \brief Constructor; computes tridiagonal decomposition of given matrix.
+      *
+      * \param[in]  matrix  Selfadjoint matrix whose tridiagonal decomposition
+      * is to be computed.
+      *
+      * This constructor calls compute() to compute the tridiagonal decomposition.
+      *
+      * Example: \include Tridiagonalization_Tridiagonalization_MatrixType.cpp
+      * Output: \verbinclude Tridiagonalization_Tridiagonalization_MatrixType.out
+      */
+    Tridiagonalization(const MatrixType& matrix)
+      : m_matrix(matrix),
+        m_hCoeffs(matrix.cols() > 1 ? matrix.cols()-1 : 1),
+        m_isInitialized(false)
+    {
+      internal::tridiagonalization_inplace(m_matrix, m_hCoeffs);
+      m_isInitialized = true;
+    }
+
+    /** \brief Computes tridiagonal decomposition of given matrix.
+      *
+      * \param[in]  matrix  Selfadjoint matrix whose tridiagonal decomposition
+      * is to be computed.
+      * \returns    Reference to \c *this
+      *
+      * The tridiagonal decomposition is computed by bringing the columns of
+      * the matrix successively in the required form using Householder
+      * reflections. The cost is \f$ 4n^3/3 \f$ flops, where \f$ n \f$ denotes
+      * the size of the given matrix.
+      *
+      * This method reuses of the allocated data in the Tridiagonalization
+      * object, if the size of the matrix does not change.
+      *
+      * Example: \include Tridiagonalization_compute.cpp
+      * Output: \verbinclude Tridiagonalization_compute.out
+      */
+    Tridiagonalization& compute(const MatrixType& matrix)
+    {
+      m_matrix = matrix;
+      m_hCoeffs.resize(matrix.rows()-1, 1);
+      internal::tridiagonalization_inplace(m_matrix, m_hCoeffs);
+      m_isInitialized = true;
+      return *this;
+    }
+
+    /** \brief Returns the Householder coefficients.
+      *
+      * \returns a const reference to the vector of Householder coefficients
+      *
+      * \pre Either the constructor Tridiagonalization(const MatrixType&) or
+      * the member function compute(const MatrixType&) has been called before
+      * to compute the tridiagonal decomposition of a matrix.
+      *
+      * The Householder coefficients allow the reconstruction of the matrix
+      * \f$ Q \f$ in the tridiagonal decomposition from the packed data.
+      *
+      * Example: \include Tridiagonalization_householderCoefficients.cpp
+      * Output: \verbinclude Tridiagonalization_householderCoefficients.out
+      *
+      * \sa packedMatrix(), \ref Householder_Module "Householder module"
+      */
+    inline CoeffVectorType householderCoefficients() const
+    {
+      eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
+      return m_hCoeffs;
+    }
+
+    /** \brief Returns the internal representation of the decomposition
+      *
+      *	\returns a const reference to a matrix with the internal representation
+      *	         of the decomposition.
+      *
+      * \pre Either the constructor Tridiagonalization(const MatrixType&) or
+      * the member function compute(const MatrixType&) has been called before
+      * to compute the tridiagonal decomposition of a matrix.
+      *
+      * The returned matrix contains the following information:
+      *  - the strict upper triangular part is equal to the input matrix A.
+      *  - the diagonal and lower sub-diagonal represent the real tridiagonal
+      *    symmetric matrix T.
+      *  - the rest of the lower part contains the Householder vectors that,
+      *    combined with Householder coefficients returned by
+      *    householderCoefficients(), allows to reconstruct the matrix Q as
+      *       \f$ Q = H_{N-1} \ldots H_1 H_0 \f$.
+      *    Here, the matrices \f$ H_i \f$ are the Householder transformations
+      *       \f$ H_i = (I - h_i v_i v_i^T) \f$
+      *    where \f$ h_i \f$ is the \f$ i \f$th Householder coefficient and
+      *    \f$ v_i \f$ is the Householder vector defined by
+      *       \f$ v_i = [ 0, \ldots, 0, 1, M(i+2,i), \ldots, M(N-1,i) ]^T \f$
+      *    with M the matrix returned by this function.
+      *
+      * See LAPACK for further details on this packed storage.
+      *
+      * Example: \include Tridiagonalization_packedMatrix.cpp
+      * Output: \verbinclude Tridiagonalization_packedMatrix.out
+      *
+      * \sa householderCoefficients()
+      */
+    inline const MatrixType& packedMatrix() const
+    {
+      eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
+      return m_matrix;
+    }
+
+    /** \brief Returns the unitary matrix Q in the decomposition
+      *
+      * \returns object representing the matrix Q
+      *
+      * \pre Either the constructor Tridiagonalization(const MatrixType&) or
+      * the member function compute(const MatrixType&) has been called before
+      * to compute the tridiagonal decomposition of a matrix.
+      *
+      * This function returns a light-weight object of template class
+      * HouseholderSequence. You can either apply it directly to a matrix or
+      * you can convert it to a matrix of type #MatrixType.
+      *
+      * \sa Tridiagonalization(const MatrixType&) for an example,
+      *     matrixT(), class HouseholderSequence
+      */
+    HouseholderSequenceType matrixQ() const
+    {
+      eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
+      return HouseholderSequenceType(m_matrix, m_hCoeffs.conjugate())
+             .setLength(m_matrix.rows() - 1)
+             .setShift(1);
+    }
+
+    /** \brief Returns an expression of the tridiagonal matrix T in the decomposition
+      *
+      * \returns expression object representing the matrix T
+      *
+      * \pre Either the constructor Tridiagonalization(const MatrixType&) or
+      * the member function compute(const MatrixType&) has been called before
+      * to compute the tridiagonal decomposition of a matrix.
+      *
+      * Currently, this function can be used to extract the matrix T from internal
+      * data and copy it to a dense matrix object. In most cases, it may be
+      * sufficient to directly use the packed matrix or the vector expressions
+      * returned by diagonal() and subDiagonal() instead of creating a new
+      * dense copy matrix with this function.
+      *
+      * \sa Tridiagonalization(const MatrixType&) for an example,
+      * matrixQ(), packedMatrix(), diagonal(), subDiagonal()
+      */
+    MatrixTReturnType matrixT() const
+    {
+      eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
+      return MatrixTReturnType(m_matrix.real());
+    }
+
+    /** \brief Returns the diagonal of the tridiagonal matrix T in the decomposition.
+      *
+      * \returns expression representing the diagonal of T
+      *
+      * \pre Either the constructor Tridiagonalization(const MatrixType&) or
+      * the member function compute(const MatrixType&) has been called before
+      * to compute the tridiagonal decomposition of a matrix.
+      *
+      * Example: \include Tridiagonalization_diagonal.cpp
+      * Output: \verbinclude Tridiagonalization_diagonal.out
+      *
+      * \sa matrixT(), subDiagonal()
+      */
+    DiagonalReturnType diagonal() const;
+
+    /** \brief Returns the subdiagonal of the tridiagonal matrix T in the decomposition.
+      *
+      * \returns expression representing the subdiagonal of T
+      *
+      * \pre Either the constructor Tridiagonalization(const MatrixType&) or
+      * the member function compute(const MatrixType&) has been called before
+      * to compute the tridiagonal decomposition of a matrix.
+      *
+      * \sa diagonal() for an example, matrixT()
+      */
+    SubDiagonalReturnType subDiagonal() const;
+
+  protected:
+
+    MatrixType m_matrix;
+    CoeffVectorType m_hCoeffs;
+    bool m_isInitialized;
+};
+
+template<typename MatrixType>
+typename Tridiagonalization<MatrixType>::DiagonalReturnType
+Tridiagonalization<MatrixType>::diagonal() const
+{
+  eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
+  return m_matrix.diagonal();
+}
+
+template<typename MatrixType>
+typename Tridiagonalization<MatrixType>::SubDiagonalReturnType
+Tridiagonalization<MatrixType>::subDiagonal() const
+{
+  eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
+  Index n = m_matrix.rows();
+  return Block<const MatrixType,SizeMinusOne,SizeMinusOne>(m_matrix, 1, 0, n-1,n-1).diagonal();
+}
+
+namespace internal {
+
+/** \internal
+  * Performs a tridiagonal decomposition of the selfadjoint matrix \a matA in-place.
+  *
+  * \param[in,out] matA On input the selfadjoint matrix. Only the \b lower triangular part is referenced.
+  *                     On output, the strict upper part is left unchanged, and the lower triangular part
+  *                     represents the T and Q matrices in packed format has detailed below.
+  * \param[out]    hCoeffs returned Householder coefficients (see below)
+  *
+  * On output, the tridiagonal selfadjoint matrix T is stored in the diagonal
+  * and lower sub-diagonal of the matrix \a matA.
+  * The unitary matrix Q is represented in a compact way as a product of
+  * Householder reflectors \f$ H_i \f$ such that:
+  *       \f$ Q = H_{N-1} \ldots H_1 H_0 \f$.
+  * The Householder reflectors are defined as
+  *       \f$ H_i = (I - h_i v_i v_i^T) \f$
+  * where \f$ h_i = hCoeffs[i]\f$ is the \f$ i \f$th Householder coefficient and
+  * \f$ v_i \f$ is the Householder vector defined by
+  *       \f$ v_i = [ 0, \ldots, 0, 1, matA(i+2,i), \ldots, matA(N-1,i) ]^T \f$.
+  *
+  * Implemented from Golub's "Matrix Computations", algorithm 8.3.1.
+  *
+  * \sa Tridiagonalization::packedMatrix()
+  */
+template<typename MatrixType, typename CoeffVectorType>
+void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
+{
+  using numext::conj;
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  Index n = matA.rows();
+  eigen_assert(n==matA.cols());
+  eigen_assert(n==hCoeffs.size()+1 || n==1);
+  
+  for (Index i = 0; i<n-1; ++i)
+  {
+    Index remainingSize = n-i-1;
+    RealScalar beta;
+    Scalar h;
+    matA.col(i).tail(remainingSize).makeHouseholderInPlace(h, beta);
+
+    // Apply similarity transformation to remaining columns,
+    // i.e., A = H A H' where H = I - h v v' and v = matA.col(i).tail(n-i-1)
+    matA.col(i).coeffRef(i+1) = 1;
+
+    hCoeffs.tail(n-i-1).noalias() = (matA.bottomRightCorner(remainingSize,remainingSize).template selfadjointView<Lower>()
+                                  * (conj(h) * matA.col(i).tail(remainingSize)));
+
+    hCoeffs.tail(n-i-1) += (conj(h)*Scalar(-0.5)*(hCoeffs.tail(remainingSize).dot(matA.col(i).tail(remainingSize)))) * matA.col(i).tail(n-i-1);
+
+    matA.bottomRightCorner(remainingSize, remainingSize).template selfadjointView<Lower>()
+      .rankUpdate(matA.col(i).tail(remainingSize), hCoeffs.tail(remainingSize), -1);
+
+    matA.col(i).coeffRef(i+1) = beta;
+    hCoeffs.coeffRef(i) = h;
+  }
+}
+
+// forward declaration, implementation at the end of this file
+template<typename MatrixType,
+         int Size=MatrixType::ColsAtCompileTime,
+         bool IsComplex=NumTraits<typename MatrixType::Scalar>::IsComplex>
+struct tridiagonalization_inplace_selector;
+
+/** \brief Performs a full tridiagonalization in place
+  *
+  * \param[in,out]  mat  On input, the selfadjoint matrix whose tridiagonal
+  *    decomposition is to be computed. Only the lower triangular part referenced.
+  *    The rest is left unchanged. On output, the orthogonal matrix Q
+  *    in the decomposition if \p extractQ is true.
+  * \param[out]  diag  The diagonal of the tridiagonal matrix T in the
+  *    decomposition.
+  * \param[out]  subdiag  The subdiagonal of the tridiagonal matrix T in
+  *    the decomposition.
+  * \param[in]  extractQ  If true, the orthogonal matrix Q in the
+  *    decomposition is computed and stored in \p mat.
+  *
+  * Computes the tridiagonal decomposition of the selfadjoint matrix \p mat in place
+  * such that \f$ mat = Q T Q^* \f$ where \f$ Q \f$ is unitary and \f$ T \f$ a real
+  * symmetric tridiagonal matrix.
+  *
+  * The tridiagonal matrix T is passed to the output parameters \p diag and \p subdiag. If
+  * \p extractQ is true, then the orthogonal matrix Q is passed to \p mat. Otherwise the lower
+  * part of the matrix \p mat is destroyed.
+  *
+  * The vectors \p diag and \p subdiag are not resized. The function
+  * assumes that they are already of the correct size. The length of the
+  * vector \p diag should equal the number of rows in \p mat, and the
+  * length of the vector \p subdiag should be one left.
+  *
+  * This implementation contains an optimized path for 3-by-3 matrices
+  * which is especially useful for plane fitting.
+  *
+  * \note Currently, it requires two temporary vectors to hold the intermediate
+  * Householder coefficients, and to reconstruct the matrix Q from the Householder
+  * reflectors.
+  *
+  * Example (this uses the same matrix as the example in
+  *    Tridiagonalization::Tridiagonalization(const MatrixType&)):
+  *    \include Tridiagonalization_decomposeInPlace.cpp
+  * Output: \verbinclude Tridiagonalization_decomposeInPlace.out
+  *
+  * \sa class Tridiagonalization
+  */
+template<typename MatrixType, typename DiagonalType, typename SubDiagonalType>
+void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
+{
+  eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1);
+  tridiagonalization_inplace_selector<MatrixType>::run(mat, diag, subdiag, extractQ);
+}
+
+/** \internal
+  * General full tridiagonalization
+  */
+template<typename MatrixType, int Size, bool IsComplex>
+struct tridiagonalization_inplace_selector
+{
+  typedef typename Tridiagonalization<MatrixType>::CoeffVectorType CoeffVectorType;
+  typedef typename Tridiagonalization<MatrixType>::HouseholderSequenceType HouseholderSequenceType;
+  typedef typename MatrixType::Index Index;
+  template<typename DiagonalType, typename SubDiagonalType>
+  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
+  {
+    CoeffVectorType hCoeffs(mat.cols()-1);
+    tridiagonalization_inplace(mat,hCoeffs);
+    diag = mat.diagonal().real();
+    subdiag = mat.template diagonal<-1>().real();
+    if(extractQ)
+      mat = HouseholderSequenceType(mat, hCoeffs.conjugate())
+            .setLength(mat.rows() - 1)
+            .setShift(1);
+  }
+};
+
+/** \internal
+  * Specialization for 3x3 real matrices.
+  * Especially useful for plane fitting.
+  */
+template<typename MatrixType>
+struct tridiagonalization_inplace_selector<MatrixType,3,false>
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+
+  template<typename DiagonalType, typename SubDiagonalType>
+  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
+  {
+    using std::sqrt;
+    diag[0] = mat(0,0);
+    RealScalar v1norm2 = numext::abs2(mat(2,0));
+    if(v1norm2 == RealScalar(0))
+    {
+      diag[1] = mat(1,1);
+      diag[2] = mat(2,2);
+      subdiag[0] = mat(1,0);
+      subdiag[1] = mat(2,1);
+      if (extractQ)
+        mat.setIdentity();
+    }
+    else
+    {
+      RealScalar beta = sqrt(numext::abs2(mat(1,0)) + v1norm2);
+      RealScalar invBeta = RealScalar(1)/beta;
+      Scalar m01 = mat(1,0) * invBeta;
+      Scalar m02 = mat(2,0) * invBeta;
+      Scalar q = RealScalar(2)*m01*mat(2,1) + m02*(mat(2,2) - mat(1,1));
+      diag[1] = mat(1,1) + m02*q;
+      diag[2] = mat(2,2) - m02*q;
+      subdiag[0] = beta;
+      subdiag[1] = mat(2,1) - m01 * q;
+      if (extractQ)
+      {
+        mat << 1,   0,    0,
+               0, m01,  m02,
+               0, m02, -m01;
+      }
+    }
+  }
+};
+
+/** \internal
+  * Trivial specialization for 1x1 matrices
+  */
+template<typename MatrixType, bool IsComplex>
+struct tridiagonalization_inplace_selector<MatrixType,1,IsComplex>
+{
+  typedef typename MatrixType::Scalar Scalar;
+
+  template<typename DiagonalType, typename SubDiagonalType>
+  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ)
+  {
+    diag(0,0) = numext::real(mat(0,0));
+    if(extractQ)
+      mat(0,0) = Scalar(1);
+  }
+};
+
+/** \internal
+  * \eigenvalues_module \ingroup Eigenvalues_Module
+  *
+  * \brief Expression type for return value of Tridiagonalization::matrixT()
+  *
+  * \tparam MatrixType type of underlying dense matrix
+  */
+template<typename MatrixType> struct TridiagonalizationMatrixTReturnType
+: public ReturnByValue<TridiagonalizationMatrixTReturnType<MatrixType> >
+{
+    typedef typename MatrixType::Index Index;
+  public:
+    /** \brief Constructor.
+      *
+      * \param[in] mat The underlying dense matrix
+      */
+    TridiagonalizationMatrixTReturnType(const MatrixType& mat) : m_matrix(mat) { }
+
+    template <typename ResultType>
+    inline void evalTo(ResultType& result) const
+    {
+      result.setZero();
+      result.template diagonal<1>() = m_matrix.template diagonal<-1>().conjugate();
+      result.diagonal() = m_matrix.diagonal();
+      result.template diagonal<-1>() = m_matrix.template diagonal<-1>();
+    }
+
+    Index rows() const { return m_matrix.rows(); }
+    Index cols() const { return m_matrix.cols(); }
+
+  protected:
+    typename MatrixType::Nested m_matrix;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIDIAGONALIZATION_H
diff --git a/include/eigen3/Eigen/src/Geometry/AlignedBox.h b/include/eigen3/Eigen/src/Geometry/AlignedBox.h
new file mode 100644
index 0000000..8e186d5
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/AlignedBox.h
@@ -0,0 +1,375 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ALIGNEDBOX_H
+#define EIGEN_ALIGNEDBOX_H
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  *
+  * \class AlignedBox
+  *
+  * \brief An axis aligned box
+  *
+  * \param _Scalar the type of the scalar coefficients
+  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
+  *
+  * This class represents an axis aligned box as a pair of the minimal and maximal corners.
+  */
+template <typename _Scalar, int _AmbientDim>
+class AlignedBox
+{
+public:
+EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
+  enum { AmbientDimAtCompileTime = _AmbientDim };
+  typedef _Scalar                                   Scalar;
+  typedef NumTraits<Scalar>                         ScalarTraits;
+  typedef DenseIndex                                Index;
+  typedef typename ScalarTraits::Real               RealScalar;
+  typedef typename ScalarTraits::NonInteger      NonInteger;
+  typedef Matrix<Scalar,AmbientDimAtCompileTime,1>  VectorType;
+
+  /** Define constants to name the corners of a 1D, 2D or 3D axis aligned bounding box */
+  enum CornerType
+  {
+    /** 1D names */
+    Min=0, Max=1,
+
+    /** Added names for 2D */
+    BottomLeft=0, BottomRight=1,
+    TopLeft=2, TopRight=3,
+
+    /** Added names for 3D */
+    BottomLeftFloor=0, BottomRightFloor=1,
+    TopLeftFloor=2, TopRightFloor=3,
+    BottomLeftCeil=4, BottomRightCeil=5,
+    TopLeftCeil=6, TopRightCeil=7
+  };
+
+
+  /** Default constructor initializing a null box. */
+  inline AlignedBox()
+  { if (AmbientDimAtCompileTime!=Dynamic) setEmpty(); }
+
+  /** Constructs a null box with \a _dim the dimension of the ambient space. */
+  inline explicit AlignedBox(Index _dim) : m_min(_dim), m_max(_dim)
+  { setEmpty(); }
+
+  /** Constructs a box with extremities \a _min and \a _max. */
+  template<typename OtherVectorType1, typename OtherVectorType2>
+  inline AlignedBox(const OtherVectorType1& _min, const OtherVectorType2& _max) : m_min(_min), m_max(_max) {}
+
+  /** Constructs a box containing a single point \a p. */
+  template<typename Derived>
+  inline explicit AlignedBox(const MatrixBase<Derived>& a_p)
+  {
+    typename internal::nested<Derived,2>::type p(a_p.derived());
+    m_min = p;
+    m_max = p;
+  }
+
+  ~AlignedBox() {}
+
+  /** \returns the dimension in which the box holds */
+  inline Index dim() const { return AmbientDimAtCompileTime==Dynamic ? m_min.size() : Index(AmbientDimAtCompileTime); }
+
+  /** \deprecated use isEmpty */
+  inline bool isNull() const { return isEmpty(); }
+
+  /** \deprecated use setEmpty */
+  inline void setNull() { setEmpty(); }
+
+  /** \returns true if the box is empty. */
+  inline bool isEmpty() const { return (m_min.array() > m_max.array()).any(); }
+
+  /** Makes \c *this an empty box. */
+  inline void setEmpty()
+  {
+    m_min.setConstant( ScalarTraits::highest() );
+    m_max.setConstant( ScalarTraits::lowest() );
+  }
+
+  /** \returns the minimal corner */
+  inline const VectorType& (min)() const { return m_min; }
+  /** \returns a non const reference to the minimal corner */
+  inline VectorType& (min)() { return m_min; }
+  /** \returns the maximal corner */
+  inline const VectorType& (max)() const { return m_max; }
+  /** \returns a non const reference to the maximal corner */
+  inline VectorType& (max)() { return m_max; }
+
+  /** \returns the center of the box */
+  inline const CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>,
+                            const CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const VectorType, const VectorType> >
+  center() const
+  { return (m_min+m_max)/2; }
+
+  /** \returns the lengths of the sides of the bounding box.
+    * Note that this function does not get the same
+    * result for integral or floating scalar types: see
+    */
+  inline const CwiseBinaryOp< internal::scalar_difference_op<Scalar>, const VectorType, const VectorType> sizes() const
+  { return m_max - m_min; }
+
+  /** \returns the volume of the bounding box */
+  inline Scalar volume() const
+  { return sizes().prod(); }
+
+  /** \returns an expression for the bounding box diagonal vector
+    * if the length of the diagonal is needed: diagonal().norm()
+    * will provide it.
+    */
+  inline CwiseBinaryOp< internal::scalar_difference_op<Scalar>, const VectorType, const VectorType> diagonal() const
+  { return sizes(); }
+
+  /** \returns the vertex of the bounding box at the corner defined by
+    * the corner-id corner. It works only for a 1D, 2D or 3D bounding box.
+    * For 1D bounding boxes corners are named by 2 enum constants:
+    * BottomLeft and BottomRight.
+    * For 2D bounding boxes, corners are named by 4 enum constants:
+    * BottomLeft, BottomRight, TopLeft, TopRight.
+    * For 3D bounding boxes, the following names are added:
+    * BottomLeftCeil, BottomRightCeil, TopLeftCeil, TopRightCeil.
+    */
+  inline VectorType corner(CornerType corner) const
+  {
+    EIGEN_STATIC_ASSERT(_AmbientDim <= 3, THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE);
+
+    VectorType res;
+
+    Index mult = 1;
+    for(Index d=0; d<dim(); ++d)
+    {
+      if( mult & corner ) res[d] = m_max[d];
+      else                res[d] = m_min[d];
+      mult *= 2;
+    }
+    return res;
+  }
+
+  /** \returns a random point inside the bounding box sampled with
+   * a uniform distribution */
+  inline VectorType sample() const
+  {
+    VectorType r;
+    for(Index d=0; d<dim(); ++d)
+    {
+      if(!ScalarTraits::IsInteger)
+      {
+        r[d] = m_min[d] + (m_max[d]-m_min[d])
+             * internal::random<Scalar>(Scalar(0), Scalar(1));
+      }
+      else
+        r[d] = internal::random(m_min[d], m_max[d]);
+    }
+    return r;
+  }
+
+  /** \returns true if the point \a p is inside the box \c *this. */
+  template<typename Derived>
+  inline bool contains(const MatrixBase<Derived>& a_p) const
+  {
+    typename internal::nested<Derived,2>::type p(a_p.derived());
+    return (m_min.array()<=p.array()).all() && (p.array()<=m_max.array()).all();
+  }
+
+  /** \returns true if the box \a b is entirely inside the box \c *this. */
+  inline bool contains(const AlignedBox& b) const
+  { return (m_min.array()<=(b.min)().array()).all() && ((b.max)().array()<=m_max.array()).all(); }
+
+  /** Extends \c *this such that it contains the point \a p and returns a reference to \c *this. */
+  template<typename Derived>
+  inline AlignedBox& extend(const MatrixBase<Derived>& a_p)
+  {
+    typename internal::nested<Derived,2>::type p(a_p.derived());
+    m_min = m_min.cwiseMin(p);
+    m_max = m_max.cwiseMax(p);
+    return *this;
+  }
+
+  /** Extends \c *this such that it contains the box \a b and returns a reference to \c *this. */
+  inline AlignedBox& extend(const AlignedBox& b)
+  {
+    m_min = m_min.cwiseMin(b.m_min);
+    m_max = m_max.cwiseMax(b.m_max);
+    return *this;
+  }
+
+  /** Clamps \c *this by the box \a b and returns a reference to \c *this. */
+  inline AlignedBox& clamp(const AlignedBox& b)
+  {
+    m_min = m_min.cwiseMax(b.m_min);
+    m_max = m_max.cwiseMin(b.m_max);
+    return *this;
+  }
+
+  /** Returns an AlignedBox that is the intersection of \a b and \c *this */
+  inline AlignedBox intersection(const AlignedBox& b) const
+  {return AlignedBox(m_min.cwiseMax(b.m_min), m_max.cwiseMin(b.m_max)); }
+
+  /** Returns an AlignedBox that is the union of \a b and \c *this */
+  inline AlignedBox merged(const AlignedBox& b) const
+  { return AlignedBox(m_min.cwiseMin(b.m_min), m_max.cwiseMax(b.m_max)); }
+
+  /** Translate \c *this by the vector \a t and returns a reference to \c *this. */
+  template<typename Derived>
+  inline AlignedBox& translate(const MatrixBase<Derived>& a_t)
+  {
+    const typename internal::nested<Derived,2>::type t(a_t.derived());
+    m_min += t;
+    m_max += t;
+    return *this;
+  }
+
+  /** \returns the squared distance between the point \a p and the box \c *this,
+    * and zero if \a p is inside the box.
+    * \sa exteriorDistance()
+    */
+  template<typename Derived>
+  inline Scalar squaredExteriorDistance(const MatrixBase<Derived>& a_p) const;
+
+  /** \returns the squared distance between the boxes \a b and \c *this,
+    * and zero if the boxes intersect.
+    * \sa exteriorDistance()
+    */
+  inline Scalar squaredExteriorDistance(const AlignedBox& b) const;
+
+  /** \returns the distance between the point \a p and the box \c *this,
+    * and zero if \a p is inside the box.
+    * \sa squaredExteriorDistance()
+    */
+  template<typename Derived>
+  inline NonInteger exteriorDistance(const MatrixBase<Derived>& p) const
+  { using std::sqrt; return sqrt(NonInteger(squaredExteriorDistance(p))); }
+
+  /** \returns the distance between the boxes \a b and \c *this,
+    * and zero if the boxes intersect.
+    * \sa squaredExteriorDistance()
+    */
+  inline NonInteger exteriorDistance(const AlignedBox& b) const
+  { using std::sqrt; return sqrt(NonInteger(squaredExteriorDistance(b))); }
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<AlignedBox,
+           AlignedBox<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
+  {
+    return typename internal::cast_return_type<AlignedBox,
+                    AlignedBox<NewScalarType,AmbientDimAtCompileTime> >::type(*this);
+  }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit AlignedBox(const AlignedBox<OtherScalarType,AmbientDimAtCompileTime>& other)
+  {
+    m_min = (other.min)().template cast<Scalar>();
+    m_max = (other.max)().template cast<Scalar>();
+  }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const AlignedBox& other, const RealScalar& prec = ScalarTraits::dummy_precision()) const
+  { return m_min.isApprox(other.m_min, prec) && m_max.isApprox(other.m_max, prec); }
+
+protected:
+
+  VectorType m_min, m_max;
+};
+
+
+
+template<typename Scalar,int AmbientDim>
+template<typename Derived>
+inline Scalar AlignedBox<Scalar,AmbientDim>::squaredExteriorDistance(const MatrixBase<Derived>& a_p) const
+{
+  typename internal::nested<Derived,2*AmbientDim>::type p(a_p.derived());
+  Scalar dist2(0);
+  Scalar aux;
+  for (Index k=0; k<dim(); ++k)
+  {
+    if( m_min[k] > p[k] )
+    {
+      aux = m_min[k] - p[k];
+      dist2 += aux*aux;
+    }
+    else if( p[k] > m_max[k] )
+    {
+      aux = p[k] - m_max[k];
+      dist2 += aux*aux;
+    }
+  }
+  return dist2;
+}
+
+template<typename Scalar,int AmbientDim>
+inline Scalar AlignedBox<Scalar,AmbientDim>::squaredExteriorDistance(const AlignedBox& b) const
+{
+  Scalar dist2(0);
+  Scalar aux;
+  for (Index k=0; k<dim(); ++k)
+  {
+    if( m_min[k] > b.m_max[k] )
+    {
+      aux = m_min[k] - b.m_max[k];
+      dist2 += aux*aux;
+    }
+    else if( b.m_min[k] > m_max[k] )
+    {
+      aux = b.m_min[k] - m_max[k];
+      dist2 += aux*aux;
+    }
+  }
+  return dist2;
+}
+
+/** \defgroup alignedboxtypedefs Global aligned box typedefs
+  *
+  * \ingroup Geometry_Module
+  *
+  * Eigen defines several typedef shortcuts for most common aligned box types.
+  *
+  * The general patterns are the following:
+  *
+  * \c AlignedBoxSizeType where \c Size can be \c 1, \c 2,\c 3,\c 4 for fixed size boxes or \c X for dynamic size,
+  * and where \c Type can be \c i for integer, \c f for float, \c d for double.
+  *
+  * For example, \c AlignedBox3d is a fixed-size 3x3 aligned box type of doubles, and \c AlignedBoxXf is a dynamic-size aligned box of floats.
+  *
+  * \sa class AlignedBox
+  */
+
+#define EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix)    \
+/** \ingroup alignedboxtypedefs */                                 \
+typedef AlignedBox<Type, Size>   AlignedBox##SizeSuffix##TypeSuffix;
+
+#define EIGEN_MAKE_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \
+EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 1, 1) \
+EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 2, 2) \
+EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 3, 3) \
+EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 4, 4) \
+EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Dynamic, X)
+
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(int,                  i)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(float,                f)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(double,               d)
+
+#undef EIGEN_MAKE_TYPEDEFS_ALL_SIZES
+#undef EIGEN_MAKE_TYPEDEFS
+
+} // end namespace Eigen
+
+#endif // EIGEN_ALIGNEDBOX_H
diff --git a/include/eigen3/Eigen/src/Geometry/AngleAxis.h b/include/eigen3/Eigen/src/Geometry/AngleAxis.h
new file mode 100644
index 0000000..553d38c
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/AngleAxis.h
@@ -0,0 +1,233 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ANGLEAXIS_H
+#define EIGEN_ANGLEAXIS_H
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class AngleAxis
+  *
+  * \brief Represents a 3D rotation as a rotation angle around an arbitrary 3D axis
+  *
+  * \param _Scalar the scalar type, i.e., the type of the coefficients.
+  *
+  * \warning When setting up an AngleAxis object, the axis vector \b must \b be \b normalized.
+  *
+  * The following two typedefs are provided for convenience:
+  * \li \c AngleAxisf for \c float
+  * \li \c AngleAxisd for \c double
+  *
+  * Combined with MatrixBase::Unit{X,Y,Z}, AngleAxis can be used to easily
+  * mimic Euler-angles. Here is an example:
+  * \include AngleAxis_mimic_euler.cpp
+  * Output: \verbinclude AngleAxis_mimic_euler.out
+  *
+  * \note This class is not aimed to be used to store a rotation transformation,
+  * but rather to make easier the creation of other rotation (Quaternion, rotation Matrix)
+  * and transformation objects.
+  *
+  * \sa class Quaternion, class Transform, MatrixBase::UnitX()
+  */
+
+namespace internal {
+template<typename _Scalar> struct traits<AngleAxis<_Scalar> >
+{
+  typedef _Scalar Scalar;
+};
+}
+
+template<typename _Scalar>
+class AngleAxis : public RotationBase<AngleAxis<_Scalar>,3>
+{
+  typedef RotationBase<AngleAxis<_Scalar>,3> Base;
+
+public:
+
+  using Base::operator*;
+
+  enum { Dim = 3 };
+  /** the scalar type of the coefficients */
+  typedef _Scalar Scalar;
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Quaternion<Scalar> QuaternionType;
+
+protected:
+
+  Vector3 m_axis;
+  Scalar m_angle;
+
+public:
+
+  /** Default constructor without initialization. */
+  AngleAxis() {}
+  /** Constructs and initialize the angle-axis rotation from an \a angle in radian
+    * and an \a axis which \b must \b be \b normalized.
+    *
+    * \warning If the \a axis vector is not normalized, then the angle-axis object
+    *          represents an invalid rotation. */
+  template<typename Derived>
+  inline AngleAxis(const Scalar& angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {}
+  /** Constructs and initialize the angle-axis rotation from a quaternion \a q. */
+  template<typename QuatDerived> inline explicit AngleAxis(const QuaternionBase<QuatDerived>& q) { *this = q; }
+  /** Constructs and initialize the angle-axis rotation from a 3x3 rotation matrix. */
+  template<typename Derived>
+  inline explicit AngleAxis(const MatrixBase<Derived>& m) { *this = m; }
+
+  Scalar angle() const { return m_angle; }
+  Scalar& angle() { return m_angle; }
+
+  const Vector3& axis() const { return m_axis; }
+  Vector3& axis() { return m_axis; }
+
+  /** Concatenates two rotations */
+  inline QuaternionType operator* (const AngleAxis& other) const
+  { return QuaternionType(*this) * QuaternionType(other); }
+
+  /** Concatenates two rotations */
+  inline QuaternionType operator* (const QuaternionType& other) const
+  { return QuaternionType(*this) * other; }
+
+  /** Concatenates two rotations */
+  friend inline QuaternionType operator* (const QuaternionType& a, const AngleAxis& b)
+  { return a * QuaternionType(b); }
+
+  /** \returns the inverse rotation, i.e., an angle-axis with opposite rotation angle */
+  AngleAxis inverse() const
+  { return AngleAxis(-m_angle, m_axis); }
+
+  template<class QuatDerived>
+  AngleAxis& operator=(const QuaternionBase<QuatDerived>& q);
+  template<typename Derived>
+  AngleAxis& operator=(const MatrixBase<Derived>& m);
+
+  template<typename Derived>
+  AngleAxis& fromRotationMatrix(const MatrixBase<Derived>& m);
+  Matrix3 toRotationMatrix(void) const;
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type cast() const
+  { return typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type(*this); }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit AngleAxis(const AngleAxis<OtherScalarType>& other)
+  {
+    m_axis = other.axis().template cast<Scalar>();
+    m_angle = Scalar(other.angle());
+  }
+
+  static inline const AngleAxis Identity() { return AngleAxis(0, Vector3::UnitX()); }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const AngleAxis& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
+  { return m_axis.isApprox(other.m_axis, prec) && internal::isApprox(m_angle,other.m_angle, prec); }
+};
+
+/** \ingroup Geometry_Module
+  * single precision angle-axis type */
+typedef AngleAxis<float> AngleAxisf;
+/** \ingroup Geometry_Module
+  * double precision angle-axis type */
+typedef AngleAxis<double> AngleAxisd;
+
+/** Set \c *this from a \b unit quaternion.
+  * The axis is normalized.
+  * 
+  * \warning As any other method dealing with quaternion, if the input quaternion
+  *          is not normalized then the result is undefined.
+  */
+template<typename Scalar>
+template<typename QuatDerived>
+AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
+{
+  using std::acos;
+  using std::min;
+  using std::max;
+  using std::sqrt;
+  Scalar n2 = q.vec().squaredNorm();
+  if (n2 < NumTraits<Scalar>::dummy_precision()*NumTraits<Scalar>::dummy_precision())
+  {
+    m_angle = 0;
+    m_axis << 1, 0, 0;
+  }
+  else
+  {
+    m_angle = Scalar(2)*acos((min)((max)(Scalar(-1),q.w()),Scalar(1)));
+    m_axis = q.vec() / sqrt(n2);
+  }
+  return *this;
+}
+
+/** Set \c *this from a 3x3 rotation matrix \a mat.
+  */
+template<typename Scalar>
+template<typename Derived>
+AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const MatrixBase<Derived>& mat)
+{
+  // Since a direct conversion would not be really faster,
+  // let's use the robust Quaternion implementation:
+  return *this = QuaternionType(mat);
+}
+
+/**
+* \brief Sets \c *this from a 3x3 rotation matrix.
+**/
+template<typename Scalar>
+template<typename Derived>
+AngleAxis<Scalar>& AngleAxis<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
+{
+  return *this = QuaternionType(mat);
+}
+
+/** Constructs and \returns an equivalent 3x3 rotation matrix.
+  */
+template<typename Scalar>
+typename AngleAxis<Scalar>::Matrix3
+AngleAxis<Scalar>::toRotationMatrix(void) const
+{
+  using std::sin;
+  using std::cos;
+  Matrix3 res;
+  Vector3 sin_axis  = sin(m_angle) * m_axis;
+  Scalar c = cos(m_angle);
+  Vector3 cos1_axis = (Scalar(1)-c) * m_axis;
+
+  Scalar tmp;
+  tmp = cos1_axis.x() * m_axis.y();
+  res.coeffRef(0,1) = tmp - sin_axis.z();
+  res.coeffRef(1,0) = tmp + sin_axis.z();
+
+  tmp = cos1_axis.x() * m_axis.z();
+  res.coeffRef(0,2) = tmp + sin_axis.y();
+  res.coeffRef(2,0) = tmp - sin_axis.y();
+
+  tmp = cos1_axis.y() * m_axis.z();
+  res.coeffRef(1,2) = tmp - sin_axis.x();
+  res.coeffRef(2,1) = tmp + sin_axis.x();
+
+  res.diagonal() = (cos1_axis.cwiseProduct(m_axis)).array() + c;
+
+  return res;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_ANGLEAXIS_H
diff --git a/include/eigen3/Eigen/src/Geometry/EulerAngles.h b/include/eigen3/Eigen/src/Geometry/EulerAngles.h
new file mode 100644
index 0000000..82802fb
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/EulerAngles.h
@@ -0,0 +1,104 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLES_H
+#define EIGEN_EULERANGLES_H
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  *
+  * \returns the Euler-angles of the rotation matrix \c *this using the convention defined by the triplet (\a a0,\a a1,\a a2)
+  *
+  * Each of the three parameters \a a0,\a a1,\a a2 represents the respective rotation axis as an integer in {0,1,2}.
+  * For instance, in:
+  * \code Vector3f ea = mat.eulerAngles(2, 0, 2); \endcode
+  * "2" represents the z axis and "0" the x axis, etc. The returned angles are such that
+  * we have the following equality:
+  * \code
+  * mat == AngleAxisf(ea[0], Vector3f::UnitZ())
+  *      * AngleAxisf(ea[1], Vector3f::UnitX())
+  *      * AngleAxisf(ea[2], Vector3f::UnitZ()); \endcode
+  * This corresponds to the right-multiply conventions (with right hand side frames).
+  * 
+  * The returned angles are in the ranges [0:pi]x[-pi:pi]x[-pi:pi].
+  * 
+  * \sa class AngleAxis
+  */
+template<typename Derived>
+inline Matrix<typename MatrixBase<Derived>::Scalar,3,1>
+MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
+{
+  using std::atan2;
+  using std::sin;
+  using std::cos;
+  /* Implemented from Graphics Gems IV */
+  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived,3,3)
+
+  Matrix<Scalar,3,1> res;
+  typedef Matrix<typename Derived::Scalar,2,1> Vector2;
+
+  const Index odd = ((a0+1)%3 == a1) ? 0 : 1;
+  const Index i = a0;
+  const Index j = (a0 + 1 + odd)%3;
+  const Index k = (a0 + 2 - odd)%3;
+  
+  if (a0==a2)
+  {
+    res[0] = atan2(coeff(j,i), coeff(k,i));
+    if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0)))
+    {
+      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(M_PI) : res[0] + Scalar(M_PI);
+      Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
+      res[1] = -atan2(s2, coeff(i,i));
+    }
+    else
+    {
+      Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
+      res[1] = atan2(s2, coeff(i,i));
+    }
+    
+    // With a=(0,1,0), we have i=0; j=1; k=2, and after computing the first two angles,
+    // we can compute their respective rotation, and apply its inverse to M. Since the result must
+    // be a rotation around x, we have:
+    //
+    //  c2  s1.s2 c1.s2                   1  0   0 
+    //  0   c1    -s1       *    M    =   0  c3  s3
+    //  -s2 s1.c2 c1.c2                   0 -s3  c3
+    //
+    //  Thus:  m11.c1 - m21.s1 = c3  &   m12.c1 - m22.s1 = s3
+    
+    Scalar s1 = sin(res[0]);
+    Scalar c1 = cos(res[0]);
+    res[2] = atan2(c1*coeff(j,k)-s1*coeff(k,k), c1*coeff(j,j) - s1 * coeff(k,j));
+  } 
+  else
+  {
+    res[0] = atan2(coeff(j,k), coeff(k,k));
+    Scalar c2 = Vector2(coeff(i,i), coeff(i,j)).norm();
+    if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0))) {
+      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(M_PI) : res[0] + Scalar(M_PI);
+      res[1] = atan2(-coeff(i,k), -c2);
+    }
+    else
+      res[1] = atan2(-coeff(i,k), c2);
+    Scalar s1 = sin(res[0]);
+    Scalar c1 = cos(res[0]);
+    res[2] = atan2(s1*coeff(k,i)-c1*coeff(j,i), c1*coeff(j,j) - s1 * coeff(k,j));
+  }
+  if (!odd)
+    res = -res;
+  
+  return res;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_EULERANGLES_H
diff --git a/include/eigen3/Eigen/src/Geometry/Homogeneous.h b/include/eigen3/Eigen/src/Geometry/Homogeneous.h
new file mode 100644
index 0000000..00e71d1
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/Homogeneous.h
@@ -0,0 +1,307 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_HOMOGENEOUS_H
+#define EIGEN_HOMOGENEOUS_H
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class Homogeneous
+  *
+  * \brief Expression of one (or a set of) homogeneous vector(s)
+  *
+  * \param MatrixType the type of the object in which we are making homogeneous
+  *
+  * This class represents an expression of one (or a set of) homogeneous vector(s).
+  * It is the return type of MatrixBase::homogeneous() and most of the time
+  * this is the only way it is used.
+  *
+  * \sa MatrixBase::homogeneous()
+  */
+
+namespace internal {
+
+template<typename MatrixType,int Direction>
+struct traits<Homogeneous<MatrixType,Direction> >
+ : traits<MatrixType>
+{
+  typedef typename traits<MatrixType>::StorageKind StorageKind;
+  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
+  enum {
+    RowsPlusOne = (MatrixType::RowsAtCompileTime != Dynamic) ?
+                  int(MatrixType::RowsAtCompileTime) + 1 : Dynamic,
+    ColsPlusOne = (MatrixType::ColsAtCompileTime != Dynamic) ?
+                  int(MatrixType::ColsAtCompileTime) + 1 : Dynamic,
+    RowsAtCompileTime = Direction==Vertical  ?  RowsPlusOne : MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = Direction==Horizontal ? ColsPlusOne : MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = RowsAtCompileTime,
+    MaxColsAtCompileTime = ColsAtCompileTime,
+    TmpFlags = _MatrixTypeNested::Flags & HereditaryBits,
+    Flags = ColsAtCompileTime==1 ? (TmpFlags & ~RowMajorBit)
+          : RowsAtCompileTime==1 ? (TmpFlags | RowMajorBit)
+          : TmpFlags,
+    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+  };
+};
+
+template<typename MatrixType,typename Lhs> struct homogeneous_left_product_impl;
+template<typename MatrixType,typename Rhs> struct homogeneous_right_product_impl;
+
+} // end namespace internal
+
+template<typename MatrixType,int _Direction> class Homogeneous
+  : internal::no_assignment_operator, public MatrixBase<Homogeneous<MatrixType,_Direction> >
+{
+  public:
+
+    enum { Direction = _Direction };
+
+    typedef MatrixBase<Homogeneous> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Homogeneous)
+
+    inline Homogeneous(const MatrixType& matrix)
+      : m_matrix(matrix)
+    {}
+
+    inline Index rows() const { return m_matrix.rows() + (int(Direction)==Vertical   ? 1 : 0); }
+    inline Index cols() const { return m_matrix.cols() + (int(Direction)==Horizontal ? 1 : 0); }
+
+    inline Scalar coeff(Index row, Index col) const
+    {
+      if(  (int(Direction)==Vertical   && row==m_matrix.rows())
+        || (int(Direction)==Horizontal && col==m_matrix.cols()))
+        return 1;
+      return m_matrix.coeff(row, col);
+    }
+
+    template<typename Rhs>
+    inline const internal::homogeneous_right_product_impl<Homogeneous,Rhs>
+    operator* (const MatrixBase<Rhs>& rhs) const
+    {
+      eigen_assert(int(Direction)==Horizontal);
+      return internal::homogeneous_right_product_impl<Homogeneous,Rhs>(m_matrix,rhs.derived());
+    }
+
+    template<typename Lhs> friend
+    inline const internal::homogeneous_left_product_impl<Homogeneous,Lhs>
+    operator* (const MatrixBase<Lhs>& lhs, const Homogeneous& rhs)
+    {
+      eigen_assert(int(Direction)==Vertical);
+      return internal::homogeneous_left_product_impl<Homogeneous,Lhs>(lhs.derived(),rhs.m_matrix);
+    }
+
+    template<typename Scalar, int Dim, int Mode, int Options> friend
+    inline const internal::homogeneous_left_product_impl<Homogeneous,Transform<Scalar,Dim,Mode,Options> >
+    operator* (const Transform<Scalar,Dim,Mode,Options>& lhs, const Homogeneous& rhs)
+    {
+      eigen_assert(int(Direction)==Vertical);
+      return internal::homogeneous_left_product_impl<Homogeneous,Transform<Scalar,Dim,Mode,Options> >(lhs,rhs.m_matrix);
+    }
+
+  protected:
+    typename MatrixType::Nested m_matrix;
+};
+
+/** \geometry_module
+  *
+  * \return an expression of the equivalent homogeneous vector
+  *
+  * \only_for_vectors
+  *
+  * Example: \include MatrixBase_homogeneous.cpp
+  * Output: \verbinclude MatrixBase_homogeneous.out
+  *
+  * \sa class Homogeneous
+  */
+template<typename Derived>
+inline typename MatrixBase<Derived>::HomogeneousReturnType
+MatrixBase<Derived>::homogeneous() const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return derived();
+}
+
+/** \geometry_module
+  *
+  * \returns a matrix expression of homogeneous column (or row) vectors
+  *
+  * Example: \include VectorwiseOp_homogeneous.cpp
+  * Output: \verbinclude VectorwiseOp_homogeneous.out
+  *
+  * \sa MatrixBase::homogeneous() */
+template<typename ExpressionType, int Direction>
+inline Homogeneous<ExpressionType,Direction>
+VectorwiseOp<ExpressionType,Direction>::homogeneous() const
+{
+  return _expression();
+}
+
+/** \geometry_module
+  *
+  * \returns an expression of the homogeneous normalized vector of \c *this
+  *
+  * Example: \include MatrixBase_hnormalized.cpp
+  * Output: \verbinclude MatrixBase_hnormalized.out
+  *
+  * \sa VectorwiseOp::hnormalized() */
+template<typename Derived>
+inline const typename MatrixBase<Derived>::HNormalizedReturnType
+MatrixBase<Derived>::hnormalized() const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return ConstStartMinusOne(derived(),0,0,
+    ColsAtCompileTime==1?size()-1:1,
+    ColsAtCompileTime==1?1:size()-1) / coeff(size()-1);
+}
+
+/** \geometry_module
+  *
+  * \returns an expression of the homogeneous normalized vector of \c *this
+  *
+  * Example: \include DirectionWise_hnormalized.cpp
+  * Output: \verbinclude DirectionWise_hnormalized.out
+  *
+  * \sa MatrixBase::hnormalized() */
+template<typename ExpressionType, int Direction>
+inline const typename VectorwiseOp<ExpressionType,Direction>::HNormalizedReturnType
+VectorwiseOp<ExpressionType,Direction>::hnormalized() const
+{
+  return HNormalized_Block(_expression(),0,0,
+      Direction==Vertical   ? _expression().rows()-1 : _expression().rows(),
+      Direction==Horizontal ? _expression().cols()-1 : _expression().cols()).cwiseQuotient(
+      Replicate<HNormalized_Factors,
+                Direction==Vertical   ? HNormalized_SizeMinusOne : 1,
+                Direction==Horizontal ? HNormalized_SizeMinusOne : 1>
+        (HNormalized_Factors(_expression(),
+          Direction==Vertical    ? _expression().rows()-1:0,
+          Direction==Horizontal  ? _expression().cols()-1:0,
+          Direction==Vertical    ? 1 : _expression().rows(),
+          Direction==Horizontal  ? 1 : _expression().cols()),
+         Direction==Vertical   ? _expression().rows()-1 : 1,
+         Direction==Horizontal ? _expression().cols()-1 : 1));
+}
+
+namespace internal {
+
+template<typename MatrixOrTransformType>
+struct take_matrix_for_product
+{
+  typedef MatrixOrTransformType type;
+  static const type& run(const type &x) { return x; }
+};
+
+template<typename Scalar, int Dim, int Mode,int Options>
+struct take_matrix_for_product<Transform<Scalar, Dim, Mode, Options> >
+{
+  typedef Transform<Scalar, Dim, Mode, Options> TransformType;
+  typedef typename internal::add_const<typename TransformType::ConstAffinePart>::type type;
+  static type run (const TransformType& x) { return x.affine(); }
+};
+
+template<typename Scalar, int Dim, int Options>
+struct take_matrix_for_product<Transform<Scalar, Dim, Projective, Options> >
+{
+  typedef Transform<Scalar, Dim, Projective, Options> TransformType;
+  typedef typename TransformType::MatrixType type;
+  static const type& run (const TransformType& x) { return x.matrix(); }
+};
+
+template<typename MatrixType,typename Lhs>
+struct traits<homogeneous_left_product_impl<Homogeneous<MatrixType,Vertical>,Lhs> >
+{
+  typedef typename take_matrix_for_product<Lhs>::type LhsMatrixType;
+  typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
+  typedef typename remove_all<LhsMatrixType>::type LhsMatrixTypeCleaned;
+  typedef typename make_proper_matrix_type<
+                 typename traits<MatrixTypeCleaned>::Scalar,
+                 LhsMatrixTypeCleaned::RowsAtCompileTime,
+                 MatrixTypeCleaned::ColsAtCompileTime,
+                 MatrixTypeCleaned::PlainObject::Options,
+                 LhsMatrixTypeCleaned::MaxRowsAtCompileTime,
+                 MatrixTypeCleaned::MaxColsAtCompileTime>::type ReturnType;
+};
+
+template<typename MatrixType,typename Lhs>
+struct homogeneous_left_product_impl<Homogeneous<MatrixType,Vertical>,Lhs>
+  : public ReturnByValue<homogeneous_left_product_impl<Homogeneous<MatrixType,Vertical>,Lhs> >
+{
+  typedef typename traits<homogeneous_left_product_impl>::LhsMatrixType LhsMatrixType;
+  typedef typename remove_all<LhsMatrixType>::type LhsMatrixTypeCleaned;
+  typedef typename remove_all<typename LhsMatrixTypeCleaned::Nested>::type LhsMatrixTypeNested;
+  typedef typename MatrixType::Index Index;
+  homogeneous_left_product_impl(const Lhs& lhs, const MatrixType& rhs)
+    : m_lhs(take_matrix_for_product<Lhs>::run(lhs)),
+      m_rhs(rhs)
+  {}
+
+  inline Index rows() const { return m_lhs.rows(); }
+  inline Index cols() const { return m_rhs.cols(); }
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    // FIXME investigate how to allow lazy evaluation of this product when possible
+    dst = Block<const LhsMatrixTypeNested,
+              LhsMatrixTypeNested::RowsAtCompileTime,
+              LhsMatrixTypeNested::ColsAtCompileTime==Dynamic?Dynamic:LhsMatrixTypeNested::ColsAtCompileTime-1>
+            (m_lhs,0,0,m_lhs.rows(),m_lhs.cols()-1) * m_rhs;
+    dst += m_lhs.col(m_lhs.cols()-1).rowwise()
+            .template replicate<MatrixType::ColsAtCompileTime>(m_rhs.cols());
+  }
+
+  typename LhsMatrixTypeCleaned::Nested m_lhs;
+  typename MatrixType::Nested m_rhs;
+};
+
+template<typename MatrixType,typename Rhs>
+struct traits<homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs> >
+{
+  typedef typename make_proper_matrix_type<typename traits<MatrixType>::Scalar,
+                 MatrixType::RowsAtCompileTime,
+                 Rhs::ColsAtCompileTime,
+                 MatrixType::PlainObject::Options,
+                 MatrixType::MaxRowsAtCompileTime,
+                 Rhs::MaxColsAtCompileTime>::type ReturnType;
+};
+
+template<typename MatrixType,typename Rhs>
+struct homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs>
+  : public ReturnByValue<homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs> >
+{
+  typedef typename remove_all<typename Rhs::Nested>::type RhsNested;
+  typedef typename MatrixType::Index Index;
+  homogeneous_right_product_impl(const MatrixType& lhs, const Rhs& rhs)
+    : m_lhs(lhs), m_rhs(rhs)
+  {}
+
+  inline Index rows() const { return m_lhs.rows(); }
+  inline Index cols() const { return m_rhs.cols(); }
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    // FIXME investigate how to allow lazy evaluation of this product when possible
+    dst = m_lhs * Block<const RhsNested,
+                        RhsNested::RowsAtCompileTime==Dynamic?Dynamic:RhsNested::RowsAtCompileTime-1,
+                        RhsNested::ColsAtCompileTime>
+            (m_rhs,0,0,m_rhs.rows()-1,m_rhs.cols());
+    dst += m_rhs.row(m_rhs.rows()-1).colwise()
+            .template replicate<MatrixType::RowsAtCompileTime>(m_lhs.rows());
+  }
+
+  typename MatrixType::Nested m_lhs;
+  typename Rhs::Nested m_rhs;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_HOMOGENEOUS_H
diff --git a/include/eigen3/Eigen/src/Geometry/Hyperplane.h b/include/eigen3/Eigen/src/Geometry/Hyperplane.h
new file mode 100644
index 0000000..00b7c43
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/Hyperplane.h
@@ -0,0 +1,280 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_HYPERPLANE_H
+#define EIGEN_HYPERPLANE_H
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class Hyperplane
+  *
+  * \brief A hyperplane
+  *
+  * A hyperplane is an affine subspace of dimension n-1 in a space of dimension n.
+  * For example, a hyperplane in a plane is a line; a hyperplane in 3-space is a plane.
+  *
+  * \param _Scalar the scalar type, i.e., the type of the coefficients
+  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
+  *             Notice that the dimension of the hyperplane is _AmbientDim-1.
+  *
+  * This class represents an hyperplane as the zero set of the implicit equation
+  * \f$ n \cdot x + d = 0 \f$ where \f$ n \f$ is a unit normal vector of the plane (linear part)
+  * and \f$ d \f$ is the distance (offset) to the origin.
+  */
+template <typename _Scalar, int _AmbientDim, int _Options>
+class Hyperplane
+{
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==Dynamic ? Dynamic : _AmbientDim+1)
+  enum {
+    AmbientDimAtCompileTime = _AmbientDim,
+    Options = _Options
+  };
+  typedef _Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef DenseIndex Index;
+  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
+  typedef Matrix<Scalar,Index(AmbientDimAtCompileTime)==Dynamic
+                        ? Dynamic
+                        : Index(AmbientDimAtCompileTime)+1,1,Options> Coefficients;
+  typedef Block<Coefficients,AmbientDimAtCompileTime,1> NormalReturnType;
+  typedef const Block<const Coefficients,AmbientDimAtCompileTime,1> ConstNormalReturnType;
+
+  /** Default constructor without initialization */
+  inline Hyperplane() {}
+  
+  template<int OtherOptions>
+  Hyperplane(const Hyperplane<Scalar,AmbientDimAtCompileTime,OtherOptions>& other)
+   : m_coeffs(other.coeffs())
+  {}
+
+  /** Constructs a dynamic-size hyperplane with \a _dim the dimension
+    * of the ambient space */
+  inline explicit Hyperplane(Index _dim) : m_coeffs(_dim+1) {}
+
+  /** Construct a plane from its normal \a n and a point \a e onto the plane.
+    * \warning the vector normal is assumed to be normalized.
+    */
+  inline Hyperplane(const VectorType& n, const VectorType& e)
+    : m_coeffs(n.size()+1)
+  {
+    normal() = n;
+    offset() = -n.dot(e);
+  }
+
+  /** Constructs a plane from its normal \a n and distance to the origin \a d
+    * such that the algebraic equation of the plane is \f$ n \cdot x + d = 0 \f$.
+    * \warning the vector normal is assumed to be normalized.
+    */
+  inline Hyperplane(const VectorType& n, const Scalar& d)
+    : m_coeffs(n.size()+1)
+  {
+    normal() = n;
+    offset() = d;
+  }
+
+  /** Constructs a hyperplane passing through the two points. If the dimension of the ambient space
+    * is greater than 2, then there isn't uniqueness, so an arbitrary choice is made.
+    */
+  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1)
+  {
+    Hyperplane result(p0.size());
+    result.normal() = (p1 - p0).unitOrthogonal();
+    result.offset() = -p0.dot(result.normal());
+    return result;
+  }
+
+  /** Constructs a hyperplane passing through the three points. The dimension of the ambient space
+    * is required to be exactly 3.
+    */
+  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1, const VectorType& p2)
+  {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 3)
+    Hyperplane result(p0.size());
+    VectorType v0(p2 - p0), v1(p1 - p0);
+    result.normal() = v0.cross(v1);
+    RealScalar norm = result.normal().norm();
+    if(norm <= v0.norm() * v1.norm() * NumTraits<RealScalar>::epsilon())
+    {
+      Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose();
+      JacobiSVD<Matrix<Scalar,2,3> > svd(m, ComputeFullV);
+      result.normal() = svd.matrixV().col(2);
+    }
+    else
+      result.normal() /= norm;
+    result.offset() = -p0.dot(result.normal());
+    return result;
+  }
+
+  /** Constructs a hyperplane passing through the parametrized line \a parametrized.
+    * If the dimension of the ambient space is greater than 2, then there isn't uniqueness,
+    * so an arbitrary choice is made.
+    */
+  // FIXME to be consitent with the rest this could be implemented as a static Through function ??
+  explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized)
+  {
+    normal() = parametrized.direction().unitOrthogonal();
+    offset() = -parametrized.origin().dot(normal());
+  }
+
+  ~Hyperplane() {}
+
+  /** \returns the dimension in which the plane holds */
+  inline Index dim() const { return AmbientDimAtCompileTime==Dynamic ? m_coeffs.size()-1 : Index(AmbientDimAtCompileTime); }
+
+  /** normalizes \c *this */
+  void normalize(void)
+  {
+    m_coeffs /= normal().norm();
+  }
+
+  /** \returns the signed distance between the plane \c *this and a point \a p.
+    * \sa absDistance()
+    */
+  inline Scalar signedDistance(const VectorType& p) const { return normal().dot(p) + offset(); }
+
+  /** \returns the absolute distance between the plane \c *this and a point \a p.
+    * \sa signedDistance()
+    */
+  inline Scalar absDistance(const VectorType& p) const { using std::abs; return abs(signedDistance(p)); }
+
+  /** \returns the projection of a point \a p onto the plane \c *this.
+    */
+  inline VectorType projection(const VectorType& p) const { return p - signedDistance(p) * normal(); }
+
+  /** \returns a constant reference to the unit normal vector of the plane, which corresponds
+    * to the linear part of the implicit equation.
+    */
+  inline ConstNormalReturnType normal() const { return ConstNormalReturnType(m_coeffs,0,0,dim(),1); }
+
+  /** \returns a non-constant reference to the unit normal vector of the plane, which corresponds
+    * to the linear part of the implicit equation.
+    */
+  inline NormalReturnType normal() { return NormalReturnType(m_coeffs,0,0,dim(),1); }
+
+  /** \returns the distance to the origin, which is also the "constant term" of the implicit equation
+    * \warning the vector normal is assumed to be normalized.
+    */
+  inline const Scalar& offset() const { return m_coeffs.coeff(dim()); }
+
+  /** \returns a non-constant reference to the distance to the origin, which is also the constant part
+    * of the implicit equation */
+  inline Scalar& offset() { return m_coeffs(dim()); }
+
+  /** \returns a constant reference to the coefficients c_i of the plane equation:
+    * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
+    */
+  inline const Coefficients& coeffs() const { return m_coeffs; }
+
+  /** \returns a non-constant reference to the coefficients c_i of the plane equation:
+    * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
+    */
+  inline Coefficients& coeffs() { return m_coeffs; }
+
+  /** \returns the intersection of *this with \a other.
+    *
+    * \warning The ambient space must be a plane, i.e. have dimension 2, so that \c *this and \a other are lines.
+    *
+    * \note If \a other is approximately parallel to *this, this method will return any point on *this.
+    */
+  VectorType intersection(const Hyperplane& other) const
+  {
+    using std::abs;
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
+    Scalar det = coeffs().coeff(0) * other.coeffs().coeff(1) - coeffs().coeff(1) * other.coeffs().coeff(0);
+    // since the line equations ax+by=c are normalized with a^2+b^2=1, the following tests
+    // whether the two lines are approximately parallel.
+    if(internal::isMuchSmallerThan(det, Scalar(1)))
+    {   // special case where the two lines are approximately parallel. Pick any point on the first line.
+        if(abs(coeffs().coeff(1))>abs(coeffs().coeff(0)))
+            return VectorType(coeffs().coeff(1), -coeffs().coeff(2)/coeffs().coeff(1)-coeffs().coeff(0));
+        else
+            return VectorType(-coeffs().coeff(2)/coeffs().coeff(0)-coeffs().coeff(1), coeffs().coeff(0));
+    }
+    else
+    {   // general case
+        Scalar invdet = Scalar(1) / det;
+        return VectorType(invdet*(coeffs().coeff(1)*other.coeffs().coeff(2)-other.coeffs().coeff(1)*coeffs().coeff(2)),
+                          invdet*(other.coeffs().coeff(0)*coeffs().coeff(2)-coeffs().coeff(0)*other.coeffs().coeff(2)));
+    }
+  }
+
+  /** Applies the transformation matrix \a mat to \c *this and returns a reference to \c *this.
+    *
+    * \param mat the Dim x Dim transformation matrix
+    * \param traits specifies whether the matrix \a mat represents an #Isometry
+    *               or a more generic #Affine transformation. The default is #Affine.
+    */
+  template<typename XprType>
+  inline Hyperplane& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine)
+  {
+    if (traits==Affine)
+      normal() = mat.inverse().transpose() * normal();
+    else if (traits==Isometry)
+      normal() = mat * normal();
+    else
+    {
+      eigen_assert(0 && "invalid traits value in Hyperplane::transform()");
+    }
+    return *this;
+  }
+
+  /** Applies the transformation \a t to \c *this and returns a reference to \c *this.
+    *
+    * \param t the transformation of dimension Dim
+    * \param traits specifies whether the transformation \a t represents an #Isometry
+    *               or a more generic #Affine transformation. The default is #Affine.
+    *               Other kind of transformations are not supported.
+    */
+  template<int TrOptions>
+  inline Hyperplane& transform(const Transform<Scalar,AmbientDimAtCompileTime,Affine,TrOptions>& t,
+                                TransformTraits traits = Affine)
+  {
+    transform(t.linear(), traits);
+    offset() -= normal().dot(t.translation());
+    return *this;
+  }
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<Hyperplane,
+           Hyperplane<NewScalarType,AmbientDimAtCompileTime,Options> >::type cast() const
+  {
+    return typename internal::cast_return_type<Hyperplane,
+                    Hyperplane<NewScalarType,AmbientDimAtCompileTime,Options> >::type(*this);
+  }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType,int OtherOptions>
+  inline explicit Hyperplane(const Hyperplane<OtherScalarType,AmbientDimAtCompileTime,OtherOptions>& other)
+  { m_coeffs = other.coeffs().template cast<Scalar>(); }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  template<int OtherOptions>
+  bool isApprox(const Hyperplane<Scalar,AmbientDimAtCompileTime,OtherOptions>& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
+  { return m_coeffs.isApprox(other.m_coeffs, prec); }
+
+protected:
+
+  Coefficients m_coeffs;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_HYPERPLANE_H
diff --git a/include/eigen3/Eigen/src/Geometry/OrthoMethods.h b/include/eigen3/Eigen/src/Geometry/OrthoMethods.h
new file mode 100644
index 0000000..556bc81
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/OrthoMethods.h
@@ -0,0 +1,218 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ORTHOMETHODS_H
+#define EIGEN_ORTHOMETHODS_H
+
+namespace Eigen { 
+
+/** \geometry_module
+  *
+  * \returns the cross product of \c *this and \a other
+  *
+  * Here is a very good explanation of cross-product: http://xkcd.com/199/
+  * \sa MatrixBase::cross3()
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline typename MatrixBase<Derived>::template cross_product_return_type<OtherDerived>::type
+MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,3)
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,3)
+
+  // Note that there is no need for an expression here since the compiler
+  // optimize such a small temporary very well (even within a complex expression)
+  typename internal::nested<Derived,2>::type lhs(derived());
+  typename internal::nested<OtherDerived,2>::type rhs(other.derived());
+  return typename cross_product_return_type<OtherDerived>::type(
+    numext::conj(lhs.coeff(1) * rhs.coeff(2) - lhs.coeff(2) * rhs.coeff(1)),
+    numext::conj(lhs.coeff(2) * rhs.coeff(0) - lhs.coeff(0) * rhs.coeff(2)),
+    numext::conj(lhs.coeff(0) * rhs.coeff(1) - lhs.coeff(1) * rhs.coeff(0))
+  );
+}
+
+namespace internal {
+
+template< int Arch,typename VectorLhs,typename VectorRhs,
+          typename Scalar = typename VectorLhs::Scalar,
+          bool Vectorizable = bool((VectorLhs::Flags&VectorRhs::Flags)&PacketAccessBit)>
+struct cross3_impl {
+  static inline typename internal::plain_matrix_type<VectorLhs>::type
+  run(const VectorLhs& lhs, const VectorRhs& rhs)
+  {
+    return typename internal::plain_matrix_type<VectorLhs>::type(
+      numext::conj(lhs.coeff(1) * rhs.coeff(2) - lhs.coeff(2) * rhs.coeff(1)),
+      numext::conj(lhs.coeff(2) * rhs.coeff(0) - lhs.coeff(0) * rhs.coeff(2)),
+      numext::conj(lhs.coeff(0) * rhs.coeff(1) - lhs.coeff(1) * rhs.coeff(0)),
+      0
+    );
+  }
+};
+
+}
+
+/** \geometry_module
+  *
+  * \returns the cross product of \c *this and \a other using only the x, y, and z coefficients
+  *
+  * The size of \c *this and \a other must be four. This function is especially useful
+  * when using 4D vectors instead of 3D ones to get advantage of SSE/AltiVec vectorization.
+  *
+  * \sa MatrixBase::cross()
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline typename MatrixBase<Derived>::PlainObject
+MatrixBase<Derived>::cross3(const MatrixBase<OtherDerived>& other) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,4)
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,4)
+
+  typedef typename internal::nested<Derived,2>::type DerivedNested;
+  typedef typename internal::nested<OtherDerived,2>::type OtherDerivedNested;
+  DerivedNested lhs(derived());
+  OtherDerivedNested rhs(other.derived());
+
+  return internal::cross3_impl<Architecture::Target,
+                        typename internal::remove_all<DerivedNested>::type,
+                        typename internal::remove_all<OtherDerivedNested>::type>::run(lhs,rhs);
+}
+
+/** \returns a matrix expression of the cross product of each column or row
+  * of the referenced expression with the \a other vector.
+  *
+  * The referenced matrix must have one dimension equal to 3.
+  * The result matrix has the same dimensions than the referenced one.
+  *
+  * \geometry_module
+  *
+  * \sa MatrixBase::cross() */
+template<typename ExpressionType, int Direction>
+template<typename OtherDerived>
+const typename VectorwiseOp<ExpressionType,Direction>::CrossReturnType
+VectorwiseOp<ExpressionType,Direction>::cross(const MatrixBase<OtherDerived>& other) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,3)
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+  CrossReturnType res(_expression().rows(),_expression().cols());
+  if(Direction==Vertical)
+  {
+    eigen_assert(CrossReturnType::RowsAtCompileTime==3 && "the matrix must have exactly 3 rows");
+    res.row(0) = (_expression().row(1) * other.coeff(2) - _expression().row(2) * other.coeff(1)).conjugate();
+    res.row(1) = (_expression().row(2) * other.coeff(0) - _expression().row(0) * other.coeff(2)).conjugate();
+    res.row(2) = (_expression().row(0) * other.coeff(1) - _expression().row(1) * other.coeff(0)).conjugate();
+  }
+  else
+  {
+    eigen_assert(CrossReturnType::ColsAtCompileTime==3 && "the matrix must have exactly 3 columns");
+    res.col(0) = (_expression().col(1) * other.coeff(2) - _expression().col(2) * other.coeff(1)).conjugate();
+    res.col(1) = (_expression().col(2) * other.coeff(0) - _expression().col(0) * other.coeff(2)).conjugate();
+    res.col(2) = (_expression().col(0) * other.coeff(1) - _expression().col(1) * other.coeff(0)).conjugate();
+  }
+  return res;
+}
+
+namespace internal {
+
+template<typename Derived, int Size = Derived::SizeAtCompileTime>
+struct unitOrthogonal_selector
+{
+  typedef typename plain_matrix_type<Derived>::type VectorType;
+  typedef typename traits<Derived>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef typename Derived::Index Index;
+  typedef Matrix<Scalar,2,1> Vector2;
+  static inline VectorType run(const Derived& src)
+  {
+    VectorType perp = VectorType::Zero(src.size());
+    Index maxi = 0;
+    Index sndi = 0;
+    src.cwiseAbs().maxCoeff(&maxi);
+    if (maxi==0)
+      sndi = 1;
+    RealScalar invnm = RealScalar(1)/(Vector2() << src.coeff(sndi),src.coeff(maxi)).finished().norm();
+    perp.coeffRef(maxi) = -numext::conj(src.coeff(sndi)) * invnm;
+    perp.coeffRef(sndi) =  numext::conj(src.coeff(maxi)) * invnm;
+
+    return perp;
+   }
+};
+
+template<typename Derived>
+struct unitOrthogonal_selector<Derived,3>
+{
+  typedef typename plain_matrix_type<Derived>::type VectorType;
+  typedef typename traits<Derived>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline VectorType run(const Derived& src)
+  {
+    VectorType perp;
+    /* Let us compute the crossed product of *this with a vector
+     * that is not too close to being colinear to *this.
+     */
+
+    /* unless the x and y coords are both close to zero, we can
+     * simply take ( -y, x, 0 ) and normalize it.
+     */
+    if((!isMuchSmallerThan(src.x(), src.z()))
+    || (!isMuchSmallerThan(src.y(), src.z())))
+    {
+      RealScalar invnm = RealScalar(1)/src.template head<2>().norm();
+      perp.coeffRef(0) = -numext::conj(src.y())*invnm;
+      perp.coeffRef(1) = numext::conj(src.x())*invnm;
+      perp.coeffRef(2) = 0;
+    }
+    /* if both x and y are close to zero, then the vector is close
+     * to the z-axis, so it's far from colinear to the x-axis for instance.
+     * So we take the crossed product with (1,0,0) and normalize it.
+     */
+    else
+    {
+      RealScalar invnm = RealScalar(1)/src.template tail<2>().norm();
+      perp.coeffRef(0) = 0;
+      perp.coeffRef(1) = -numext::conj(src.z())*invnm;
+      perp.coeffRef(2) = numext::conj(src.y())*invnm;
+    }
+
+    return perp;
+   }
+};
+
+template<typename Derived>
+struct unitOrthogonal_selector<Derived,2>
+{
+  typedef typename plain_matrix_type<Derived>::type VectorType;
+  static inline VectorType run(const Derived& src)
+  { return VectorType(-numext::conj(src.y()), numext::conj(src.x())).normalized(); }
+};
+
+} // end namespace internal
+
+/** \returns a unit vector which is orthogonal to \c *this
+  *
+  * The size of \c *this must be at least 2. If the size is exactly 2,
+  * then the returned vector is a counter clock wise rotation of \c *this, i.e., (-y,x).normalized().
+  *
+  * \sa cross()
+  */
+template<typename Derived>
+typename MatrixBase<Derived>::PlainObject
+MatrixBase<Derived>::unitOrthogonal() const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return internal::unitOrthogonal_selector<Derived>::run(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_ORTHOMETHODS_H
diff --git a/include/eigen3/Eigen/src/Geometry/ParametrizedLine.h b/include/eigen3/Eigen/src/Geometry/ParametrizedLine.h
new file mode 100644
index 0000000..77fa228
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/ParametrizedLine.h
@@ -0,0 +1,195 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARAMETRIZEDLINE_H
+#define EIGEN_PARAMETRIZEDLINE_H
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class ParametrizedLine
+  *
+  * \brief A parametrized line
+  *
+  * A parametrized line is defined by an origin point \f$ \mathbf{o} \f$ and a unit
+  * direction vector \f$ \mathbf{d} \f$ such that the line corresponds to
+  * the set \f$ l(t) = \mathbf{o} + t \mathbf{d} \f$, \f$ t \in \mathbf{R} \f$.
+  *
+  * \param _Scalar the scalar type, i.e., the type of the coefficients
+  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
+  */
+template <typename _Scalar, int _AmbientDim, int _Options>
+class ParametrizedLine
+{
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
+  enum {
+    AmbientDimAtCompileTime = _AmbientDim,
+    Options = _Options
+  };
+  typedef _Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef DenseIndex Index;
+  typedef Matrix<Scalar,AmbientDimAtCompileTime,1,Options> VectorType;
+
+  /** Default constructor without initialization */
+  inline ParametrizedLine() {}
+  
+  template<int OtherOptions>
+  ParametrizedLine(const ParametrizedLine<Scalar,AmbientDimAtCompileTime,OtherOptions>& other)
+   : m_origin(other.origin()), m_direction(other.direction())
+  {}
+
+  /** Constructs a dynamic-size line with \a _dim the dimension
+    * of the ambient space */
+  inline explicit ParametrizedLine(Index _dim) : m_origin(_dim), m_direction(_dim) {}
+
+  /** Initializes a parametrized line of direction \a direction and origin \a origin.
+    * \warning the vector direction is assumed to be normalized.
+    */
+  ParametrizedLine(const VectorType& origin, const VectorType& direction)
+    : m_origin(origin), m_direction(direction) {}
+
+  template <int OtherOptions>
+  explicit ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane);
+
+  /** Constructs a parametrized line going from \a p0 to \a p1. */
+  static inline ParametrizedLine Through(const VectorType& p0, const VectorType& p1)
+  { return ParametrizedLine(p0, (p1-p0).normalized()); }
+
+  ~ParametrizedLine() {}
+
+  /** \returns the dimension in which the line holds */
+  inline Index dim() const { return m_direction.size(); }
+
+  const VectorType& origin() const { return m_origin; }
+  VectorType& origin() { return m_origin; }
+
+  const VectorType& direction() const { return m_direction; }
+  VectorType& direction() { return m_direction; }
+
+  /** \returns the squared distance of a point \a p to its projection onto the line \c *this.
+    * \sa distance()
+    */
+  RealScalar squaredDistance(const VectorType& p) const
+  {
+    VectorType diff = p - origin();
+    return (diff - direction().dot(diff) * direction()).squaredNorm();
+  }
+  /** \returns the distance of a point \a p to its projection onto the line \c *this.
+    * \sa squaredDistance()
+    */
+  RealScalar distance(const VectorType& p) const { using std::sqrt; return sqrt(squaredDistance(p)); }
+
+  /** \returns the projection of a point \a p onto the line \c *this. */
+  VectorType projection(const VectorType& p) const
+  { return origin() + direction().dot(p-origin()) * direction(); }
+
+  VectorType pointAt(const Scalar& t) const;
+  
+  template <int OtherOptions>
+  Scalar intersectionParameter(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
+ 
+  template <int OtherOptions>
+  Scalar intersection(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
+  
+  template <int OtherOptions>
+  VectorType intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<ParametrizedLine,
+           ParametrizedLine<NewScalarType,AmbientDimAtCompileTime,Options> >::type cast() const
+  {
+    return typename internal::cast_return_type<ParametrizedLine,
+                    ParametrizedLine<NewScalarType,AmbientDimAtCompileTime,Options> >::type(*this);
+  }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType,int OtherOptions>
+  inline explicit ParametrizedLine(const ParametrizedLine<OtherScalarType,AmbientDimAtCompileTime,OtherOptions>& other)
+  {
+    m_origin = other.origin().template cast<Scalar>();
+    m_direction = other.direction().template cast<Scalar>();
+  }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const ParametrizedLine& other, typename NumTraits<Scalar>::Real prec = NumTraits<Scalar>::dummy_precision()) const
+  { return m_origin.isApprox(other.m_origin, prec) && m_direction.isApprox(other.m_direction, prec); }
+
+protected:
+
+  VectorType m_origin, m_direction;
+};
+
+/** Constructs a parametrized line from a 2D hyperplane
+  *
+  * \warning the ambient space must have dimension 2 such that the hyperplane actually describes a line
+  */
+template <typename _Scalar, int _AmbientDim, int _Options>
+template <int OtherOptions>
+inline ParametrizedLine<_Scalar, _AmbientDim,_Options>::ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim,OtherOptions>& hyperplane)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
+  direction() = hyperplane.normal().unitOrthogonal();
+  origin() = -hyperplane.normal()*hyperplane.offset();
+}
+
+/** \returns the point at \a t along this line
+  */
+template <typename _Scalar, int _AmbientDim, int _Options>
+inline typename ParametrizedLine<_Scalar, _AmbientDim,_Options>::VectorType
+ParametrizedLine<_Scalar, _AmbientDim,_Options>::pointAt(const _Scalar& t) const
+{
+  return origin() + (direction()*t); 
+}
+
+/** \returns the parameter value of the intersection between \c *this and the given \a hyperplane
+  */
+template <typename _Scalar, int _AmbientDim, int _Options>
+template <int OtherOptions>
+inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersectionParameter(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
+{
+  return -(hyperplane.offset()+hyperplane.normal().dot(origin()))
+          / hyperplane.normal().dot(direction());
+}
+
+
+/** \deprecated use intersectionParameter()
+  * \returns the parameter value of the intersection between \c *this and the given \a hyperplane
+  */
+template <typename _Scalar, int _AmbientDim, int _Options>
+template <int OtherOptions>
+inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersection(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
+{
+  return intersectionParameter(hyperplane);
+}
+
+/** \returns the point of the intersection between \c *this and the given hyperplane
+  */
+template <typename _Scalar, int _AmbientDim, int _Options>
+template <int OtherOptions>
+inline typename ParametrizedLine<_Scalar, _AmbientDim,_Options>::VectorType
+ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
+{
+  return pointAt(intersectionParameter(hyperplane));
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARAMETRIZEDLINE_H
diff --git a/include/eigen3/Eigen/src/Geometry/Quaternion.h b/include/eigen3/Eigen/src/Geometry/Quaternion.h
new file mode 100644
index 0000000..f06653f
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/Quaternion.h
@@ -0,0 +1,778 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2009 Mathieu Gautier <mathieu.gautier at cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_QUATERNION_H
+#define EIGEN_QUATERNION_H
+namespace Eigen { 
+
+
+/***************************************************************************
+* Definition of QuaternionBase<Derived>
+* The implementation is at the end of the file
+***************************************************************************/
+
+namespace internal {
+template<typename Other,
+         int OtherRows=Other::RowsAtCompileTime,
+         int OtherCols=Other::ColsAtCompileTime>
+struct quaternionbase_assign_impl;
+}
+
+/** \geometry_module \ingroup Geometry_Module
+  * \class QuaternionBase
+  * \brief Base class for quaternion expressions
+  * \tparam Derived derived type (CRTP)
+  * \sa class Quaternion
+  */
+template<class Derived>
+class QuaternionBase : public RotationBase<Derived, 3>
+{
+  typedef RotationBase<Derived, 3> Base;
+public:
+  using Base::operator*;
+  using Base::derived;
+
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef typename internal::traits<Derived>::Coefficients Coefficients;
+  enum {
+    Flags = Eigen::internal::traits<Derived>::Flags
+  };
+
+ // typedef typename Matrix<Scalar,4,1> Coefficients;
+  /** the type of a 3D vector */
+  typedef Matrix<Scalar,3,1> Vector3;
+  /** the equivalent rotation matrix type */
+  typedef Matrix<Scalar,3,3> Matrix3;
+  /** the equivalent angle-axis type */
+  typedef AngleAxis<Scalar> AngleAxisType;
+
+
+
+  /** \returns the \c x coefficient */
+  inline Scalar x() const { return this->derived().coeffs().coeff(0); }
+  /** \returns the \c y coefficient */
+  inline Scalar y() const { return this->derived().coeffs().coeff(1); }
+  /** \returns the \c z coefficient */
+  inline Scalar z() const { return this->derived().coeffs().coeff(2); }
+  /** \returns the \c w coefficient */
+  inline Scalar w() const { return this->derived().coeffs().coeff(3); }
+
+  /** \returns a reference to the \c x coefficient */
+  inline Scalar& x() { return this->derived().coeffs().coeffRef(0); }
+  /** \returns a reference to the \c y coefficient */
+  inline Scalar& y() { return this->derived().coeffs().coeffRef(1); }
+  /** \returns a reference to the \c z coefficient */
+  inline Scalar& z() { return this->derived().coeffs().coeffRef(2); }
+  /** \returns a reference to the \c w coefficient */
+  inline Scalar& w() { return this->derived().coeffs().coeffRef(3); }
+
+  /** \returns a read-only vector expression of the imaginary part (x,y,z) */
+  inline const VectorBlock<const Coefficients,3> vec() const { return coeffs().template head<3>(); }
+
+  /** \returns a vector expression of the imaginary part (x,y,z) */
+  inline VectorBlock<Coefficients,3> vec() { return coeffs().template head<3>(); }
+
+  /** \returns a read-only vector expression of the coefficients (x,y,z,w) */
+  inline const typename internal::traits<Derived>::Coefficients& coeffs() const { return derived().coeffs(); }
+
+  /** \returns a vector expression of the coefficients (x,y,z,w) */
+  inline typename internal::traits<Derived>::Coefficients& coeffs() { return derived().coeffs(); }
+
+  EIGEN_STRONG_INLINE QuaternionBase<Derived>& operator=(const QuaternionBase<Derived>& other);
+  template<class OtherDerived> EIGEN_STRONG_INLINE Derived& operator=(const QuaternionBase<OtherDerived>& other);
+
+// disabled this copy operator as it is giving very strange compilation errors when compiling
+// test_stdvector with GCC 4.4.2. This looks like a GCC bug though, so feel free to re-enable it if it's
+// useful; however notice that we already have the templated operator= above and e.g. in MatrixBase
+// we didn't have to add, in addition to templated operator=, such a non-templated copy operator.
+//  Derived& operator=(const QuaternionBase& other)
+//  { return operator=<Derived>(other); }
+
+  Derived& operator=(const AngleAxisType& aa);
+  template<class OtherDerived> Derived& operator=(const MatrixBase<OtherDerived>& m);
+
+  /** \returns a quaternion representing an identity rotation
+    * \sa MatrixBase::Identity()
+    */
+  static inline Quaternion<Scalar> Identity() { return Quaternion<Scalar>(1, 0, 0, 0); }
+
+  /** \sa QuaternionBase::Identity(), MatrixBase::setIdentity()
+    */
+  inline QuaternionBase& setIdentity() { coeffs() << 0, 0, 0, 1; return *this; }
+
+  /** \returns the squared norm of the quaternion's coefficients
+    * \sa QuaternionBase::norm(), MatrixBase::squaredNorm()
+    */
+  inline Scalar squaredNorm() const { return coeffs().squaredNorm(); }
+
+  /** \returns the norm of the quaternion's coefficients
+    * \sa QuaternionBase::squaredNorm(), MatrixBase::norm()
+    */
+  inline Scalar norm() const { return coeffs().norm(); }
+
+  /** Normalizes the quaternion \c *this
+    * \sa normalized(), MatrixBase::normalize() */
+  inline void normalize() { coeffs().normalize(); }
+  /** \returns a normalized copy of \c *this
+    * \sa normalize(), MatrixBase::normalized() */
+  inline Quaternion<Scalar> normalized() const { return Quaternion<Scalar>(coeffs().normalized()); }
+
+    /** \returns the dot product of \c *this and \a other
+    * Geometrically speaking, the dot product of two unit quaternions
+    * corresponds to the cosine of half the angle between the two rotations.
+    * \sa angularDistance()
+    */
+  template<class OtherDerived> inline Scalar dot(const QuaternionBase<OtherDerived>& other) const { return coeffs().dot(other.coeffs()); }
+
+  template<class OtherDerived> Scalar angularDistance(const QuaternionBase<OtherDerived>& other) const;
+
+  /** \returns an equivalent 3x3 rotation matrix */
+  Matrix3 toRotationMatrix() const;
+
+  /** \returns the quaternion which transform \a a into \a b through a rotation */
+  template<typename Derived1, typename Derived2>
+  Derived& setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
+
+  template<class OtherDerived> EIGEN_STRONG_INLINE Quaternion<Scalar> operator* (const QuaternionBase<OtherDerived>& q) const;
+  template<class OtherDerived> EIGEN_STRONG_INLINE Derived& operator*= (const QuaternionBase<OtherDerived>& q);
+
+  /** \returns the quaternion describing the inverse rotation */
+  Quaternion<Scalar> inverse() const;
+
+  /** \returns the conjugated quaternion */
+  Quaternion<Scalar> conjugate() const;
+
+  template<class OtherDerived> Quaternion<Scalar> slerp(const Scalar& t, const QuaternionBase<OtherDerived>& other) const;
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  template<class OtherDerived>
+  bool isApprox(const QuaternionBase<OtherDerived>& other, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
+  { return coeffs().isApprox(other.coeffs(), prec); }
+
+	/** return the result vector of \a v through the rotation*/
+  EIGEN_STRONG_INLINE Vector3 _transformVector(Vector3 v) const;
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type cast() const
+  {
+    return typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type(derived());
+  }
+
+#ifdef EIGEN_QUATERNIONBASE_PLUGIN
+# include EIGEN_QUATERNIONBASE_PLUGIN
+#endif
+};
+
+/***************************************************************************
+* Definition/implementation of Quaternion<Scalar>
+***************************************************************************/
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class Quaternion
+  *
+  * \brief The quaternion class used to represent 3D orientations and rotations
+  *
+  * \tparam _Scalar the scalar type, i.e., the type of the coefficients
+  * \tparam _Options controls the memory alignment of the coefficients. Can be \# AutoAlign or \# DontAlign. Default is AutoAlign.
+  *
+  * This class represents a quaternion \f$ w+xi+yj+zk \f$ that is a convenient representation of
+  * orientations and rotations of objects in three dimensions. Compared to other representations
+  * like Euler angles or 3x3 matrices, quaternions offer the following advantages:
+  * \li \b compact storage (4 scalars)
+  * \li \b efficient to compose (28 flops),
+  * \li \b stable spherical interpolation
+  *
+  * The following two typedefs are provided for convenience:
+  * \li \c Quaternionf for \c float
+  * \li \c Quaterniond for \c double
+  *
+  * \warning Operations interpreting the quaternion as rotation have undefined behavior if the quaternion is not normalized.
+  *
+  * \sa  class AngleAxis, class Transform
+  */
+
+namespace internal {
+template<typename _Scalar,int _Options>
+struct traits<Quaternion<_Scalar,_Options> >
+{
+  typedef Quaternion<_Scalar,_Options> PlainObject;
+  typedef _Scalar Scalar;
+  typedef Matrix<_Scalar,4,1,_Options> Coefficients;
+  enum{
+    IsAligned = internal::traits<Coefficients>::Flags & AlignedBit,
+    Flags = IsAligned ? (AlignedBit | LvalueBit) : LvalueBit
+  };
+};
+}
+
+template<typename _Scalar, int _Options>
+class Quaternion : public QuaternionBase<Quaternion<_Scalar,_Options> >
+{
+  typedef QuaternionBase<Quaternion<_Scalar,_Options> > Base;
+  enum { IsAligned = internal::traits<Quaternion>::IsAligned };
+
+public:
+  typedef _Scalar Scalar;
+
+  EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Quaternion)
+  using Base::operator*=;
+
+  typedef typename internal::traits<Quaternion>::Coefficients Coefficients;
+  typedef typename Base::AngleAxisType AngleAxisType;
+
+  /** Default constructor leaving the quaternion uninitialized. */
+  inline Quaternion() {}
+
+  /** Constructs and initializes the quaternion \f$ w+xi+yj+zk \f$ from
+    * its four coefficients \a w, \a x, \a y and \a z.
+    *
+    * \warning Note the order of the arguments: the real \a w coefficient first,
+    * while internally the coefficients are stored in the following order:
+    * [\c x, \c y, \c z, \c w]
+    */
+  inline Quaternion(const Scalar& w, const Scalar& x, const Scalar& y, const Scalar& z) : m_coeffs(x, y, z, w){}
+
+  /** Constructs and initialize a quaternion from the array data */
+  inline Quaternion(const Scalar* data) : m_coeffs(data) {}
+
+  /** Copy constructor */
+  template<class Derived> EIGEN_STRONG_INLINE Quaternion(const QuaternionBase<Derived>& other) { this->Base::operator=(other); }
+
+  /** Constructs and initializes a quaternion from the angle-axis \a aa */
+  explicit inline Quaternion(const AngleAxisType& aa) { *this = aa; }
+
+  /** Constructs and initializes a quaternion from either:
+    *  - a rotation matrix expression,
+    *  - a 4D vector expression representing quaternion coefficients.
+    */
+  template<typename Derived>
+  explicit inline Quaternion(const MatrixBase<Derived>& other) { *this = other; }
+
+  /** Explicit copy constructor with scalar conversion */
+  template<typename OtherScalar, int OtherOptions>
+  explicit inline Quaternion(const Quaternion<OtherScalar, OtherOptions>& other)
+  { m_coeffs = other.coeffs().template cast<Scalar>(); }
+
+  template<typename Derived1, typename Derived2>
+  static Quaternion FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
+
+  inline Coefficients& coeffs() { return m_coeffs;}
+  inline const Coefficients& coeffs() const { return m_coeffs;}
+
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(IsAligned)
+
+protected:
+  Coefficients m_coeffs;
+  
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    static EIGEN_STRONG_INLINE void _check_template_params()
+    {
+      EIGEN_STATIC_ASSERT( (_Options & DontAlign) == _Options,
+        INVALID_MATRIX_TEMPLATE_PARAMETERS)
+    }
+#endif
+};
+
+/** \ingroup Geometry_Module
+  * single precision quaternion type */
+typedef Quaternion<float> Quaternionf;
+/** \ingroup Geometry_Module
+  * double precision quaternion type */
+typedef Quaternion<double> Quaterniond;
+
+/***************************************************************************
+* Specialization of Map<Quaternion<Scalar>>
+***************************************************************************/
+
+namespace internal {
+  template<typename _Scalar, int _Options>
+  struct traits<Map<Quaternion<_Scalar>, _Options> > : traits<Quaternion<_Scalar, (int(_Options)&Aligned)==Aligned ? AutoAlign : DontAlign> >
+  {
+    typedef Map<Matrix<_Scalar,4,1>, _Options> Coefficients;
+  };
+}
+
+namespace internal {
+  template<typename _Scalar, int _Options>
+  struct traits<Map<const Quaternion<_Scalar>, _Options> > : traits<Quaternion<_Scalar, (int(_Options)&Aligned)==Aligned ? AutoAlign : DontAlign> >
+  {
+    typedef Map<const Matrix<_Scalar,4,1>, _Options> Coefficients;
+    typedef traits<Quaternion<_Scalar, (int(_Options)&Aligned)==Aligned ? AutoAlign : DontAlign> > TraitsBase;
+    enum {
+      Flags = TraitsBase::Flags & ~LvalueBit
+    };
+  };
+}
+
+/** \ingroup Geometry_Module
+  * \brief Quaternion expression mapping a constant memory buffer
+  *
+  * \tparam _Scalar the type of the Quaternion coefficients
+  * \tparam _Options see class Map
+  *
+  * This is a specialization of class Map for Quaternion. This class allows to view
+  * a 4 scalar memory buffer as an Eigen's Quaternion object.
+  *
+  * \sa class Map, class Quaternion, class QuaternionBase
+  */
+template<typename _Scalar, int _Options>
+class Map<const Quaternion<_Scalar>, _Options >
+  : public QuaternionBase<Map<const Quaternion<_Scalar>, _Options> >
+{
+    typedef QuaternionBase<Map<const Quaternion<_Scalar>, _Options> > Base;
+
+  public:
+    typedef _Scalar Scalar;
+    typedef typename internal::traits<Map>::Coefficients Coefficients;
+    EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Map)
+    using Base::operator*=;
+
+    /** Constructs a Mapped Quaternion object from the pointer \a coeffs
+      *
+      * The pointer \a coeffs must reference the four coefficients of Quaternion in the following order:
+      * \code *coeffs == {x, y, z, w} \endcode
+      *
+      * If the template parameter _Options is set to #Aligned, then the pointer coeffs must be aligned. */
+    EIGEN_STRONG_INLINE Map(const Scalar* coeffs) : m_coeffs(coeffs) {}
+
+    inline const Coefficients& coeffs() const { return m_coeffs;}
+
+  protected:
+    const Coefficients m_coeffs;
+};
+
+/** \ingroup Geometry_Module
+  * \brief Expression of a quaternion from a memory buffer
+  *
+  * \tparam _Scalar the type of the Quaternion coefficients
+  * \tparam _Options see class Map
+  *
+  * This is a specialization of class Map for Quaternion. This class allows to view
+  * a 4 scalar memory buffer as an Eigen's  Quaternion object.
+  *
+  * \sa class Map, class Quaternion, class QuaternionBase
+  */
+template<typename _Scalar, int _Options>
+class Map<Quaternion<_Scalar>, _Options >
+  : public QuaternionBase<Map<Quaternion<_Scalar>, _Options> >
+{
+    typedef QuaternionBase<Map<Quaternion<_Scalar>, _Options> > Base;
+
+  public:
+    typedef _Scalar Scalar;
+    typedef typename internal::traits<Map>::Coefficients Coefficients;
+    EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Map)
+    using Base::operator*=;
+
+    /** Constructs a Mapped Quaternion object from the pointer \a coeffs
+      *
+      * The pointer \a coeffs must reference the four coefficients of Quaternion in the following order:
+      * \code *coeffs == {x, y, z, w} \endcode
+      *
+      * If the template parameter _Options is set to #Aligned, then the pointer coeffs must be aligned. */
+    EIGEN_STRONG_INLINE Map(Scalar* coeffs) : m_coeffs(coeffs) {}
+
+    inline Coefficients& coeffs() { return m_coeffs; }
+    inline const Coefficients& coeffs() const { return m_coeffs; }
+
+  protected:
+    Coefficients m_coeffs;
+};
+
+/** \ingroup Geometry_Module
+  * Map an unaligned array of single precision scalars as a quaternion */
+typedef Map<Quaternion<float>, 0>         QuaternionMapf;
+/** \ingroup Geometry_Module
+  * Map an unaligned array of double precision scalars as a quaternion */
+typedef Map<Quaternion<double>, 0>        QuaternionMapd;
+/** \ingroup Geometry_Module
+  * Map a 16-byte aligned array of single precision scalars as a quaternion */
+typedef Map<Quaternion<float>, Aligned>   QuaternionMapAlignedf;
+/** \ingroup Geometry_Module
+  * Map a 16-byte aligned array of double precision scalars as a quaternion */
+typedef Map<Quaternion<double>, Aligned>  QuaternionMapAlignedd;
+
+/***************************************************************************
+* Implementation of QuaternionBase methods
+***************************************************************************/
+
+// Generic Quaternion * Quaternion product
+// This product can be specialized for a given architecture via the Arch template argument.
+namespace internal {
+template<int Arch, class Derived1, class Derived2, typename Scalar, int _Options> struct quat_product
+{
+  static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived1>& a, const QuaternionBase<Derived2>& b){
+    return Quaternion<Scalar>
+    (
+      a.w() * b.w() - a.x() * b.x() - a.y() * b.y() - a.z() * b.z(),
+      a.w() * b.x() + a.x() * b.w() + a.y() * b.z() - a.z() * b.y(),
+      a.w() * b.y() + a.y() * b.w() + a.z() * b.x() - a.x() * b.z(),
+      a.w() * b.z() + a.z() * b.w() + a.x() * b.y() - a.y() * b.x()
+    );
+  }
+};
+}
+
+/** \returns the concatenation of two rotations as a quaternion-quaternion product */
+template <class Derived>
+template <class OtherDerived>
+EIGEN_STRONG_INLINE Quaternion<typename internal::traits<Derived>::Scalar>
+QuaternionBase<Derived>::operator* (const QuaternionBase<OtherDerived>& other) const
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<typename Derived::Scalar, typename OtherDerived::Scalar>::value),
+   YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+  return internal::quat_product<Architecture::Target, Derived, OtherDerived,
+                         typename internal::traits<Derived>::Scalar,
+                         internal::traits<Derived>::IsAligned && internal::traits<OtherDerived>::IsAligned>::run(*this, other);
+}
+
+/** \sa operator*(Quaternion) */
+template <class Derived>
+template <class OtherDerived>
+EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator*= (const QuaternionBase<OtherDerived>& other)
+{
+  derived() = derived() * other.derived();
+  return derived();
+}
+
+/** Rotation of a vector by a quaternion.
+  * \remarks If the quaternion is used to rotate several points (>1)
+  * then it is much more efficient to first convert it to a 3x3 Matrix.
+  * Comparison of the operation cost for n transformations:
+  *   - Quaternion2:    30n
+  *   - Via a Matrix3: 24 + 15n
+  */
+template <class Derived>
+EIGEN_STRONG_INLINE typename QuaternionBase<Derived>::Vector3
+QuaternionBase<Derived>::_transformVector(Vector3 v) const
+{
+    // Note that this algorithm comes from the optimization by hand
+    // of the conversion to a Matrix followed by a Matrix/Vector product.
+    // It appears to be much faster than the common algorithm found
+    // in the literature (30 versus 39 flops). It also requires two
+    // Vector3 as temporaries.
+    Vector3 uv = this->vec().cross(v);
+    uv += uv;
+    return v + this->w() * uv + this->vec().cross(uv);
+}
+
+template<class Derived>
+EIGEN_STRONG_INLINE QuaternionBase<Derived>& QuaternionBase<Derived>::operator=(const QuaternionBase<Derived>& other)
+{
+  coeffs() = other.coeffs();
+  return derived();
+}
+
+template<class Derived>
+template<class OtherDerived>
+EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const QuaternionBase<OtherDerived>& other)
+{
+  coeffs() = other.coeffs();
+  return derived();
+}
+
+/** Set \c *this from an angle-axis \a aa and returns a reference to \c *this
+  */
+template<class Derived>
+EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const AngleAxisType& aa)
+{
+  using std::cos;
+  using std::sin;
+  Scalar ha = Scalar(0.5)*aa.angle(); // Scalar(0.5) to suppress precision loss warnings
+  this->w() = cos(ha);
+  this->vec() = sin(ha) * aa.axis();
+  return derived();
+}
+
+/** Set \c *this from the expression \a xpr:
+  *   - if \a xpr is a 4x1 vector, then \a xpr is assumed to be a quaternion
+  *   - if \a xpr is a 3x3 matrix, then \a xpr is assumed to be rotation matrix
+  *     and \a xpr is converted to a quaternion
+  */
+
+template<class Derived>
+template<class MatrixDerived>
+inline Derived& QuaternionBase<Derived>::operator=(const MatrixBase<MatrixDerived>& xpr)
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<typename Derived::Scalar, typename MatrixDerived::Scalar>::value),
+   YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+  internal::quaternionbase_assign_impl<MatrixDerived>::run(*this, xpr.derived());
+  return derived();
+}
+
+/** Convert the quaternion to a 3x3 rotation matrix. The quaternion is required to
+  * be normalized, otherwise the result is undefined.
+  */
+template<class Derived>
+inline typename QuaternionBase<Derived>::Matrix3
+QuaternionBase<Derived>::toRotationMatrix(void) const
+{
+  // NOTE if inlined, then gcc 4.2 and 4.4 get rid of the temporary (not gcc 4.3 !!)
+  // if not inlined then the cost of the return by value is huge ~ +35%,
+  // however, not inlining this function is an order of magnitude slower, so
+  // it has to be inlined, and so the return by value is not an issue
+  Matrix3 res;
+
+  const Scalar tx  = Scalar(2)*this->x();
+  const Scalar ty  = Scalar(2)*this->y();
+  const Scalar tz  = Scalar(2)*this->z();
+  const Scalar twx = tx*this->w();
+  const Scalar twy = ty*this->w();
+  const Scalar twz = tz*this->w();
+  const Scalar txx = tx*this->x();
+  const Scalar txy = ty*this->x();
+  const Scalar txz = tz*this->x();
+  const Scalar tyy = ty*this->y();
+  const Scalar tyz = tz*this->y();
+  const Scalar tzz = tz*this->z();
+
+  res.coeffRef(0,0) = Scalar(1)-(tyy+tzz);
+  res.coeffRef(0,1) = txy-twz;
+  res.coeffRef(0,2) = txz+twy;
+  res.coeffRef(1,0) = txy+twz;
+  res.coeffRef(1,1) = Scalar(1)-(txx+tzz);
+  res.coeffRef(1,2) = tyz-twx;
+  res.coeffRef(2,0) = txz-twy;
+  res.coeffRef(2,1) = tyz+twx;
+  res.coeffRef(2,2) = Scalar(1)-(txx+tyy);
+
+  return res;
+}
+
+/** Sets \c *this to be a quaternion representing a rotation between
+  * the two arbitrary vectors \a a and \a b. In other words, the built
+  * rotation represent a rotation sending the line of direction \a a
+  * to the line of direction \a b, both lines passing through the origin.
+  *
+  * \returns a reference to \c *this.
+  *
+  * Note that the two input vectors do \b not have to be normalized, and
+  * do not need to have the same norm.
+  */
+template<class Derived>
+template<typename Derived1, typename Derived2>
+inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
+{
+  using std::max;
+  using std::sqrt;
+  Vector3 v0 = a.normalized();
+  Vector3 v1 = b.normalized();
+  Scalar c = v1.dot(v0);
+
+  // if dot == -1, vectors are nearly opposites
+  // => accurately compute the rotation axis by computing the
+  //    intersection of the two planes. This is done by solving:
+  //       x^T v0 = 0
+  //       x^T v1 = 0
+  //    under the constraint:
+  //       ||x|| = 1
+  //    which yields a singular value problem
+  if (c < Scalar(-1)+NumTraits<Scalar>::dummy_precision())
+  {
+    c = (max)(c,Scalar(-1));
+    Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose();
+    JacobiSVD<Matrix<Scalar,2,3> > svd(m, ComputeFullV);
+    Vector3 axis = svd.matrixV().col(2);
+
+    Scalar w2 = (Scalar(1)+c)*Scalar(0.5);
+    this->w() = sqrt(w2);
+    this->vec() = axis * sqrt(Scalar(1) - w2);
+    return derived();
+  }
+  Vector3 axis = v0.cross(v1);
+  Scalar s = sqrt((Scalar(1)+c)*Scalar(2));
+  Scalar invs = Scalar(1)/s;
+  this->vec() = axis * invs;
+  this->w() = s * Scalar(0.5);
+
+  return derived();
+}
+
+
+/** Returns a quaternion representing a rotation between
+  * the two arbitrary vectors \a a and \a b. In other words, the built
+  * rotation represent a rotation sending the line of direction \a a
+  * to the line of direction \a b, both lines passing through the origin.
+  *
+  * \returns resulting quaternion
+  *
+  * Note that the two input vectors do \b not have to be normalized, and
+  * do not need to have the same norm.
+  */
+template<typename Scalar, int Options>
+template<typename Derived1, typename Derived2>
+Quaternion<Scalar,Options> Quaternion<Scalar,Options>::FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
+{
+    Quaternion quat;
+    quat.setFromTwoVectors(a, b);
+    return quat;
+}
+
+
+/** \returns the multiplicative inverse of \c *this
+  * Note that in most cases, i.e., if you simply want the opposite rotation,
+  * and/or the quaternion is normalized, then it is enough to use the conjugate.
+  *
+  * \sa QuaternionBase::conjugate()
+  */
+template <class Derived>
+inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Derived>::inverse() const
+{
+  // FIXME should this function be called multiplicativeInverse and conjugate() be called inverse() or opposite()  ??
+  Scalar n2 = this->squaredNorm();
+  if (n2 > 0)
+    return Quaternion<Scalar>(conjugate().coeffs() / n2);
+  else
+  {
+    // return an invalid result to flag the error
+    return Quaternion<Scalar>(Coefficients::Zero());
+  }
+}
+
+/** \returns the conjugate of the \c *this which is equal to the multiplicative inverse
+  * if the quaternion is normalized.
+  * The conjugate of a quaternion represents the opposite rotation.
+  *
+  * \sa Quaternion2::inverse()
+  */
+template <class Derived>
+inline Quaternion<typename internal::traits<Derived>::Scalar>
+QuaternionBase<Derived>::conjugate() const
+{
+  return Quaternion<Scalar>(this->w(),-this->x(),-this->y(),-this->z());
+}
+
+/** \returns the angle (in radian) between two rotations
+  * \sa dot()
+  */
+template <class Derived>
+template <class OtherDerived>
+inline typename internal::traits<Derived>::Scalar
+QuaternionBase<Derived>::angularDistance(const QuaternionBase<OtherDerived>& other) const
+{
+  using std::acos;
+  using std::abs;
+  Scalar d = abs(this->dot(other));
+  if (d>=Scalar(1))
+    return Scalar(0);
+  return Scalar(2) * acos(d);
+}
+
+ 
+    
+/** \returns the spherical linear interpolation between the two quaternions
+  * \c *this and \a other at the parameter \a t in [0;1].
+  * 
+  * This represents an interpolation for a constant motion between \c *this and \a other,
+  * see also http://en.wikipedia.org/wiki/Slerp.
+  */
+template <class Derived>
+template <class OtherDerived>
+Quaternion<typename internal::traits<Derived>::Scalar>
+QuaternionBase<Derived>::slerp(const Scalar& t, const QuaternionBase<OtherDerived>& other) const
+{
+  using std::acos;
+  using std::sin;
+  using std::abs;
+  static const Scalar one = Scalar(1) - NumTraits<Scalar>::epsilon();
+  Scalar d = this->dot(other);
+  Scalar absD = abs(d);
+
+  Scalar scale0;
+  Scalar scale1;
+
+  if(absD>=one)
+  {
+    scale0 = Scalar(1) - t;
+    scale1 = t;
+  }
+  else
+  {
+    // theta is the angle between the 2 quaternions
+    Scalar theta = acos(absD);
+    Scalar sinTheta = sin(theta);
+
+    scale0 = sin( ( Scalar(1) - t ) * theta) / sinTheta;
+    scale1 = sin( ( t * theta) ) / sinTheta;
+  }
+  if(d<0) scale1 = -scale1;
+
+  return Quaternion<Scalar>(scale0 * coeffs() + scale1 * other.coeffs());
+}
+
+namespace internal {
+
+// set from a rotation matrix
+template<typename Other>
+struct quaternionbase_assign_impl<Other,3,3>
+{
+  typedef typename Other::Scalar Scalar;
+  typedef DenseIndex Index;
+  template<class Derived> static inline void run(QuaternionBase<Derived>& q, const Other& mat)
+  {
+    using std::sqrt;
+    // This algorithm comes from  "Quaternion Calculus and Fast Animation",
+    // Ken Shoemake, 1987 SIGGRAPH course notes
+    Scalar t = mat.trace();
+    if (t > Scalar(0))
+    {
+      t = sqrt(t + Scalar(1.0));
+      q.w() = Scalar(0.5)*t;
+      t = Scalar(0.5)/t;
+      q.x() = (mat.coeff(2,1) - mat.coeff(1,2)) * t;
+      q.y() = (mat.coeff(0,2) - mat.coeff(2,0)) * t;
+      q.z() = (mat.coeff(1,0) - mat.coeff(0,1)) * t;
+    }
+    else
+    {
+      DenseIndex i = 0;
+      if (mat.coeff(1,1) > mat.coeff(0,0))
+        i = 1;
+      if (mat.coeff(2,2) > mat.coeff(i,i))
+        i = 2;
+      DenseIndex j = (i+1)%3;
+      DenseIndex k = (j+1)%3;
+
+      t = sqrt(mat.coeff(i,i)-mat.coeff(j,j)-mat.coeff(k,k) + Scalar(1.0));
+      q.coeffs().coeffRef(i) = Scalar(0.5) * t;
+      t = Scalar(0.5)/t;
+      q.w() = (mat.coeff(k,j)-mat.coeff(j,k))*t;
+      q.coeffs().coeffRef(j) = (mat.coeff(j,i)+mat.coeff(i,j))*t;
+      q.coeffs().coeffRef(k) = (mat.coeff(k,i)+mat.coeff(i,k))*t;
+    }
+  }
+};
+
+// set from a vector of coefficients assumed to be a quaternion
+template<typename Other>
+struct quaternionbase_assign_impl<Other,4,1>
+{
+  typedef typename Other::Scalar Scalar;
+  template<class Derived> static inline void run(QuaternionBase<Derived>& q, const Other& vec)
+  {
+    q.coeffs() = vec;
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_QUATERNION_H
diff --git a/include/eigen3/Eigen/src/Geometry/Rotation2D.h b/include/eigen3/Eigen/src/Geometry/Rotation2D.h
new file mode 100644
index 0000000..a2d59fc
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/Rotation2D.h
@@ -0,0 +1,160 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ROTATION2D_H
+#define EIGEN_ROTATION2D_H
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class Rotation2D
+  *
+  * \brief Represents a rotation/orientation in a 2 dimensional space.
+  *
+  * \param _Scalar the scalar type, i.e., the type of the coefficients
+  *
+  * This class is equivalent to a single scalar representing a counter clock wise rotation
+  * as a single angle in radian. It provides some additional features such as the automatic
+  * conversion from/to a 2x2 rotation matrix. Moreover this class aims to provide a similar
+  * interface to Quaternion in order to facilitate the writing of generic algorithms
+  * dealing with rotations.
+  *
+  * \sa class Quaternion, class Transform
+  */
+
+namespace internal {
+
+template<typename _Scalar> struct traits<Rotation2D<_Scalar> >
+{
+  typedef _Scalar Scalar;
+};
+} // end namespace internal
+
+template<typename _Scalar>
+class Rotation2D : public RotationBase<Rotation2D<_Scalar>,2>
+{
+  typedef RotationBase<Rotation2D<_Scalar>,2> Base;
+
+public:
+
+  using Base::operator*;
+
+  enum { Dim = 2 };
+  /** the scalar type of the coefficients */
+  typedef _Scalar Scalar;
+  typedef Matrix<Scalar,2,1> Vector2;
+  typedef Matrix<Scalar,2,2> Matrix2;
+
+protected:
+
+  Scalar m_angle;
+
+public:
+
+  /** Construct a 2D counter clock wise rotation from the angle \a a in radian. */
+  inline Rotation2D(const Scalar& a) : m_angle(a) {}
+  
+  /** Default constructor wihtout initialization. The represented rotation is undefined. */
+  Rotation2D() {}
+
+  /** \returns the rotation angle */
+  inline Scalar angle() const { return m_angle; }
+
+  /** \returns a read-write reference to the rotation angle */
+  inline Scalar& angle() { return m_angle; }
+
+  /** \returns the inverse rotation */
+  inline Rotation2D inverse() const { return -m_angle; }
+
+  /** Concatenates two rotations */
+  inline Rotation2D operator*(const Rotation2D& other) const
+  { return m_angle + other.m_angle; }
+
+  /** Concatenates two rotations */
+  inline Rotation2D& operator*=(const Rotation2D& other)
+  { m_angle += other.m_angle; return *this; }
+
+  /** Applies the rotation to a 2D vector */
+  Vector2 operator* (const Vector2& vec) const
+  { return toRotationMatrix() * vec; }
+  
+  template<typename Derived>
+  Rotation2D& fromRotationMatrix(const MatrixBase<Derived>& m);
+  Matrix2 toRotationMatrix() const;
+
+  /** \returns the spherical interpolation between \c *this and \a other using
+    * parameter \a t. It is in fact equivalent to a linear interpolation.
+    */
+  inline Rotation2D slerp(const Scalar& t, const Rotation2D& other) const
+  { return m_angle * (1-t) + other.angle() * t; }
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type cast() const
+  { return typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type(*this); }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit Rotation2D(const Rotation2D<OtherScalarType>& other)
+  {
+    m_angle = Scalar(other.angle());
+  }
+
+  static inline Rotation2D Identity() { return Rotation2D(0); }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const Rotation2D& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
+  { return internal::isApprox(m_angle,other.m_angle, prec); }
+};
+
+/** \ingroup Geometry_Module
+  * single precision 2D rotation type */
+typedef Rotation2D<float> Rotation2Df;
+/** \ingroup Geometry_Module
+  * double precision 2D rotation type */
+typedef Rotation2D<double> Rotation2Dd;
+
+/** Set \c *this from a 2x2 rotation matrix \a mat.
+  * In other words, this function extract the rotation angle
+  * from the rotation matrix.
+  */
+template<typename Scalar>
+template<typename Derived>
+Rotation2D<Scalar>& Rotation2D<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
+{
+  using std::atan2;
+  EIGEN_STATIC_ASSERT(Derived::RowsAtCompileTime==2 && Derived::ColsAtCompileTime==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
+  m_angle = atan2(mat.coeff(1,0), mat.coeff(0,0));
+  return *this;
+}
+
+/** Constructs and \returns an equivalent 2x2 rotation matrix.
+  */
+template<typename Scalar>
+typename Rotation2D<Scalar>::Matrix2
+Rotation2D<Scalar>::toRotationMatrix(void) const
+{
+  using std::sin;
+  using std::cos;
+  Scalar sinA = sin(m_angle);
+  Scalar cosA = cos(m_angle);
+  return (Matrix2() << cosA, -sinA, sinA, cosA).finished();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_ROTATION2D_H
diff --git a/include/eigen3/Eigen/src/Geometry/RotationBase.h b/include/eigen3/Eigen/src/Geometry/RotationBase.h
new file mode 100644
index 0000000..b88661d
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/RotationBase.h
@@ -0,0 +1,206 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ROTATIONBASE_H
+#define EIGEN_ROTATIONBASE_H
+
+namespace Eigen { 
+
+// forward declaration
+namespace internal {
+template<typename RotationDerived, typename MatrixType, bool IsVector=MatrixType::IsVectorAtCompileTime>
+struct rotation_base_generic_product_selector;
+}
+
+/** \class RotationBase
+  *
+  * \brief Common base class for compact rotation representations
+  *
+  * \param Derived is the derived type, i.e., a rotation type
+  * \param _Dim the dimension of the space
+  */
+template<typename Derived, int _Dim>
+class RotationBase
+{
+  public:
+    enum { Dim = _Dim };
+    /** the scalar type of the coefficients */
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+
+    /** corresponding linear transformation matrix type */
+    typedef Matrix<Scalar,Dim,Dim> RotationMatrixType;
+    typedef Matrix<Scalar,Dim,1> VectorType;
+
+  public:
+    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    inline Derived& derived() { return *static_cast<Derived*>(this); }
+
+    /** \returns an equivalent rotation matrix */
+    inline RotationMatrixType toRotationMatrix() const { return derived().toRotationMatrix(); }
+
+    /** \returns an equivalent rotation matrix 
+      * This function is added to be conform with the Transform class' naming scheme.
+      */
+    inline RotationMatrixType matrix() const { return derived().toRotationMatrix(); }
+
+    /** \returns the inverse rotation */
+    inline Derived inverse() const { return derived().inverse(); }
+
+    /** \returns the concatenation of the rotation \c *this with a translation \a t */
+    inline Transform<Scalar,Dim,Isometry> operator*(const Translation<Scalar,Dim>& t) const
+    { return Transform<Scalar,Dim,Isometry>(*this) * t; }
+
+    /** \returns the concatenation of the rotation \c *this with a uniform scaling \a s */
+    inline RotationMatrixType operator*(const UniformScaling<Scalar>& s) const
+    { return toRotationMatrix() * s.factor(); }
+
+    /** \returns the concatenation of the rotation \c *this with a generic expression \a e
+      * \a e can be:
+      *  - a DimxDim linear transformation matrix
+      *  - a DimxDim diagonal matrix (axis aligned scaling)
+      *  - a vector of size Dim
+      */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE typename internal::rotation_base_generic_product_selector<Derived,OtherDerived,OtherDerived::IsVectorAtCompileTime>::ReturnType
+    operator*(const EigenBase<OtherDerived>& e) const
+    { return internal::rotation_base_generic_product_selector<Derived,OtherDerived>::run(derived(), e.derived()); }
+
+    /** \returns the concatenation of a linear transformation \a l with the rotation \a r */
+    template<typename OtherDerived> friend
+    inline RotationMatrixType operator*(const EigenBase<OtherDerived>& l, const Derived& r)
+    { return l.derived() * r.toRotationMatrix(); }
+
+    /** \returns the concatenation of a scaling \a l with the rotation \a r */
+    friend inline Transform<Scalar,Dim,Affine> operator*(const DiagonalMatrix<Scalar,Dim>& l, const Derived& r)
+    { 
+      Transform<Scalar,Dim,Affine> res(r);
+      res.linear().applyOnTheLeft(l);
+      return res;
+    }
+
+    /** \returns the concatenation of the rotation \c *this with a transformation \a t */
+    template<int Mode, int Options>
+    inline Transform<Scalar,Dim,Mode> operator*(const Transform<Scalar,Dim,Mode,Options>& t) const
+    { return toRotationMatrix() * t; }
+
+    template<typename OtherVectorType>
+    inline VectorType _transformVector(const OtherVectorType& v) const
+    { return toRotationMatrix() * v; }
+};
+
+namespace internal {
+
+// implementation of the generic product rotation * matrix
+template<typename RotationDerived, typename MatrixType>
+struct rotation_base_generic_product_selector<RotationDerived,MatrixType,false>
+{
+  enum { Dim = RotationDerived::Dim };
+  typedef Matrix<typename RotationDerived::Scalar,Dim,Dim> ReturnType;
+  static inline ReturnType run(const RotationDerived& r, const MatrixType& m)
+  { return r.toRotationMatrix() * m; }
+};
+
+template<typename RotationDerived, typename Scalar, int Dim, int MaxDim>
+struct rotation_base_generic_product_selector< RotationDerived, DiagonalMatrix<Scalar,Dim,MaxDim>, false >
+{
+  typedef Transform<Scalar,Dim,Affine> ReturnType;
+  static inline ReturnType run(const RotationDerived& r, const DiagonalMatrix<Scalar,Dim,MaxDim>& m)
+  {
+    ReturnType res(r);
+    res.linear() *= m;
+    return res;
+  }
+};
+
+template<typename RotationDerived,typename OtherVectorType>
+struct rotation_base_generic_product_selector<RotationDerived,OtherVectorType,true>
+{
+  enum { Dim = RotationDerived::Dim };
+  typedef Matrix<typename RotationDerived::Scalar,Dim,1> ReturnType;
+  static EIGEN_STRONG_INLINE ReturnType run(const RotationDerived& r, const OtherVectorType& v)
+  {
+    return r._transformVector(v);
+  }
+};
+
+} // end namespace internal
+
+/** \geometry_module
+  *
+  * \brief Constructs a Dim x Dim rotation matrix from the rotation \a r
+  */
+template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
+template<typename OtherDerived>
+Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
+::Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
+{
+  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim))
+  *this = r.toRotationMatrix();
+}
+
+/** \geometry_module
+  *
+  * \brief Set a Dim x Dim rotation matrix from the rotation \a r
+  */
+template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
+template<typename OtherDerived>
+Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>&
+Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
+::operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
+{
+  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim))
+  return *this = r.toRotationMatrix();
+}
+
+namespace internal {
+
+/** \internal
+  *
+  * Helper function to return an arbitrary rotation object to a rotation matrix.
+  *
+  * \param Scalar the numeric type of the matrix coefficients
+  * \param Dim the dimension of the current space
+  *
+  * It returns a Dim x Dim fixed size matrix.
+  *
+  * Default specializations are provided for:
+  *   - any scalar type (2D),
+  *   - any matrix expression,
+  *   - any type based on RotationBase (e.g., Quaternion, AngleAxis, Rotation2D)
+  *
+  * Currently toRotationMatrix is only used by Transform.
+  *
+  * \sa class Transform, class Rotation2D, class Quaternion, class AngleAxis
+  */
+template<typename Scalar, int Dim>
+static inline Matrix<Scalar,2,2> toRotationMatrix(const Scalar& s)
+{
+  EIGEN_STATIC_ASSERT(Dim==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
+  return Rotation2D<Scalar>(s).toRotationMatrix();
+}
+
+template<typename Scalar, int Dim, typename OtherDerived>
+static inline Matrix<Scalar,Dim,Dim> toRotationMatrix(const RotationBase<OtherDerived,Dim>& r)
+{
+  return r.toRotationMatrix();
+}
+
+template<typename Scalar, int Dim, typename OtherDerived>
+static inline const MatrixBase<OtherDerived>& toRotationMatrix(const MatrixBase<OtherDerived>& mat)
+{
+  EIGEN_STATIC_ASSERT(OtherDerived::RowsAtCompileTime==Dim && OtherDerived::ColsAtCompileTime==Dim,
+    YOU_MADE_A_PROGRAMMING_MISTAKE)
+  return mat;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_ROTATIONBASE_H
diff --git a/include/eigen3/Eigen/src/Geometry/Scaling.h b/include/eigen3/Eigen/src/Geometry/Scaling.h
new file mode 100644
index 0000000..1c25f36
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/Scaling.h
@@ -0,0 +1,166 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SCALING_H
+#define EIGEN_SCALING_H
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class Scaling
+  *
+  * \brief Represents a generic uniform scaling transformation
+  *
+  * \param _Scalar the scalar type, i.e., the type of the coefficients.
+  *
+  * This class represent a uniform scaling transformation. It is the return
+  * type of Scaling(Scalar), and most of the time this is the only way it
+  * is used. In particular, this class is not aimed to be used to store a scaling transformation,
+  * but rather to make easier the constructions and updates of Transform objects.
+  *
+  * To represent an axis aligned scaling, use the DiagonalMatrix class.
+  *
+  * \sa Scaling(), class DiagonalMatrix, MatrixBase::asDiagonal(), class Translation, class Transform
+  */
+template<typename _Scalar>
+class UniformScaling
+{
+public:
+  /** the scalar type of the coefficients */
+  typedef _Scalar Scalar;
+
+protected:
+
+  Scalar m_factor;
+
+public:
+
+  /** Default constructor without initialization. */
+  UniformScaling() {}
+  /** Constructs and initialize a uniform scaling transformation */
+  explicit inline UniformScaling(const Scalar& s) : m_factor(s) {}
+
+  inline const Scalar& factor() const { return m_factor; }
+  inline Scalar& factor() { return m_factor; }
+
+  /** Concatenates two uniform scaling */
+  inline UniformScaling operator* (const UniformScaling& other) const
+  { return UniformScaling(m_factor * other.factor()); }
+
+  /** Concatenates a uniform scaling and a translation */
+  template<int Dim>
+  inline Transform<Scalar,Dim,Affine> operator* (const Translation<Scalar,Dim>& t) const;
+
+  /** Concatenates a uniform scaling and an affine transformation */
+  template<int Dim, int Mode, int Options>
+  inline Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> operator* (const Transform<Scalar,Dim, Mode, Options>& t) const
+  {
+   Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> res = t;
+   res.prescale(factor());
+   return res;
+}
+
+  /** Concatenates a uniform scaling and a linear transformation matrix */
+  // TODO returns an expression
+  template<typename Derived>
+  inline typename internal::plain_matrix_type<Derived>::type operator* (const MatrixBase<Derived>& other) const
+  { return other * m_factor; }
+
+  template<typename Derived,int Dim>
+  inline Matrix<Scalar,Dim,Dim> operator*(const RotationBase<Derived,Dim>& r) const
+  { return r.toRotationMatrix() * m_factor; }
+
+  /** \returns the inverse scaling */
+  inline UniformScaling inverse() const
+  { return UniformScaling(Scalar(1)/m_factor); }
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline UniformScaling<NewScalarType> cast() const
+  { return UniformScaling<NewScalarType>(NewScalarType(m_factor)); }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit UniformScaling(const UniformScaling<OtherScalarType>& other)
+  { m_factor = Scalar(other.factor()); }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const UniformScaling& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
+  { return internal::isApprox(m_factor, other.factor(), prec); }
+
+};
+
+/** Concatenates a linear transformation matrix and a uniform scaling */
+// NOTE this operator is defiend in MatrixBase and not as a friend function
+// of UniformScaling to fix an internal crash of Intel's ICC
+template<typename Derived> typename MatrixBase<Derived>::ScalarMultipleReturnType
+MatrixBase<Derived>::operator*(const UniformScaling<Scalar>& s) const
+{ return derived() * s.factor(); }
+
+/** Constructs a uniform scaling from scale factor \a s */
+static inline UniformScaling<float> Scaling(float s) { return UniformScaling<float>(s); }
+/** Constructs a uniform scaling from scale factor \a s */
+static inline UniformScaling<double> Scaling(double s) { return UniformScaling<double>(s); }
+/** Constructs a uniform scaling from scale factor \a s */
+template<typename RealScalar>
+static inline UniformScaling<std::complex<RealScalar> > Scaling(const std::complex<RealScalar>& s)
+{ return UniformScaling<std::complex<RealScalar> >(s); }
+
+/** Constructs a 2D axis aligned scaling */
+template<typename Scalar>
+static inline DiagonalMatrix<Scalar,2> Scaling(const Scalar& sx, const Scalar& sy)
+{ return DiagonalMatrix<Scalar,2>(sx, sy); }
+/** Constructs a 3D axis aligned scaling */
+template<typename Scalar>
+static inline DiagonalMatrix<Scalar,3> Scaling(const Scalar& sx, const Scalar& sy, const Scalar& sz)
+{ return DiagonalMatrix<Scalar,3>(sx, sy, sz); }
+
+/** Constructs an axis aligned scaling expression from vector expression \a coeffs
+  * This is an alias for coeffs.asDiagonal()
+  */
+template<typename Derived>
+static inline const DiagonalWrapper<const Derived> Scaling(const MatrixBase<Derived>& coeffs)
+{ return coeffs.asDiagonal(); }
+
+/** \addtogroup Geometry_Module */
+//@{
+/** \deprecated */
+typedef DiagonalMatrix<float, 2> AlignedScaling2f;
+/** \deprecated */
+typedef DiagonalMatrix<double,2> AlignedScaling2d;
+/** \deprecated */
+typedef DiagonalMatrix<float, 3> AlignedScaling3f;
+/** \deprecated */
+typedef DiagonalMatrix<double,3> AlignedScaling3d;
+//@}
+
+template<typename Scalar>
+template<int Dim>
+inline Transform<Scalar,Dim,Affine>
+UniformScaling<Scalar>::operator* (const Translation<Scalar,Dim>& t) const
+{
+  Transform<Scalar,Dim,Affine> res;
+  res.matrix().setZero();
+  res.linear().diagonal().fill(factor());
+  res.translation() = factor() * t.vector();
+  res(Dim,Dim) = Scalar(1);
+  return res;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SCALING_H
diff --git a/include/eigen3/Eigen/src/Geometry/Transform.h b/include/eigen3/Eigen/src/Geometry/Transform.h
new file mode 100644
index 0000000..e786e53
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/Transform.h
@@ -0,0 +1,1455 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRANSFORM_H
+#define EIGEN_TRANSFORM_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Transform>
+struct transform_traits
+{
+  enum
+  {
+    Dim = Transform::Dim,
+    HDim = Transform::HDim,
+    Mode = Transform::Mode,
+    IsProjective = (int(Mode)==int(Projective))
+  };
+};
+
+template< typename TransformType,
+          typename MatrixType,
+          int Case = transform_traits<TransformType>::IsProjective ? 0
+                   : int(MatrixType::RowsAtCompileTime) == int(transform_traits<TransformType>::HDim) ? 1
+                   : 2>
+struct transform_right_product_impl;
+
+template< typename Other,
+          int Mode,
+          int Options,
+          int Dim,
+          int HDim,
+          int OtherRows=Other::RowsAtCompileTime,
+          int OtherCols=Other::ColsAtCompileTime>
+struct transform_left_product_impl;
+
+template< typename Lhs,
+          typename Rhs,
+          bool AnyProjective = 
+            transform_traits<Lhs>::IsProjective ||
+            transform_traits<Rhs>::IsProjective>
+struct transform_transform_product_impl;
+
+template< typename Other,
+          int Mode,
+          int Options,
+          int Dim,
+          int HDim,
+          int OtherRows=Other::RowsAtCompileTime,
+          int OtherCols=Other::ColsAtCompileTime>
+struct transform_construct_from_matrix;
+
+template<typename TransformType> struct transform_take_affine_part;
+
+template<int Mode> struct transform_make_affine;
+
+} // end namespace internal
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class Transform
+  *
+  * \brief Represents an homogeneous transformation in a N dimensional space
+  *
+  * \tparam _Scalar the scalar type, i.e., the type of the coefficients
+  * \tparam _Dim the dimension of the space
+  * \tparam _Mode the type of the transformation. Can be:
+  *              - #Affine: the transformation is stored as a (Dim+1)^2 matrix,
+  *                         where the last row is assumed to be [0 ... 0 1].
+  *              - #AffineCompact: the transformation is stored as a (Dim)x(Dim+1) matrix.
+  *              - #Projective: the transformation is stored as a (Dim+1)^2 matrix
+  *                             without any assumption.
+  * \tparam _Options has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor.
+  *                  These Options are passed directly to the underlying matrix type.
+  *
+  * The homography is internally represented and stored by a matrix which
+  * is available through the matrix() method. To understand the behavior of
+  * this class you have to think a Transform object as its internal
+  * matrix representation. The chosen convention is right multiply:
+  *
+  * \code v' = T * v \endcode
+  *
+  * Therefore, an affine transformation matrix M is shaped like this:
+  *
+  * \f$ \left( \begin{array}{cc}
+  * linear & translation\\
+  * 0 ... 0 & 1
+  * \end{array} \right) \f$
+  *
+  * Note that for a projective transformation the last row can be anything,
+  * and then the interpretation of different parts might be sightly different.
+  *
+  * However, unlike a plain matrix, the Transform class provides many features
+  * simplifying both its assembly and usage. In particular, it can be composed
+  * with any other transformations (Transform,Translation,RotationBase,Matrix)
+  * and can be directly used to transform implicit homogeneous vectors. All these
+  * operations are handled via the operator*. For the composition of transformations,
+  * its principle consists to first convert the right/left hand sides of the product
+  * to a compatible (Dim+1)^2 matrix and then perform a pure matrix product.
+  * Of course, internally, operator* tries to perform the minimal number of operations
+  * according to the nature of each terms. Likewise, when applying the transform
+  * to non homogeneous vectors, the latters are automatically promoted to homogeneous
+  * one before doing the matrix product. The convertions to homogeneous representations
+  * are performed as follow:
+  *
+  * \b Translation t (Dim)x(1):
+  * \f$ \left( \begin{array}{cc}
+  * I & t \\
+  * 0\,...\,0 & 1
+  * \end{array} \right) \f$
+  *
+  * \b Rotation R (Dim)x(Dim):
+  * \f$ \left( \begin{array}{cc}
+  * R & 0\\
+  * 0\,...\,0 & 1
+  * \end{array} \right) \f$
+  *
+  * \b Linear \b Matrix L (Dim)x(Dim):
+  * \f$ \left( \begin{array}{cc}
+  * L & 0\\
+  * 0\,...\,0 & 1
+  * \end{array} \right) \f$
+  *
+  * \b Affine \b Matrix A (Dim)x(Dim+1):
+  * \f$ \left( \begin{array}{c}
+  * A\\
+  * 0\,...\,0\,1
+  * \end{array} \right) \f$
+  *
+  * \b Column \b vector v (Dim)x(1):
+  * \f$ \left( \begin{array}{c}
+  * v\\
+  * 1
+  * \end{array} \right) \f$
+  *
+  * \b Set \b of \b column \b vectors V1...Vn (Dim)x(n):
+  * \f$ \left( \begin{array}{ccc}
+  * v_1 & ... & v_n\\
+  * 1 & ... & 1
+  * \end{array} \right) \f$
+  *
+  * The concatenation of a Transform object with any kind of other transformation
+  * always returns a Transform object.
+  *
+  * A little exception to the "as pure matrix product" rule is the case of the
+  * transformation of non homogeneous vectors by an affine transformation. In
+  * that case the last matrix row can be ignored, and the product returns non
+  * homogeneous vectors.
+  *
+  * Since, for instance, a Dim x Dim matrix is interpreted as a linear transformation,
+  * it is not possible to directly transform Dim vectors stored in a Dim x Dim matrix.
+  * The solution is either to use a Dim x Dynamic matrix or explicitly request a
+  * vector transformation by making the vector homogeneous:
+  * \code
+  * m' = T * m.colwise().homogeneous();
+  * \endcode
+  * Note that there is zero overhead.
+  *
+  * Conversion methods from/to Qt's QMatrix and QTransform are available if the
+  * preprocessor token EIGEN_QT_SUPPORT is defined.
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_TRANSFORM_PLUGIN.
+  *
+  * \sa class Matrix, class Quaternion
+  */
+template<typename _Scalar, int _Dim, int _Mode, int _Options>
+class Transform
+{
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim==Dynamic ? Dynamic : (_Dim+1)*(_Dim+1))
+  enum {
+    Mode = _Mode,
+    Options = _Options,
+    Dim = _Dim,     ///< space dimension in which the transformation holds
+    HDim = _Dim+1,  ///< size of a respective homogeneous vector
+    Rows = int(Mode)==(AffineCompact) ? Dim : HDim
+  };
+  /** the scalar type of the coefficients */
+  typedef _Scalar Scalar;
+  typedef DenseIndex Index;
+  /** type of the matrix used to represent the transformation */
+  typedef typename internal::make_proper_matrix_type<Scalar,Rows,HDim,Options>::type MatrixType;
+  /** constified MatrixType */
+  typedef const MatrixType ConstMatrixType;
+  /** type of the matrix used to represent the linear part of the transformation */
+  typedef Matrix<Scalar,Dim,Dim,Options> LinearMatrixType;
+  /** type of read/write reference to the linear part of the transformation */
+  typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> LinearPart;
+  /** type of read reference to the linear part of the transformation */
+  typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> ConstLinearPart;
+  /** type of read/write reference to the affine part of the transformation */
+  typedef typename internal::conditional<int(Mode)==int(AffineCompact),
+                              MatrixType&,
+                              Block<MatrixType,Dim,HDim> >::type AffinePart;
+  /** type of read reference to the affine part of the transformation */
+  typedef typename internal::conditional<int(Mode)==int(AffineCompact),
+                              const MatrixType&,
+                              const Block<const MatrixType,Dim,HDim> >::type ConstAffinePart;
+  /** type of a vector */
+  typedef Matrix<Scalar,Dim,1> VectorType;
+  /** type of a read/write reference to the translation part of the rotation */
+  typedef Block<MatrixType,Dim,1,int(Mode)==(AffineCompact)> TranslationPart;
+  /** type of a read reference to the translation part of the rotation */
+  typedef const Block<ConstMatrixType,Dim,1,int(Mode)==(AffineCompact)> ConstTranslationPart;
+  /** corresponding translation type */
+  typedef Translation<Scalar,Dim> TranslationType;
+  
+  // this intermediate enum is needed to avoid an ICE with gcc 3.4 and 4.0
+  enum { TransformTimeDiagonalMode = ((Mode==int(Isometry))?Affine:int(Mode)) };
+  /** The return type of the product between a diagonal matrix and a transform */
+  typedef Transform<Scalar,Dim,TransformTimeDiagonalMode> TransformTimeDiagonalReturnType;
+
+protected:
+
+  MatrixType m_matrix;
+
+public:
+
+  /** Default constructor without initialization of the meaningful coefficients.
+    * If Mode==Affine, then the last row is set to [0 ... 0 1] */
+  inline Transform()
+  {
+    check_template_params();
+    internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix);
+  }
+
+  inline Transform(const Transform& other)
+  {
+    check_template_params();
+    m_matrix = other.m_matrix;
+  }
+
+  inline explicit Transform(const TranslationType& t)
+  {
+    check_template_params();
+    *this = t;
+  }
+  inline explicit Transform(const UniformScaling<Scalar>& s)
+  {
+    check_template_params();
+    *this = s;
+  }
+  template<typename Derived>
+  inline explicit Transform(const RotationBase<Derived, Dim>& r)
+  {
+    check_template_params();
+    *this = r;
+  }
+
+  inline Transform& operator=(const Transform& other)
+  { m_matrix = other.m_matrix; return *this; }
+
+  typedef internal::transform_take_affine_part<Transform> take_affine_part;
+
+  /** Constructs and initializes a transformation from a Dim^2 or a (Dim+1)^2 matrix. */
+  template<typename OtherDerived>
+  inline explicit Transform(const EigenBase<OtherDerived>& other)
+  {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar,typename OtherDerived::Scalar>::value),
+      YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY);
+
+    check_template_params();
+    internal::transform_construct_from_matrix<OtherDerived,Mode,Options,Dim,HDim>::run(this, other.derived());
+  }
+
+  /** Set \c *this from a Dim^2 or (Dim+1)^2 matrix. */
+  template<typename OtherDerived>
+  inline Transform& operator=(const EigenBase<OtherDerived>& other)
+  {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar,typename OtherDerived::Scalar>::value),
+      YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY);
+
+    internal::transform_construct_from_matrix<OtherDerived,Mode,Options,Dim,HDim>::run(this, other.derived());
+    return *this;
+  }
+  
+  template<int OtherOptions>
+  inline Transform(const Transform<Scalar,Dim,Mode,OtherOptions>& other)
+  {
+    check_template_params();
+    // only the options change, we can directly copy the matrices
+    m_matrix = other.matrix();
+  }
+
+  template<int OtherMode,int OtherOptions>
+  inline Transform(const Transform<Scalar,Dim,OtherMode,OtherOptions>& other)
+  {
+    check_template_params();
+    // prevent conversions as:
+    // Affine | AffineCompact | Isometry = Projective
+    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(OtherMode==int(Projective), Mode==int(Projective)),
+                        YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION)
+
+    // prevent conversions as:
+    // Isometry = Affine | AffineCompact
+    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(OtherMode==int(Affine)||OtherMode==int(AffineCompact), Mode!=int(Isometry)),
+                        YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION)
+
+    enum { ModeIsAffineCompact = Mode == int(AffineCompact),
+           OtherModeIsAffineCompact = OtherMode == int(AffineCompact)
+    };
+
+    if(ModeIsAffineCompact == OtherModeIsAffineCompact)
+    {
+      // We need the block expression because the code is compiled for all
+      // combinations of transformations and will trigger a compile time error
+      // if one tries to assign the matrices directly
+      m_matrix.template block<Dim,Dim+1>(0,0) = other.matrix().template block<Dim,Dim+1>(0,0);
+      makeAffine();
+    }
+    else if(OtherModeIsAffineCompact)
+    {
+      typedef typename Transform<Scalar,Dim,OtherMode,OtherOptions>::MatrixType OtherMatrixType;
+      internal::transform_construct_from_matrix<OtherMatrixType,Mode,Options,Dim,HDim>::run(this, other.matrix());
+    }
+    else
+    {
+      // here we know that Mode == AffineCompact and OtherMode != AffineCompact.
+      // if OtherMode were Projective, the static assert above would already have caught it.
+      // So the only possibility is that OtherMode == Affine
+      linear() = other.linear();
+      translation() = other.translation();
+    }
+  }
+
+  template<typename OtherDerived>
+  Transform(const ReturnByValue<OtherDerived>& other)
+  {
+    check_template_params();
+    other.evalTo(*this);
+  }
+
+  template<typename OtherDerived>
+  Transform& operator=(const ReturnByValue<OtherDerived>& other)
+  {
+    other.evalTo(*this);
+    return *this;
+  }
+
+  #ifdef EIGEN_QT_SUPPORT
+  inline Transform(const QMatrix& other);
+  inline Transform& operator=(const QMatrix& other);
+  inline QMatrix toQMatrix(void) const;
+  inline Transform(const QTransform& other);
+  inline Transform& operator=(const QTransform& other);
+  inline QTransform toQTransform(void) const;
+  #endif
+
+  /** shortcut for m_matrix(row,col);
+    * \sa MatrixBase::operator(Index,Index) const */
+  inline Scalar operator() (Index row, Index col) const { return m_matrix(row,col); }
+  /** shortcut for m_matrix(row,col);
+    * \sa MatrixBase::operator(Index,Index) */
+  inline Scalar& operator() (Index row, Index col) { return m_matrix(row,col); }
+
+  /** \returns a read-only expression of the transformation matrix */
+  inline const MatrixType& matrix() const { return m_matrix; }
+  /** \returns a writable expression of the transformation matrix */
+  inline MatrixType& matrix() { return m_matrix; }
+
+  /** \returns a read-only expression of the linear part of the transformation */
+  inline ConstLinearPart linear() const { return ConstLinearPart(m_matrix,0,0); }
+  /** \returns a writable expression of the linear part of the transformation */
+  inline LinearPart linear() { return LinearPart(m_matrix,0,0); }
+
+  /** \returns a read-only expression of the Dim x HDim affine part of the transformation */
+  inline ConstAffinePart affine() const { return take_affine_part::run(m_matrix); }
+  /** \returns a writable expression of the Dim x HDim affine part of the transformation */
+  inline AffinePart affine() { return take_affine_part::run(m_matrix); }
+
+  /** \returns a read-only expression of the translation vector of the transformation */
+  inline ConstTranslationPart translation() const { return ConstTranslationPart(m_matrix,0,Dim); }
+  /** \returns a writable expression of the translation vector of the transformation */
+  inline TranslationPart translation() { return TranslationPart(m_matrix,0,Dim); }
+
+  /** \returns an expression of the product between the transform \c *this and a matrix expression \a other
+    *
+    * The right hand side \a other might be either:
+    * \li a vector of size Dim,
+    * \li an homogeneous vector of size Dim+1,
+    * \li a set of vectors of size Dim x Dynamic,
+    * \li a set of homogeneous vectors of size Dim+1 x Dynamic,
+    * \li a linear transformation matrix of size Dim x Dim,
+    * \li an affine transformation matrix of size Dim x Dim+1,
+    * \li a transformation matrix of size Dim+1 x Dim+1.
+    */
+  // note: this function is defined here because some compilers cannot find the respective declaration
+  template<typename OtherDerived>
+  EIGEN_STRONG_INLINE const typename internal::transform_right_product_impl<Transform, OtherDerived>::ResultType
+  operator * (const EigenBase<OtherDerived> &other) const
+  { return internal::transform_right_product_impl<Transform, OtherDerived>::run(*this,other.derived()); }
+
+  /** \returns the product expression of a transformation matrix \a a times a transform \a b
+    *
+    * The left hand side \a other might be either:
+    * \li a linear transformation matrix of size Dim x Dim,
+    * \li an affine transformation matrix of size Dim x Dim+1,
+    * \li a general transformation matrix of size Dim+1 x Dim+1.
+    */
+  template<typename OtherDerived> friend
+  inline const typename internal::transform_left_product_impl<OtherDerived,Mode,Options,_Dim,_Dim+1>::ResultType
+    operator * (const EigenBase<OtherDerived> &a, const Transform &b)
+  { return internal::transform_left_product_impl<OtherDerived,Mode,Options,Dim,HDim>::run(a.derived(),b); }
+
+  /** \returns The product expression of a transform \a a times a diagonal matrix \a b
+    *
+    * The rhs diagonal matrix is interpreted as an affine scaling transformation. The
+    * product results in a Transform of the same type (mode) as the lhs only if the lhs 
+    * mode is no isometry. In that case, the returned transform is an affinity.
+    */
+  template<typename DiagonalDerived>
+  inline const TransformTimeDiagonalReturnType
+    operator * (const DiagonalBase<DiagonalDerived> &b) const
+  {
+    TransformTimeDiagonalReturnType res(*this);
+    res.linear() *= b;
+    return res;
+  }
+
+  /** \returns The product expression of a diagonal matrix \a a times a transform \a b
+    *
+    * The lhs diagonal matrix is interpreted as an affine scaling transformation. The
+    * product results in a Transform of the same type (mode) as the lhs only if the lhs 
+    * mode is no isometry. In that case, the returned transform is an affinity.
+    */
+  template<typename DiagonalDerived>
+  friend inline TransformTimeDiagonalReturnType
+    operator * (const DiagonalBase<DiagonalDerived> &a, const Transform &b)
+  {
+    TransformTimeDiagonalReturnType res;
+    res.linear().noalias() = a*b.linear();
+    res.translation().noalias() = a*b.translation();
+    if (Mode!=int(AffineCompact))
+      res.matrix().row(Dim) = b.matrix().row(Dim);
+    return res;
+  }
+
+  template<typename OtherDerived>
+  inline Transform& operator*=(const EigenBase<OtherDerived>& other) { return *this = *this * other; }
+
+  /** Concatenates two transformations */
+  inline const Transform operator * (const Transform& other) const
+  {
+    return internal::transform_transform_product_impl<Transform,Transform>::run(*this,other);
+  }
+  
+  #ifdef __INTEL_COMPILER
+private:
+  // this intermediate structure permits to workaround a bug in ICC 11:
+  //   error: template instantiation resulted in unexpected function type of "Eigen::Transform<double, 3, 32, 0>
+  //             (const Eigen::Transform<double, 3, 2, 0> &) const"
+  //  (the meaning of a name may have changed since the template declaration -- the type of the template is:
+  // "Eigen::internal::transform_transform_product_impl<Eigen::Transform<double, 3, 32, 0>,
+  //     Eigen::Transform<double, 3, Mode, Options>, <expression>>::ResultType (const Eigen::Transform<double, 3, Mode, Options> &) const")
+  // 
+  template<int OtherMode,int OtherOptions> struct icc_11_workaround
+  {
+    typedef internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> > ProductType;
+    typedef typename ProductType::ResultType ResultType;
+  };
+  
+public:
+  /** Concatenates two different transformations */
+  template<int OtherMode,int OtherOptions>
+  inline typename icc_11_workaround<OtherMode,OtherOptions>::ResultType
+    operator * (const Transform<Scalar,Dim,OtherMode,OtherOptions>& other) const
+  {
+    typedef typename icc_11_workaround<OtherMode,OtherOptions>::ProductType ProductType;
+    return ProductType::run(*this,other);
+  }
+  #else
+  /** Concatenates two different transformations */
+  template<int OtherMode,int OtherOptions>
+  inline typename internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> >::ResultType
+    operator * (const Transform<Scalar,Dim,OtherMode,OtherOptions>& other) const
+  {
+    return internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> >::run(*this,other);
+  }
+  #endif
+
+  /** \sa MatrixBase::setIdentity() */
+  void setIdentity() { m_matrix.setIdentity(); }
+
+  /**
+   * \brief Returns an identity transformation.
+   * \todo In the future this function should be returning a Transform expression.
+   */
+  static const Transform Identity()
+  {
+    return Transform(MatrixType::Identity());
+  }
+
+  template<typename OtherDerived>
+  inline Transform& scale(const MatrixBase<OtherDerived> &other);
+
+  template<typename OtherDerived>
+  inline Transform& prescale(const MatrixBase<OtherDerived> &other);
+
+  inline Transform& scale(const Scalar& s);
+  inline Transform& prescale(const Scalar& s);
+
+  template<typename OtherDerived>
+  inline Transform& translate(const MatrixBase<OtherDerived> &other);
+
+  template<typename OtherDerived>
+  inline Transform& pretranslate(const MatrixBase<OtherDerived> &other);
+
+  template<typename RotationType>
+  inline Transform& rotate(const RotationType& rotation);
+
+  template<typename RotationType>
+  inline Transform& prerotate(const RotationType& rotation);
+
+  Transform& shear(const Scalar& sx, const Scalar& sy);
+  Transform& preshear(const Scalar& sx, const Scalar& sy);
+
+  inline Transform& operator=(const TranslationType& t);
+  inline Transform& operator*=(const TranslationType& t) { return translate(t.vector()); }
+  inline Transform operator*(const TranslationType& t) const;
+
+  inline Transform& operator=(const UniformScaling<Scalar>& t);
+  inline Transform& operator*=(const UniformScaling<Scalar>& s) { return scale(s.factor()); }
+  inline Transform<Scalar,Dim,(int(Mode)==int(Isometry)?int(Affine):int(Mode))> operator*(const UniformScaling<Scalar>& s) const
+  {
+    Transform<Scalar,Dim,(int(Mode)==int(Isometry)?int(Affine):int(Mode)),Options> res = *this;
+    res.scale(s.factor());
+    return res;
+  }
+
+  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linear() *= s; return *this; }
+
+  template<typename Derived>
+  inline Transform& operator=(const RotationBase<Derived,Dim>& r);
+  template<typename Derived>
+  inline Transform& operator*=(const RotationBase<Derived,Dim>& r) { return rotate(r.toRotationMatrix()); }
+  template<typename Derived>
+  inline Transform operator*(const RotationBase<Derived,Dim>& r) const;
+
+  const LinearMatrixType rotation() const;
+  template<typename RotationMatrixType, typename ScalingMatrixType>
+  void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const;
+  template<typename ScalingMatrixType, typename RotationMatrixType>
+  void computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const;
+
+  template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
+  Transform& fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
+    const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale);
+
+  inline Transform inverse(TransformTraits traits = (TransformTraits)Mode) const;
+
+  /** \returns a const pointer to the column major internal matrix */
+  const Scalar* data() const { return m_matrix.data(); }
+  /** \returns a non-const pointer to the column major internal matrix */
+  Scalar* data() { return m_matrix.data(); }
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim,Mode,Options> >::type cast() const
+  { return typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim,Mode,Options> >::type(*this); }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit Transform(const Transform<OtherScalarType,Dim,Mode,Options>& other)
+  {
+    check_template_params();
+    m_matrix = other.matrix().template cast<Scalar>();
+  }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const Transform& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
+  { return m_matrix.isApprox(other.m_matrix, prec); }
+
+  /** Sets the last row to [0 ... 0 1]
+    */
+  void makeAffine()
+  {
+    internal::transform_make_affine<int(Mode)>::run(m_matrix);
+  }
+
+  /** \internal
+    * \returns the Dim x Dim linear part if the transformation is affine,
+    *          and the HDim x Dim part for projective transformations.
+    */
+  inline Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,Dim> linearExt()
+  { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,Dim>(0,0); }
+  /** \internal
+    * \returns the Dim x Dim linear part if the transformation is affine,
+    *          and the HDim x Dim part for projective transformations.
+    */
+  inline const Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,Dim> linearExt() const
+  { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,Dim>(0,0); }
+
+  /** \internal
+    * \returns the translation part if the transformation is affine,
+    *          and the last column for projective transformations.
+    */
+  inline Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,1> translationExt()
+  { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,1>(0,Dim); }
+  /** \internal
+    * \returns the translation part if the transformation is affine,
+    *          and the last column for projective transformations.
+    */
+  inline const Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,1> translationExt() const
+  { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,1>(0,Dim); }
+
+
+  #ifdef EIGEN_TRANSFORM_PLUGIN
+  #include EIGEN_TRANSFORM_PLUGIN
+  #endif
+  
+protected:
+  #ifndef EIGEN_PARSED_BY_DOXYGEN
+    static EIGEN_STRONG_INLINE void check_template_params()
+    {
+      EIGEN_STATIC_ASSERT((Options & (DontAlign|RowMajor)) == Options, INVALID_MATRIX_TEMPLATE_PARAMETERS)
+    }
+  #endif
+
+};
+
+/** \ingroup Geometry_Module */
+typedef Transform<float,2,Isometry> Isometry2f;
+/** \ingroup Geometry_Module */
+typedef Transform<float,3,Isometry> Isometry3f;
+/** \ingroup Geometry_Module */
+typedef Transform<double,2,Isometry> Isometry2d;
+/** \ingroup Geometry_Module */
+typedef Transform<double,3,Isometry> Isometry3d;
+
+/** \ingroup Geometry_Module */
+typedef Transform<float,2,Affine> Affine2f;
+/** \ingroup Geometry_Module */
+typedef Transform<float,3,Affine> Affine3f;
+/** \ingroup Geometry_Module */
+typedef Transform<double,2,Affine> Affine2d;
+/** \ingroup Geometry_Module */
+typedef Transform<double,3,Affine> Affine3d;
+
+/** \ingroup Geometry_Module */
+typedef Transform<float,2,AffineCompact> AffineCompact2f;
+/** \ingroup Geometry_Module */
+typedef Transform<float,3,AffineCompact> AffineCompact3f;
+/** \ingroup Geometry_Module */
+typedef Transform<double,2,AffineCompact> AffineCompact2d;
+/** \ingroup Geometry_Module */
+typedef Transform<double,3,AffineCompact> AffineCompact3d;
+
+/** \ingroup Geometry_Module */
+typedef Transform<float,2,Projective> Projective2f;
+/** \ingroup Geometry_Module */
+typedef Transform<float,3,Projective> Projective3f;
+/** \ingroup Geometry_Module */
+typedef Transform<double,2,Projective> Projective2d;
+/** \ingroup Geometry_Module */
+typedef Transform<double,3,Projective> Projective3d;
+
+/**************************
+*** Optional QT support ***
+**************************/
+
+#ifdef EIGEN_QT_SUPPORT
+/** Initializes \c *this from a QMatrix assuming the dimension is 2.
+  *
+  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+  */
+template<typename Scalar, int Dim, int Mode,int Options>
+Transform<Scalar,Dim,Mode,Options>::Transform(const QMatrix& other)
+{
+  check_template_params();
+  *this = other;
+}
+
+/** Set \c *this from a QMatrix assuming the dimension is 2.
+  *
+  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+  */
+template<typename Scalar, int Dim, int Mode,int Options>
+Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const QMatrix& other)
+{
+  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  m_matrix << other.m11(), other.m21(), other.dx(),
+              other.m12(), other.m22(), other.dy(),
+              0, 0, 1;
+  return *this;
+}
+
+/** \returns a QMatrix from \c *this assuming the dimension is 2.
+  *
+  * \warning this conversion might loss data if \c *this is not affine
+  *
+  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+QMatrix Transform<Scalar,Dim,Mode,Options>::toQMatrix(void) const
+{
+  check_template_params();
+  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  return QMatrix(m_matrix.coeff(0,0), m_matrix.coeff(1,0),
+                 m_matrix.coeff(0,1), m_matrix.coeff(1,1),
+                 m_matrix.coeff(0,2), m_matrix.coeff(1,2));
+}
+
+/** Initializes \c *this from a QTransform assuming the dimension is 2.
+  *
+  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+  */
+template<typename Scalar, int Dim, int Mode,int Options>
+Transform<Scalar,Dim,Mode,Options>::Transform(const QTransform& other)
+{
+  check_template_params();
+  *this = other;
+}
+
+/** Set \c *this from a QTransform assuming the dimension is 2.
+  *
+  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const QTransform& other)
+{
+  check_template_params();
+  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  if (Mode == int(AffineCompact))
+    m_matrix << other.m11(), other.m21(), other.dx(),
+                other.m12(), other.m22(), other.dy();
+  else
+    m_matrix << other.m11(), other.m21(), other.dx(),
+                other.m12(), other.m22(), other.dy(),
+                other.m13(), other.m23(), other.m33();
+  return *this;
+}
+
+/** \returns a QTransform from \c *this assuming the dimension is 2.
+  *
+  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+QTransform Transform<Scalar,Dim,Mode,Options>::toQTransform(void) const
+{
+  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  if (Mode == int(AffineCompact))
+    return QTransform(m_matrix.coeff(0,0), m_matrix.coeff(1,0),
+                      m_matrix.coeff(0,1), m_matrix.coeff(1,1),
+                      m_matrix.coeff(0,2), m_matrix.coeff(1,2));
+  else
+    return QTransform(m_matrix.coeff(0,0), m_matrix.coeff(1,0), m_matrix.coeff(2,0),
+                      m_matrix.coeff(0,1), m_matrix.coeff(1,1), m_matrix.coeff(2,1),
+                      m_matrix.coeff(0,2), m_matrix.coeff(1,2), m_matrix.coeff(2,2));
+}
+#endif
+
+/*********************
+*** Procedural API ***
+*********************/
+
+/** Applies on the right the non uniform scale transformation represented
+  * by the vector \a other to \c *this and returns a reference to \c *this.
+  * \sa prescale()
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+template<typename OtherDerived>
+Transform<Scalar,Dim,Mode,Options>&
+Transform<Scalar,Dim,Mode,Options>::scale(const MatrixBase<OtherDerived> &other)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
+  EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
+  linearExt().noalias() = (linearExt() * other.asDiagonal());
+  return *this;
+}
+
+/** Applies on the right a uniform scale of a factor \a c to \c *this
+  * and returns a reference to \c *this.
+  * \sa prescale(Scalar)
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::scale(const Scalar& s)
+{
+  EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
+  linearExt() *= s;
+  return *this;
+}
+
+/** Applies on the left the non uniform scale transformation represented
+  * by the vector \a other to \c *this and returns a reference to \c *this.
+  * \sa scale()
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+template<typename OtherDerived>
+Transform<Scalar,Dim,Mode,Options>&
+Transform<Scalar,Dim,Mode,Options>::prescale(const MatrixBase<OtherDerived> &other)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
+  EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
+  m_matrix.template block<Dim,HDim>(0,0).noalias() = (other.asDiagonal() * m_matrix.template block<Dim,HDim>(0,0));
+  return *this;
+}
+
+/** Applies on the left a uniform scale of a factor \a c to \c *this
+  * and returns a reference to \c *this.
+  * \sa scale(Scalar)
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::prescale(const Scalar& s)
+{
+  EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
+  m_matrix.template topRows<Dim>() *= s;
+  return *this;
+}
+
+/** Applies on the right the translation matrix represented by the vector \a other
+  * to \c *this and returns a reference to \c *this.
+  * \sa pretranslate()
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+template<typename OtherDerived>
+Transform<Scalar,Dim,Mode,Options>&
+Transform<Scalar,Dim,Mode,Options>::translate(const MatrixBase<OtherDerived> &other)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
+  translationExt() += linearExt() * other;
+  return *this;
+}
+
+/** Applies on the left the translation matrix represented by the vector \a other
+  * to \c *this and returns a reference to \c *this.
+  * \sa translate()
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+template<typename OtherDerived>
+Transform<Scalar,Dim,Mode,Options>&
+Transform<Scalar,Dim,Mode,Options>::pretranslate(const MatrixBase<OtherDerived> &other)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
+  if(int(Mode)==int(Projective))
+    affine() += other * m_matrix.row(Dim);
+  else
+    translation() += other;
+  return *this;
+}
+
+/** Applies on the right the rotation represented by the rotation \a rotation
+  * to \c *this and returns a reference to \c *this.
+  *
+  * The template parameter \a RotationType is the type of the rotation which
+  * must be known by internal::toRotationMatrix<>.
+  *
+  * Natively supported types includes:
+  *   - any scalar (2D),
+  *   - a Dim x Dim matrix expression,
+  *   - a Quaternion (3D),
+  *   - a AngleAxis (3D)
+  *
+  * This mechanism is easily extendable to support user types such as Euler angles,
+  * or a pair of Quaternion for 4D rotations.
+  *
+  * \sa rotate(Scalar), class Quaternion, class AngleAxis, prerotate(RotationType)
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+template<typename RotationType>
+Transform<Scalar,Dim,Mode,Options>&
+Transform<Scalar,Dim,Mode,Options>::rotate(const RotationType& rotation)
+{
+  linearExt() *= internal::toRotationMatrix<Scalar,Dim>(rotation);
+  return *this;
+}
+
+/** Applies on the left the rotation represented by the rotation \a rotation
+  * to \c *this and returns a reference to \c *this.
+  *
+  * See rotate() for further details.
+  *
+  * \sa rotate()
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+template<typename RotationType>
+Transform<Scalar,Dim,Mode,Options>&
+Transform<Scalar,Dim,Mode,Options>::prerotate(const RotationType& rotation)
+{
+  m_matrix.template block<Dim,HDim>(0,0) = internal::toRotationMatrix<Scalar,Dim>(rotation)
+                                         * m_matrix.template block<Dim,HDim>(0,0);
+  return *this;
+}
+
+/** Applies on the right the shear transformation represented
+  * by the vector \a other to \c *this and returns a reference to \c *this.
+  * \warning 2D only.
+  * \sa preshear()
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+Transform<Scalar,Dim,Mode,Options>&
+Transform<Scalar,Dim,Mode,Options>::shear(const Scalar& sx, const Scalar& sy)
+{
+  EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
+  VectorType tmp = linear().col(0)*sy + linear().col(1);
+  linear() << linear().col(0) + linear().col(1)*sx, tmp;
+  return *this;
+}
+
+/** Applies on the left the shear transformation represented
+  * by the vector \a other to \c *this and returns a reference to \c *this.
+  * \warning 2D only.
+  * \sa shear()
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+Transform<Scalar,Dim,Mode,Options>&
+Transform<Scalar,Dim,Mode,Options>::preshear(const Scalar& sx, const Scalar& sy)
+{
+  EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
+  m_matrix.template block<Dim,HDim>(0,0) = LinearMatrixType(1, sx, sy, 1) * m_matrix.template block<Dim,HDim>(0,0);
+  return *this;
+}
+
+/******************************************************
+*** Scaling, Translation and Rotation compatibility ***
+******************************************************/
+
+template<typename Scalar, int Dim, int Mode, int Options>
+inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const TranslationType& t)
+{
+  linear().setIdentity();
+  translation() = t.vector();
+  makeAffine();
+  return *this;
+}
+
+template<typename Scalar, int Dim, int Mode, int Options>
+inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::operator*(const TranslationType& t) const
+{
+  Transform res = *this;
+  res.translate(t.vector());
+  return res;
+}
+
+template<typename Scalar, int Dim, int Mode, int Options>
+inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const UniformScaling<Scalar>& s)
+{
+  m_matrix.setZero();
+  linear().diagonal().fill(s.factor());
+  makeAffine();
+  return *this;
+}
+
+template<typename Scalar, int Dim, int Mode, int Options>
+template<typename Derived>
+inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const RotationBase<Derived,Dim>& r)
+{
+  linear() = internal::toRotationMatrix<Scalar,Dim>(r);
+  translation().setZero();
+  makeAffine();
+  return *this;
+}
+
+template<typename Scalar, int Dim, int Mode, int Options>
+template<typename Derived>
+inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::operator*(const RotationBase<Derived,Dim>& r) const
+{
+  Transform res = *this;
+  res.rotate(r.derived());
+  return res;
+}
+
+/************************
+*** Special functions ***
+************************/
+
+/** \returns the rotation part of the transformation
+  *
+  *
+  * \svd_module
+  *
+  * \sa computeRotationScaling(), computeScalingRotation(), class SVD
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+const typename Transform<Scalar,Dim,Mode,Options>::LinearMatrixType
+Transform<Scalar,Dim,Mode,Options>::rotation() const
+{
+  LinearMatrixType result;
+  computeRotationScaling(&result, (LinearMatrixType*)0);
+  return result;
+}
+
+
+/** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
+  * not necessarily positive.
+  *
+  * If either pointer is zero, the corresponding computation is skipped.
+  *
+  *
+  *
+  * \svd_module
+  *
+  * \sa computeScalingRotation(), rotation(), class SVD
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+template<typename RotationMatrixType, typename ScalingMatrixType>
+void Transform<Scalar,Dim,Mode,Options>::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const
+{
+  JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU | ComputeFullV);
+
+  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
+  VectorType sv(svd.singularValues());
+  sv.coeffRef(0) *= x;
+  if(scaling) scaling->lazyAssign(svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint());
+  if(rotation)
+  {
+    LinearMatrixType m(svd.matrixU());
+    m.col(0) /= x;
+    rotation->lazyAssign(m * svd.matrixV().adjoint());
+  }
+}
+
+/** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
+  * not necessarily positive.
+  *
+  * If either pointer is zero, the corresponding computation is skipped.
+  *
+  *
+  *
+  * \svd_module
+  *
+  * \sa computeRotationScaling(), rotation(), class SVD
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+template<typename ScalingMatrixType, typename RotationMatrixType>
+void Transform<Scalar,Dim,Mode,Options>::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const
+{
+  JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU | ComputeFullV);
+
+  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
+  VectorType sv(svd.singularValues());
+  sv.coeffRef(0) *= x;
+  if(scaling) scaling->lazyAssign(svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint());
+  if(rotation)
+  {
+    LinearMatrixType m(svd.matrixU());
+    m.col(0) /= x;
+    rotation->lazyAssign(m * svd.matrixV().adjoint());
+  }
+}
+
+/** Convenient method to set \c *this from a position, orientation and scale
+  * of a 3D object.
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
+Transform<Scalar,Dim,Mode,Options>&
+Transform<Scalar,Dim,Mode,Options>::fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
+  const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale)
+{
+  linear() = internal::toRotationMatrix<Scalar,Dim>(orientation);
+  linear() *= scale.asDiagonal();
+  translation() = position;
+  makeAffine();
+  return *this;
+}
+
+namespace internal {
+
+template<int Mode>
+struct transform_make_affine
+{
+  template<typename MatrixType>
+  static void run(MatrixType &mat)
+  {
+    static const int Dim = MatrixType::ColsAtCompileTime-1;
+    mat.template block<1,Dim>(Dim,0).setZero();
+    mat.coeffRef(Dim,Dim) = typename MatrixType::Scalar(1);
+  }
+};
+
+template<>
+struct transform_make_affine<AffineCompact>
+{
+  template<typename MatrixType> static void run(MatrixType &) { }
+};
+    
+// selector needed to avoid taking the inverse of a 3x4 matrix
+template<typename TransformType, int Mode=TransformType::Mode>
+struct projective_transform_inverse
+{
+  static inline void run(const TransformType&, TransformType&)
+  {}
+};
+
+template<typename TransformType>
+struct projective_transform_inverse<TransformType, Projective>
+{
+  static inline void run(const TransformType& m, TransformType& res)
+  {
+    res.matrix() = m.matrix().inverse();
+  }
+};
+
+} // end namespace internal
+
+
+/**
+  *
+  * \returns the inverse transformation according to some given knowledge
+  * on \c *this.
+  *
+  * \param hint allows to optimize the inversion process when the transformation
+  * is known to be not a general transformation (optional). The possible values are:
+  *  - #Projective if the transformation is not necessarily affine, i.e., if the
+  *    last row is not guaranteed to be [0 ... 0 1]
+  *  - #Affine if the last row can be assumed to be [0 ... 0 1]
+  *  - #Isometry if the transformation is only a concatenations of translations
+  *    and rotations.
+  *  The default is the template class parameter \c Mode.
+  *
+  * \warning unless \a traits is always set to NoShear or NoScaling, this function
+  * requires the generic inverse method of MatrixBase defined in the LU module. If
+  * you forget to include this module, then you will get hard to debug linking errors.
+  *
+  * \sa MatrixBase::inverse()
+  */
+template<typename Scalar, int Dim, int Mode, int Options>
+Transform<Scalar,Dim,Mode,Options>
+Transform<Scalar,Dim,Mode,Options>::inverse(TransformTraits hint) const
+{
+  Transform res;
+  if (hint == Projective)
+  {
+    internal::projective_transform_inverse<Transform>::run(*this, res);
+  }
+  else
+  {
+    if (hint == Isometry)
+    {
+      res.matrix().template topLeftCorner<Dim,Dim>() = linear().transpose();
+    }
+    else if(hint&Affine)
+    {
+      res.matrix().template topLeftCorner<Dim,Dim>() = linear().inverse();
+    }
+    else
+    {
+      eigen_assert(false && "Invalid transform traits in Transform::Inverse");
+    }
+    // translation and remaining parts
+    res.matrix().template topRightCorner<Dim,1>()
+      = - res.matrix().template topLeftCorner<Dim,Dim>() * translation();
+    res.makeAffine(); // we do need this, because in the beginning res is uninitialized
+  }
+  return res;
+}
+
+namespace internal {
+
+/*****************************************************
+*** Specializations of take affine part            ***
+*****************************************************/
+
+template<typename TransformType> struct transform_take_affine_part {
+  typedef typename TransformType::MatrixType MatrixType;
+  typedef typename TransformType::AffinePart AffinePart;
+  typedef typename TransformType::ConstAffinePart ConstAffinePart;
+  static inline AffinePart run(MatrixType& m)
+  { return m.template block<TransformType::Dim,TransformType::HDim>(0,0); }
+  static inline ConstAffinePart run(const MatrixType& m)
+  { return m.template block<TransformType::Dim,TransformType::HDim>(0,0); }
+};
+
+template<typename Scalar, int Dim, int Options>
+struct transform_take_affine_part<Transform<Scalar,Dim,AffineCompact, Options> > {
+  typedef typename Transform<Scalar,Dim,AffineCompact,Options>::MatrixType MatrixType;
+  static inline MatrixType& run(MatrixType& m) { return m; }
+  static inline const MatrixType& run(const MatrixType& m) { return m; }
+};
+
+/*****************************************************
+*** Specializations of construct from matrix       ***
+*****************************************************/
+
+template<typename Other, int Mode, int Options, int Dim, int HDim>
+struct transform_construct_from_matrix<Other, Mode,Options,Dim,HDim, Dim,Dim>
+{
+  static inline void run(Transform<typename Other::Scalar,Dim,Mode,Options> *transform, const Other& other)
+  {
+    transform->linear() = other;
+    transform->translation().setZero();
+    transform->makeAffine();
+  }
+};
+
+template<typename Other, int Mode, int Options, int Dim, int HDim>
+struct transform_construct_from_matrix<Other, Mode,Options,Dim,HDim, Dim,HDim>
+{
+  static inline void run(Transform<typename Other::Scalar,Dim,Mode,Options> *transform, const Other& other)
+  {
+    transform->affine() = other;
+    transform->makeAffine();
+  }
+};
+
+template<typename Other, int Mode, int Options, int Dim, int HDim>
+struct transform_construct_from_matrix<Other, Mode,Options,Dim,HDim, HDim,HDim>
+{
+  static inline void run(Transform<typename Other::Scalar,Dim,Mode,Options> *transform, const Other& other)
+  { transform->matrix() = other; }
+};
+
+template<typename Other, int Options, int Dim, int HDim>
+struct transform_construct_from_matrix<Other, AffineCompact,Options,Dim,HDim, HDim,HDim>
+{
+  static inline void run(Transform<typename Other::Scalar,Dim,AffineCompact,Options> *transform, const Other& other)
+  { transform->matrix() = other.template block<Dim,HDim>(0,0); }
+};
+
+/**********************************************************
+***   Specializations of operator* with rhs EigenBase   ***
+**********************************************************/
+
+template<int LhsMode,int RhsMode>
+struct transform_product_result
+{
+  enum 
+  { 
+    Mode =
+      (LhsMode == (int)Projective    || RhsMode == (int)Projective    ) ? Projective :
+      (LhsMode == (int)Affine        || RhsMode == (int)Affine        ) ? Affine :
+      (LhsMode == (int)AffineCompact || RhsMode == (int)AffineCompact ) ? AffineCompact :
+      (LhsMode == (int)Isometry      || RhsMode == (int)Isometry      ) ? Isometry : Projective
+  };
+};
+
+template< typename TransformType, typename MatrixType >
+struct transform_right_product_impl< TransformType, MatrixType, 0 >
+{
+  typedef typename MatrixType::PlainObject ResultType;
+
+  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  {
+    return T.matrix() * other;
+  }
+};
+
+template< typename TransformType, typename MatrixType >
+struct transform_right_product_impl< TransformType, MatrixType, 1 >
+{
+  enum { 
+    Dim = TransformType::Dim, 
+    HDim = TransformType::HDim,
+    OtherRows = MatrixType::RowsAtCompileTime,
+    OtherCols = MatrixType::ColsAtCompileTime
+  };
+
+  typedef typename MatrixType::PlainObject ResultType;
+
+  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  {
+    EIGEN_STATIC_ASSERT(OtherRows==HDim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
+
+    typedef Block<ResultType, Dim, OtherCols, int(MatrixType::RowsAtCompileTime)==Dim> TopLeftLhs;
+
+    ResultType res(other.rows(),other.cols());
+    TopLeftLhs(res, 0, 0, Dim, other.cols()).noalias() = T.affine() * other;
+    res.row(OtherRows-1) = other.row(OtherRows-1);
+    
+    return res;
+  }
+};
+
+template< typename TransformType, typename MatrixType >
+struct transform_right_product_impl< TransformType, MatrixType, 2 >
+{
+  enum { 
+    Dim = TransformType::Dim, 
+    HDim = TransformType::HDim,
+    OtherRows = MatrixType::RowsAtCompileTime,
+    OtherCols = MatrixType::ColsAtCompileTime
+  };
+
+  typedef typename MatrixType::PlainObject ResultType;
+
+  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  {
+    EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
+
+    typedef Block<ResultType, Dim, OtherCols, true> TopLeftLhs;
+    ResultType res(Replicate<typename TransformType::ConstTranslationPart, 1, OtherCols>(T.translation(),1,other.cols()));
+    TopLeftLhs(res, 0, 0, Dim, other.cols()).noalias() += T.linear() * other;
+
+    return res;
+  }
+};
+
+/**********************************************************
+***   Specializations of operator* with lhs EigenBase   ***
+**********************************************************/
+
+// generic HDim x HDim matrix * T => Projective
+template<typename Other,int Mode, int Options, int Dim, int HDim>
+struct transform_left_product_impl<Other,Mode,Options,Dim,HDim, HDim,HDim>
+{
+  typedef Transform<typename Other::Scalar,Dim,Mode,Options> TransformType;
+  typedef typename TransformType::MatrixType MatrixType;
+  typedef Transform<typename Other::Scalar,Dim,Projective,Options> ResultType;
+  static ResultType run(const Other& other,const TransformType& tr)
+  { return ResultType(other * tr.matrix()); }
+};
+
+// generic HDim x HDim matrix * AffineCompact => Projective
+template<typename Other, int Options, int Dim, int HDim>
+struct transform_left_product_impl<Other,AffineCompact,Options,Dim,HDim, HDim,HDim>
+{
+  typedef Transform<typename Other::Scalar,Dim,AffineCompact,Options> TransformType;
+  typedef typename TransformType::MatrixType MatrixType;
+  typedef Transform<typename Other::Scalar,Dim,Projective,Options> ResultType;
+  static ResultType run(const Other& other,const TransformType& tr)
+  {
+    ResultType res;
+    res.matrix().noalias() = other.template block<HDim,Dim>(0,0) * tr.matrix();
+    res.matrix().col(Dim) += other.col(Dim);
+    return res;
+  }
+};
+
+// affine matrix * T
+template<typename Other,int Mode, int Options, int Dim, int HDim>
+struct transform_left_product_impl<Other,Mode,Options,Dim,HDim, Dim,HDim>
+{
+  typedef Transform<typename Other::Scalar,Dim,Mode,Options> TransformType;
+  typedef typename TransformType::MatrixType MatrixType;
+  typedef TransformType ResultType;
+  static ResultType run(const Other& other,const TransformType& tr)
+  {
+    ResultType res;
+    res.affine().noalias() = other * tr.matrix();
+    res.matrix().row(Dim) = tr.matrix().row(Dim);
+    return res;
+  }
+};
+
+// affine matrix * AffineCompact
+template<typename Other, int Options, int Dim, int HDim>
+struct transform_left_product_impl<Other,AffineCompact,Options,Dim,HDim, Dim,HDim>
+{
+  typedef Transform<typename Other::Scalar,Dim,AffineCompact,Options> TransformType;
+  typedef typename TransformType::MatrixType MatrixType;
+  typedef TransformType ResultType;
+  static ResultType run(const Other& other,const TransformType& tr)
+  {
+    ResultType res;
+    res.matrix().noalias() = other.template block<Dim,Dim>(0,0) * tr.matrix();
+    res.translation() += other.col(Dim);
+    return res;
+  }
+};
+
+// linear matrix * T
+template<typename Other,int Mode, int Options, int Dim, int HDim>
+struct transform_left_product_impl<Other,Mode,Options,Dim,HDim, Dim,Dim>
+{
+  typedef Transform<typename Other::Scalar,Dim,Mode,Options> TransformType;
+  typedef typename TransformType::MatrixType MatrixType;
+  typedef TransformType ResultType;
+  static ResultType run(const Other& other, const TransformType& tr)
+  {
+    TransformType res;
+    if(Mode!=int(AffineCompact))
+      res.matrix().row(Dim) = tr.matrix().row(Dim);
+    res.matrix().template topRows<Dim>().noalias()
+      = other * tr.matrix().template topRows<Dim>();
+    return res;
+  }
+};
+
+/**********************************************************
+*** Specializations of operator* with another Transform ***
+**********************************************************/
+
+template<typename Scalar, int Dim, int LhsMode, int LhsOptions, int RhsMode, int RhsOptions>
+struct transform_transform_product_impl<Transform<Scalar,Dim,LhsMode,LhsOptions>,Transform<Scalar,Dim,RhsMode,RhsOptions>,false >
+{
+  enum { ResultMode = transform_product_result<LhsMode,RhsMode>::Mode };
+  typedef Transform<Scalar,Dim,LhsMode,LhsOptions> Lhs;
+  typedef Transform<Scalar,Dim,RhsMode,RhsOptions> Rhs;
+  typedef Transform<Scalar,Dim,ResultMode,LhsOptions> ResultType;
+  static ResultType run(const Lhs& lhs, const Rhs& rhs)
+  {
+    ResultType res;
+    res.linear() = lhs.linear() * rhs.linear();
+    res.translation() = lhs.linear() * rhs.translation() + lhs.translation();
+    res.makeAffine();
+    return res;
+  }
+};
+
+template<typename Scalar, int Dim, int LhsMode, int LhsOptions, int RhsMode, int RhsOptions>
+struct transform_transform_product_impl<Transform<Scalar,Dim,LhsMode,LhsOptions>,Transform<Scalar,Dim,RhsMode,RhsOptions>,true >
+{
+  typedef Transform<Scalar,Dim,LhsMode,LhsOptions> Lhs;
+  typedef Transform<Scalar,Dim,RhsMode,RhsOptions> Rhs;
+  typedef Transform<Scalar,Dim,Projective> ResultType;
+  static ResultType run(const Lhs& lhs, const Rhs& rhs)
+  {
+    return ResultType( lhs.matrix() * rhs.matrix() );
+  }
+};
+
+template<typename Scalar, int Dim, int LhsOptions, int RhsOptions>
+struct transform_transform_product_impl<Transform<Scalar,Dim,AffineCompact,LhsOptions>,Transform<Scalar,Dim,Projective,RhsOptions>,true >
+{
+  typedef Transform<Scalar,Dim,AffineCompact,LhsOptions> Lhs;
+  typedef Transform<Scalar,Dim,Projective,RhsOptions> Rhs;
+  typedef Transform<Scalar,Dim,Projective> ResultType;
+  static ResultType run(const Lhs& lhs, const Rhs& rhs)
+  {
+    ResultType res;
+    res.matrix().template topRows<Dim>() = lhs.matrix() * rhs.matrix();
+    res.matrix().row(Dim) = rhs.matrix().row(Dim);
+    return res;
+  }
+};
+
+template<typename Scalar, int Dim, int LhsOptions, int RhsOptions>
+struct transform_transform_product_impl<Transform<Scalar,Dim,Projective,LhsOptions>,Transform<Scalar,Dim,AffineCompact,RhsOptions>,true >
+{
+  typedef Transform<Scalar,Dim,Projective,LhsOptions> Lhs;
+  typedef Transform<Scalar,Dim,AffineCompact,RhsOptions> Rhs;
+  typedef Transform<Scalar,Dim,Projective> ResultType;
+  static ResultType run(const Lhs& lhs, const Rhs& rhs)
+  {
+    ResultType res(lhs.matrix().template leftCols<Dim>() * rhs.matrix());
+    res.matrix().col(Dim) += lhs.matrix().col(Dim);
+    return res;
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRANSFORM_H
diff --git a/include/eigen3/Eigen/src/Geometry/Translation.h b/include/eigen3/Eigen/src/Geometry/Translation.h
new file mode 100644
index 0000000..7fda179
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/Translation.h
@@ -0,0 +1,206 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRANSLATION_H
+#define EIGEN_TRANSLATION_H
+
+namespace Eigen { 
+
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \class Translation
+  *
+  * \brief Represents a translation transformation
+  *
+  * \param _Scalar the scalar type, i.e., the type of the coefficients.
+  * \param _Dim the  dimension of the space, can be a compile time value or Dynamic
+  *
+  * \note This class is not aimed to be used to store a translation transformation,
+  * but rather to make easier the constructions and updates of Transform objects.
+  *
+  * \sa class Scaling, class Transform
+  */
+template<typename _Scalar, int _Dim>
+class Translation
+{
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim)
+  /** dimension of the space */
+  enum { Dim = _Dim };
+  /** the scalar type of the coefficients */
+  typedef _Scalar Scalar;
+  /** corresponding vector type */
+  typedef Matrix<Scalar,Dim,1> VectorType;
+  /** corresponding linear transformation matrix type */
+  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
+  /** corresponding affine transformation type */
+  typedef Transform<Scalar,Dim,Affine> AffineTransformType;
+  /** corresponding isometric transformation type */
+  typedef Transform<Scalar,Dim,Isometry> IsometryTransformType;
+
+protected:
+
+  VectorType m_coeffs;
+
+public:
+
+  /** Default constructor without initialization. */
+  Translation() {}
+  /**  */
+  inline Translation(const Scalar& sx, const Scalar& sy)
+  {
+    eigen_assert(Dim==2);
+    m_coeffs.x() = sx;
+    m_coeffs.y() = sy;
+  }
+  /**  */
+  inline Translation(const Scalar& sx, const Scalar& sy, const Scalar& sz)
+  {
+    eigen_assert(Dim==3);
+    m_coeffs.x() = sx;
+    m_coeffs.y() = sy;
+    m_coeffs.z() = sz;
+  }
+  /** Constructs and initialize the translation transformation from a vector of translation coefficients */
+  explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
+
+  /** \brief Retruns the x-translation by value. **/
+  inline Scalar x() const { return m_coeffs.x(); }
+  /** \brief Retruns the y-translation by value. **/
+  inline Scalar y() const { return m_coeffs.y(); }
+  /** \brief Retruns the z-translation by value. **/
+  inline Scalar z() const { return m_coeffs.z(); }
+
+  /** \brief Retruns the x-translation as a reference. **/
+  inline Scalar& x() { return m_coeffs.x(); }
+  /** \brief Retruns the y-translation as a reference. **/
+  inline Scalar& y() { return m_coeffs.y(); }
+  /** \brief Retruns the z-translation as a reference. **/
+  inline Scalar& z() { return m_coeffs.z(); }
+
+  const VectorType& vector() const { return m_coeffs; }
+  VectorType& vector() { return m_coeffs; }
+
+  const VectorType& translation() const { return m_coeffs; }
+  VectorType& translation() { return m_coeffs; }
+
+  /** Concatenates two translation */
+  inline Translation operator* (const Translation& other) const
+  { return Translation(m_coeffs + other.m_coeffs); }
+
+  /** Concatenates a translation and a uniform scaling */
+  inline AffineTransformType operator* (const UniformScaling<Scalar>& other) const;
+
+  /** Concatenates a translation and a linear transformation */
+  template<typename OtherDerived>
+  inline AffineTransformType operator* (const EigenBase<OtherDerived>& linear) const;
+
+  /** Concatenates a translation and a rotation */
+  template<typename Derived>
+  inline IsometryTransformType operator*(const RotationBase<Derived,Dim>& r) const
+  { return *this * IsometryTransformType(r); }
+
+  /** \returns the concatenation of a linear transformation \a l with the translation \a t */
+  // its a nightmare to define a templated friend function outside its declaration
+  template<typename OtherDerived> friend
+  inline AffineTransformType operator*(const EigenBase<OtherDerived>& linear, const Translation& t)
+  {
+    AffineTransformType res;
+    res.matrix().setZero();
+    res.linear() = linear.derived();
+    res.translation() = linear.derived() * t.m_coeffs;
+    res.matrix().row(Dim).setZero();
+    res(Dim,Dim) = Scalar(1);
+    return res;
+  }
+
+  /** Concatenates a translation and a transformation */
+  template<int Mode, int Options>
+  inline Transform<Scalar,Dim,Mode> operator* (const Transform<Scalar,Dim,Mode,Options>& t) const
+  {
+    Transform<Scalar,Dim,Mode> res = t;
+    res.pretranslate(m_coeffs);
+    return res;
+  }
+
+  /** Applies translation to vector */
+  inline VectorType operator* (const VectorType& other) const
+  { return m_coeffs + other; }
+
+  /** \returns the inverse translation (opposite) */
+  Translation inverse() const { return Translation(-m_coeffs); }
+
+  Translation& operator=(const Translation& other)
+  {
+    m_coeffs = other.m_coeffs;
+    return *this;
+  }
+
+  static const Translation Identity() { return Translation(VectorType::Zero()); }
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type cast() const
+  { return typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type(*this); }
+
+  /** Copy constructor with scalar type conversion */
+  template<typename OtherScalarType>
+  inline explicit Translation(const Translation<OtherScalarType,Dim>& other)
+  { m_coeffs = other.vector().template cast<Scalar>(); }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const Translation& other, typename NumTraits<Scalar>::Real prec = NumTraits<Scalar>::dummy_precision()) const
+  { return m_coeffs.isApprox(other.m_coeffs, prec); }
+
+};
+
+/** \addtogroup Geometry_Module */
+//@{
+typedef Translation<float, 2> Translation2f;
+typedef Translation<double,2> Translation2d;
+typedef Translation<float, 3> Translation3f;
+typedef Translation<double,3> Translation3d;
+//@}
+
+template<typename Scalar, int Dim>
+inline typename Translation<Scalar,Dim>::AffineTransformType
+Translation<Scalar,Dim>::operator* (const UniformScaling<Scalar>& other) const
+{
+  AffineTransformType res;
+  res.matrix().setZero();
+  res.linear().diagonal().fill(other.factor());
+  res.translation() = m_coeffs;
+  res(Dim,Dim) = Scalar(1);
+  return res;
+}
+
+template<typename Scalar, int Dim>
+template<typename OtherDerived>
+inline typename Translation<Scalar,Dim>::AffineTransformType
+Translation<Scalar,Dim>::operator* (const EigenBase<OtherDerived>& linear) const
+{
+  AffineTransformType res;
+  res.matrix().setZero();
+  res.linear() = linear.derived();
+  res.translation() = m_coeffs;
+  res.matrix().row(Dim).setZero();
+  res(Dim,Dim) = Scalar(1);
+  return res;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRANSLATION_H
diff --git a/include/eigen3/Eigen/src/Geometry/Umeyama.h b/include/eigen3/Eigen/src/Geometry/Umeyama.h
new file mode 100644
index 0000000..5e20662
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/Umeyama.h
@@ -0,0 +1,177 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Hauke Heibel <hauke.heibel at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_UMEYAMA_H
+#define EIGEN_UMEYAMA_H
+
+// This file requires the user to include 
+// * Eigen/Core
+// * Eigen/LU 
+// * Eigen/SVD
+// * Eigen/Array
+
+namespace Eigen { 
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+// These helpers are required since it allows to use mixed types as parameters
+// for the Umeyama. The problem with mixed parameters is that the return type
+// cannot trivially be deduced when float and double types are mixed.
+namespace internal {
+
+// Compile time return type deduction for different MatrixBase types.
+// Different means here different alignment and parameters but the same underlying
+// real scalar type.
+template<typename MatrixType, typename OtherMatrixType>
+struct umeyama_transform_matrix_type
+{
+  enum {
+    MinRowsAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(MatrixType::RowsAtCompileTime, OtherMatrixType::RowsAtCompileTime),
+
+    // When possible we want to choose some small fixed size value since the result
+    // is likely to fit on the stack. So here, EIGEN_SIZE_MIN_PREFER_DYNAMIC is not what we want.
+    HomogeneousDimension = int(MinRowsAtCompileTime) == Dynamic ? Dynamic : int(MinRowsAtCompileTime)+1
+  };
+
+  typedef Matrix<typename traits<MatrixType>::Scalar,
+    HomogeneousDimension,
+    HomogeneousDimension,
+    AutoAlign | (traits<MatrixType>::Flags & RowMajorBit ? RowMajor : ColMajor),
+    HomogeneousDimension,
+    HomogeneousDimension
+  > type;
+};
+
+}
+
+#endif
+
+/**
+* \geometry_module \ingroup Geometry_Module
+*
+* \brief Returns the transformation between two point sets.
+*
+* The algorithm is based on:
+* "Least-squares estimation of transformation parameters between two point patterns",
+* Shinji Umeyama, PAMI 1991, DOI: 10.1109/34.88573
+*
+* It estimates parameters \f$ c, \mathbf{R}, \f$ and \f$ \mathbf{t} \f$ such that
+* \f{align*}
+*   \frac{1}{n} \sum_{i=1}^n \vert\vert y_i - (c\mathbf{R}x_i + \mathbf{t}) \vert\vert_2^2
+* \f}
+* is minimized.
+*
+* The algorithm is based on the analysis of the covariance matrix
+* \f$ \Sigma_{\mathbf{x}\mathbf{y}} \in \mathbb{R}^{d \times d} \f$
+* of the input point sets \f$ \mathbf{x} \f$ and \f$ \mathbf{y} \f$ where 
+* \f$d\f$ is corresponding to the dimension (which is typically small).
+* The analysis is involving the SVD having a complexity of \f$O(d^3)\f$
+* though the actual computational effort lies in the covariance
+* matrix computation which has an asymptotic lower bound of \f$O(dm)\f$ when 
+* the input point sets have dimension \f$d \times m\f$.
+*
+* Currently the method is working only for floating point matrices.
+*
+* \todo Should the return type of umeyama() become a Transform?
+*
+* \param src Source points \f$ \mathbf{x} = \left( x_1, \hdots, x_n \right) \f$.
+* \param dst Destination points \f$ \mathbf{y} = \left( y_1, \hdots, y_n \right) \f$.
+* \param with_scaling Sets \f$ c=1 \f$ when <code>false</code> is passed.
+* \return The homogeneous transformation 
+* \f{align*}
+*   T = \begin{bmatrix} c\mathbf{R} & \mathbf{t} \\ \mathbf{0} & 1 \end{bmatrix}
+* \f}
+* minimizing the resudiual above. This transformation is always returned as an 
+* Eigen::Matrix.
+*/
+template <typename Derived, typename OtherDerived>
+typename internal::umeyama_transform_matrix_type<Derived, OtherDerived>::type
+umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, bool with_scaling = true)
+{
+  typedef typename internal::umeyama_transform_matrix_type<Derived, OtherDerived>::type TransformationMatrixType;
+  typedef typename internal::traits<TransformationMatrixType>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef typename Derived::Index Index;
+
+  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL)
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename internal::traits<OtherDerived>::Scalar>::value),
+    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+  enum { Dimension = EIGEN_SIZE_MIN_PREFER_DYNAMIC(Derived::RowsAtCompileTime, OtherDerived::RowsAtCompileTime) };
+
+  typedef Matrix<Scalar, Dimension, 1> VectorType;
+  typedef Matrix<Scalar, Dimension, Dimension> MatrixType;
+  typedef typename internal::plain_matrix_type_row_major<Derived>::type RowMajorMatrixType;
+
+  const Index m = src.rows(); // dimension
+  const Index n = src.cols(); // number of measurements
+
+  // required for demeaning ...
+  const RealScalar one_over_n = RealScalar(1) / static_cast<RealScalar>(n);
+
+  // computation of mean
+  const VectorType src_mean = src.rowwise().sum() * one_over_n;
+  const VectorType dst_mean = dst.rowwise().sum() * one_over_n;
+
+  // demeaning of src and dst points
+  const RowMajorMatrixType src_demean = src.colwise() - src_mean;
+  const RowMajorMatrixType dst_demean = dst.colwise() - dst_mean;
+
+  // Eq. (36)-(37)
+  const Scalar src_var = src_demean.rowwise().squaredNorm().sum() * one_over_n;
+
+  // Eq. (38)
+  const MatrixType sigma = one_over_n * dst_demean * src_demean.transpose();
+
+  JacobiSVD<MatrixType> svd(sigma, ComputeFullU | ComputeFullV);
+
+  // Initialize the resulting transformation with an identity matrix...
+  TransformationMatrixType Rt = TransformationMatrixType::Identity(m+1,m+1);
+
+  // Eq. (39)
+  VectorType S = VectorType::Ones(m);
+  if (sigma.determinant()<Scalar(0)) S(m-1) = Scalar(-1);
+
+  // Eq. (40) and (43)
+  const VectorType& d = svd.singularValues();
+  Index rank = 0; for (Index i=0; i<m; ++i) if (!internal::isMuchSmallerThan(d.coeff(i),d.coeff(0))) ++rank;
+  if (rank == m-1) {
+    if ( svd.matrixU().determinant() * svd.matrixV().determinant() > Scalar(0) ) {
+      Rt.block(0,0,m,m).noalias() = svd.matrixU()*svd.matrixV().transpose();
+    } else {
+      const Scalar s = S(m-1); S(m-1) = Scalar(-1);
+      Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
+      S(m-1) = s;
+    }
+  } else {
+    Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
+  }
+
+  if (with_scaling)
+  {
+    // Eq. (42)
+    const Scalar c = Scalar(1)/src_var * svd.singularValues().dot(S);
+
+    // Eq. (41)
+    Rt.col(m).head(m) = dst_mean;
+    Rt.col(m).head(m).noalias() -= c*Rt.topLeftCorner(m,m)*src_mean;
+    Rt.block(0,0,m,m) *= c;
+  }
+  else
+  {
+    Rt.col(m).head(m) = dst_mean;
+    Rt.col(m).head(m).noalias() -= Rt.topLeftCorner(m,m)*src_mean;
+  }
+
+  return Rt;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_UMEYAMA_H
diff --git a/include/eigen3/Eigen/src/Geometry/arch/Geometry_SSE.h b/include/eigen3/Eigen/src/Geometry/arch/Geometry_SSE.h
new file mode 100644
index 0000000..3d8284f
--- /dev/null
+++ b/include/eigen3/Eigen/src/Geometry/arch/Geometry_SSE.h
@@ -0,0 +1,115 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Rohit Garg <rpg.314 at gmail.com>
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GEOMETRY_SSE_H
+#define EIGEN_GEOMETRY_SSE_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<class Derived, class OtherDerived>
+struct quat_product<Architecture::SSE, Derived, OtherDerived, float, Aligned>
+{
+  static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
+  {
+    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0,0,0,0x80000000));
+    Quaternion<float> res;
+    __m128 a = _a.coeffs().template packet<Aligned>(0);
+    __m128 b = _b.coeffs().template packet<Aligned>(0);
+    __m128 flip1 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),
+                                         vec4f_swizzle1(b,2,0,1,2)),mask);
+    __m128 flip2 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),
+                                         vec4f_swizzle1(b,0,1,2,1)),mask);
+    pstore(&res.x(),
+              _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)),
+                                    _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0),
+                                               vec4f_swizzle1(b,1,2,0,0))),
+                         _mm_add_ps(flip1,flip2)));
+    return res;
+  }
+};
+
+template<typename VectorLhs,typename VectorRhs>
+struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
+{
+  static inline typename plain_matrix_type<VectorLhs>::type
+  run(const VectorLhs& lhs, const VectorRhs& rhs)
+  {
+    __m128 a = lhs.template packet<VectorLhs::Flags&AlignedBit ? Aligned : Unaligned>(0);
+    __m128 b = rhs.template packet<VectorRhs::Flags&AlignedBit ? Aligned : Unaligned>(0);
+    __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3));
+    __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3));
+    typename plain_matrix_type<VectorLhs>::type res;
+    pstore(&res.x(),_mm_sub_ps(mul1,mul2));
+    return res;
+  }
+};
+
+
+
+
+template<class Derived, class OtherDerived>
+struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Aligned>
+{
+  static inline Quaternion<double> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
+  {
+  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
+
+  Quaternion<double> res;
+
+  const double* a = _a.coeffs().data();
+  Packet2d b_xy = _b.coeffs().template packet<Aligned>(0);
+  Packet2d b_zw = _b.coeffs().template packet<Aligned>(2);
+  Packet2d a_xx = pset1<Packet2d>(a[0]);
+  Packet2d a_yy = pset1<Packet2d>(a[1]);
+  Packet2d a_zz = pset1<Packet2d>(a[2]);
+  Packet2d a_ww = pset1<Packet2d>(a[3]);
+
+  // two temporaries:
+  Packet2d t1, t2;
+
+  /*
+   * t1 = ww*xy + yy*zw
+   * t2 = zz*xy - xx*zw
+   * res.xy = t1 +/- swap(t2)
+   */
+  t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw));
+  t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw));
+#ifdef EIGEN_VECTORIZE_SSE3
+  EIGEN_UNUSED_VARIABLE(mask)
+  pstore(&res.x(), _mm_addsub_pd(t1, preverse(t2)));
+#else
+  pstore(&res.x(), padd(t1, pxor(mask,preverse(t2))));
+#endif
+  
+  /*
+   * t1 = ww*zw - yy*xy
+   * t2 = zz*zw + xx*xy
+   * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2)
+   */
+  t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy));
+  t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy));
+#ifdef EIGEN_VECTORIZE_SSE3
+  EIGEN_UNUSED_VARIABLE(mask)
+  pstore(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2)));
+#else
+  pstore(&res.z(), psub(t1, pxor(mask,preverse(t2))));
+#endif
+
+  return res;
+}
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GEOMETRY_SSE_H
diff --git a/include/eigen3/Eigen/src/Householder/BlockHouseholder.h b/include/eigen3/Eigen/src/Householder/BlockHouseholder.h
new file mode 100644
index 0000000..60dbea5
--- /dev/null
+++ b/include/eigen3/Eigen/src/Householder/BlockHouseholder.h
@@ -0,0 +1,68 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Vincent Lejeune
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BLOCK_HOUSEHOLDER_H
+#define EIGEN_BLOCK_HOUSEHOLDER_H
+
+// This file contains some helper function to deal with block householder reflectors
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal */
+template<typename TriangularFactorType,typename VectorsType,typename CoeffsType>
+void make_block_householder_triangular_factor(TriangularFactorType& triFactor, const VectorsType& vectors, const CoeffsType& hCoeffs)
+{
+  typedef typename TriangularFactorType::Index Index;
+  typedef typename VectorsType::Scalar Scalar;
+  const Index nbVecs = vectors.cols();
+  eigen_assert(triFactor.rows() == nbVecs && triFactor.cols() == nbVecs && vectors.rows()>=nbVecs);
+
+  for(Index i = 0; i < nbVecs; i++)
+  {
+    Index rs = vectors.rows() - i;
+    Scalar Vii = vectors(i,i);
+    vectors.const_cast_derived().coeffRef(i,i) = Scalar(1);
+    triFactor.col(i).head(i).noalias() = -hCoeffs(i) * vectors.block(i, 0, rs, i).adjoint()
+                                       * vectors.col(i).tail(rs);
+    vectors.const_cast_derived().coeffRef(i, i) = Vii;
+    // FIXME add .noalias() once the triangular product can work inplace
+    triFactor.col(i).head(i) = triFactor.block(0,0,i,i).template triangularView<Upper>()
+                             * triFactor.col(i).head(i);
+    triFactor(i,i) = hCoeffs(i);
+  }
+}
+
+/** \internal */
+template<typename MatrixType,typename VectorsType,typename CoeffsType>
+void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vectors, const CoeffsType& hCoeffs)
+{
+  typedef typename MatrixType::Index Index;
+  enum { TFactorSize = MatrixType::ColsAtCompileTime };
+  Index nbVecs = vectors.cols();
+  Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize, ColMajor> T(nbVecs,nbVecs);
+  make_block_householder_triangular_factor(T, vectors, hCoeffs);
+
+  const TriangularView<const VectorsType, UnitLower>& V(vectors);
+
+  // A -= V T V^* A
+  Matrix<typename MatrixType::Scalar,VectorsType::ColsAtCompileTime,MatrixType::ColsAtCompileTime,0,
+         VectorsType::MaxColsAtCompileTime,MatrixType::MaxColsAtCompileTime> tmp = V.adjoint() * mat;
+  // FIXME add .noalias() once the triangular product can work inplace
+  tmp = T.template triangularView<Upper>().adjoint() * tmp;
+  mat.noalias() -= V * tmp;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BLOCK_HOUSEHOLDER_H
diff --git a/include/eigen3/Eigen/src/Householder/Householder.h b/include/eigen3/Eigen/src/Householder/Householder.h
new file mode 100644
index 0000000..32112af
--- /dev/null
+++ b/include/eigen3/Eigen/src/Householder/Householder.h
@@ -0,0 +1,171 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_HOUSEHOLDER_H
+#define EIGEN_HOUSEHOLDER_H
+
+namespace Eigen { 
+
+namespace internal {
+template<int n> struct decrement_size
+{
+  enum {
+    ret = n==Dynamic ? n : n-1
+  };
+};
+}
+
+/** Computes the elementary reflector H such that:
+  * \f$ H *this = [ beta 0 ... 0]^T \f$
+  * where the transformation H is:
+  * \f$ H = I - tau v v^*\f$
+  * and the vector v is:
+  * \f$ v^T = [1 essential^T] \f$
+  *
+  * The essential part of the vector \c v is stored in *this.
+  * 
+  * On output:
+  * \param tau the scaling factor of the Householder transformation
+  * \param beta the result of H * \c *this
+  *
+  * \sa MatrixBase::makeHouseholder(), MatrixBase::applyHouseholderOnTheLeft(),
+  *     MatrixBase::applyHouseholderOnTheRight()
+  */
+template<typename Derived>
+void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta)
+{
+  VectorBlock<Derived, internal::decrement_size<Base::SizeAtCompileTime>::ret> essentialPart(derived(), 1, size()-1);
+  makeHouseholder(essentialPart, tau, beta);
+}
+
+/** Computes the elementary reflector H such that:
+  * \f$ H *this = [ beta 0 ... 0]^T \f$
+  * where the transformation H is:
+  * \f$ H = I - tau v v^*\f$
+  * and the vector v is:
+  * \f$ v^T = [1 essential^T] \f$
+  *
+  * On output:
+  * \param essential the essential part of the vector \c v
+  * \param tau the scaling factor of the Householder transformation
+  * \param beta the result of H * \c *this
+  *
+  * \sa MatrixBase::makeHouseholderInPlace(), MatrixBase::applyHouseholderOnTheLeft(),
+  *     MatrixBase::applyHouseholderOnTheRight()
+  */
+template<typename Derived>
+template<typename EssentialPart>
+void MatrixBase<Derived>::makeHouseholder(
+  EssentialPart& essential,
+  Scalar& tau,
+  RealScalar& beta) const
+{
+  using std::sqrt;
+  using numext::conj;
+  
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(EssentialPart)
+  VectorBlock<const Derived, EssentialPart::SizeAtCompileTime> tail(derived(), 1, size()-1);
+  
+  RealScalar tailSqNorm = size()==1 ? RealScalar(0) : tail.squaredNorm();
+  Scalar c0 = coeff(0);
+
+  if(tailSqNorm == RealScalar(0) && numext::imag(c0)==RealScalar(0))
+  {
+    tau = RealScalar(0);
+    beta = numext::real(c0);
+    essential.setZero();
+  }
+  else
+  {
+    beta = sqrt(numext::abs2(c0) + tailSqNorm);
+    if (numext::real(c0)>=RealScalar(0))
+      beta = -beta;
+    essential = tail / (c0 - beta);
+    tau = conj((beta - c0) / beta);
+  }
+}
+
+/** Apply the elementary reflector H given by
+  * \f$ H = I - tau v v^*\f$
+  * with
+  * \f$ v^T = [1 essential^T] \f$
+  * from the left to a vector or matrix.
+  *
+  * On input:
+  * \param essential the essential part of the vector \c v
+  * \param tau the scaling factor of the Householder transformation
+  * \param workspace a pointer to working space with at least
+  *                  this->cols() * essential.size() entries
+  *
+  * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), 
+  *     MatrixBase::applyHouseholderOnTheRight()
+  */
+template<typename Derived>
+template<typename EssentialPart>
+void MatrixBase<Derived>::applyHouseholderOnTheLeft(
+  const EssentialPart& essential,
+  const Scalar& tau,
+  Scalar* workspace)
+{
+  if(rows() == 1)
+  {
+    *this *= Scalar(1)-tau;
+  }
+  else
+  {
+    Map<typename internal::plain_row_type<PlainObject>::type> tmp(workspace,cols());
+    Block<Derived, EssentialPart::SizeAtCompileTime, Derived::ColsAtCompileTime> bottom(derived(), 1, 0, rows()-1, cols());
+    tmp.noalias() = essential.adjoint() * bottom;
+    tmp += this->row(0);
+    this->row(0) -= tau * tmp;
+    bottom.noalias() -= tau * essential * tmp;
+  }
+}
+
+/** Apply the elementary reflector H given by
+  * \f$ H = I - tau v v^*\f$
+  * with
+  * \f$ v^T = [1 essential^T] \f$
+  * from the right to a vector or matrix.
+  *
+  * On input:
+  * \param essential the essential part of the vector \c v
+  * \param tau the scaling factor of the Householder transformation
+  * \param workspace a pointer to working space with at least
+  *                  this->cols() * essential.size() entries
+  *
+  * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), 
+  *     MatrixBase::applyHouseholderOnTheLeft()
+  */
+template<typename Derived>
+template<typename EssentialPart>
+void MatrixBase<Derived>::applyHouseholderOnTheRight(
+  const EssentialPart& essential,
+  const Scalar& tau,
+  Scalar* workspace)
+{
+  if(cols() == 1)
+  {
+    *this *= Scalar(1)-tau;
+  }
+  else
+  {
+    Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
+    Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
+    tmp.noalias() = right * essential.conjugate();
+    tmp += this->col(0);
+    this->col(0) -= tau * tmp;
+    right.noalias() -= tau * tmp * essential.transpose();
+  }
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_HOUSEHOLDER_H
diff --git a/include/eigen3/Eigen/src/Householder/HouseholderSequence.h b/include/eigen3/Eigen/src/Householder/HouseholderSequence.h
new file mode 100644
index 0000000..d800ca1
--- /dev/null
+++ b/include/eigen3/Eigen/src/Householder/HouseholderSequence.h
@@ -0,0 +1,441 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_HOUSEHOLDER_SEQUENCE_H
+#define EIGEN_HOUSEHOLDER_SEQUENCE_H
+
+namespace Eigen { 
+
+/** \ingroup Householder_Module
+  * \householder_module
+  * \class HouseholderSequence
+  * \brief Sequence of Householder reflections acting on subspaces with decreasing size
+  * \tparam VectorsType type of matrix containing the Householder vectors
+  * \tparam CoeffsType  type of vector containing the Householder coefficients
+  * \tparam Side        either OnTheLeft (the default) or OnTheRight
+  *
+  * This class represents a product sequence of Householder reflections where the first Householder reflection
+  * acts on the whole space, the second Householder reflection leaves the one-dimensional subspace spanned by
+  * the first unit vector invariant, the third Householder reflection leaves the two-dimensional subspace
+  * spanned by the first two unit vectors invariant, and so on up to the last reflection which leaves all but
+  * one dimensions invariant and acts only on the last dimension. Such sequences of Householder reflections
+  * are used in several algorithms to zero out certain parts of a matrix. Indeed, the methods
+  * HessenbergDecomposition::matrixQ(), Tridiagonalization::matrixQ(), HouseholderQR::householderQ(),
+  * and ColPivHouseholderQR::householderQ() all return a %HouseholderSequence.
+  *
+  * More precisely, the class %HouseholderSequence represents an \f$ n \times n \f$ matrix \f$ H \f$ of the
+  * form \f$ H = \prod_{i=0}^{n-1} H_i \f$ where the i-th Householder reflection is \f$ H_i = I - h_i v_i
+  * v_i^* \f$. The i-th Householder coefficient \f$ h_i \f$ is a scalar and the i-th Householder vector \f$
+  * v_i \f$ is a vector of the form
+  * \f[ 
+  * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. 
+  * \f]
+  * The last \f$ n-i \f$ entries of \f$ v_i \f$ are called the essential part of the Householder vector.
+  *
+  * Typical usages are listed below, where H is a HouseholderSequence:
+  * \code
+  * A.applyOnTheRight(H);             // A = A * H
+  * A.applyOnTheLeft(H);              // A = H * A
+  * A.applyOnTheRight(H.adjoint());   // A = A * H^*
+  * A.applyOnTheLeft(H.adjoint());    // A = H^* * A
+  * MatrixXd Q = H;                   // conversion to a dense matrix
+  * \endcode
+  * In addition to the adjoint, you can also apply the inverse (=adjoint), the transpose, and the conjugate operators.
+  *
+  * See the documentation for HouseholderSequence(const VectorsType&, const CoeffsType&) for an example.
+  *
+  * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
+  */
+
+namespace internal {
+
+template<typename VectorsType, typename CoeffsType, int Side>
+struct traits<HouseholderSequence<VectorsType,CoeffsType,Side> >
+{
+  typedef typename VectorsType::Scalar Scalar;
+  typedef typename VectorsType::Index Index;
+  typedef typename VectorsType::StorageKind StorageKind;
+  enum {
+    RowsAtCompileTime = Side==OnTheLeft ? traits<VectorsType>::RowsAtCompileTime
+                                        : traits<VectorsType>::ColsAtCompileTime,
+    ColsAtCompileTime = RowsAtCompileTime,
+    MaxRowsAtCompileTime = Side==OnTheLeft ? traits<VectorsType>::MaxRowsAtCompileTime
+                                           : traits<VectorsType>::MaxColsAtCompileTime,
+    MaxColsAtCompileTime = MaxRowsAtCompileTime,
+    Flags = 0
+  };
+};
+
+template<typename VectorsType, typename CoeffsType, int Side>
+struct hseq_side_dependent_impl
+{
+  typedef Block<const VectorsType, Dynamic, 1> EssentialVectorType;
+  typedef HouseholderSequence<VectorsType, CoeffsType, OnTheLeft> HouseholderSequenceType;
+  typedef typename VectorsType::Index Index;
+  static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
+  {
+    Index start = k+1+h.m_shift;
+    return Block<const VectorsType,Dynamic,1>(h.m_vectors, start, k, h.rows()-start, 1);
+  }
+};
+
+template<typename VectorsType, typename CoeffsType>
+struct hseq_side_dependent_impl<VectorsType, CoeffsType, OnTheRight>
+{
+  typedef Transpose<Block<const VectorsType, 1, Dynamic> > EssentialVectorType;
+  typedef HouseholderSequence<VectorsType, CoeffsType, OnTheRight> HouseholderSequenceType;
+  typedef typename VectorsType::Index Index;
+  static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
+  {
+    Index start = k+1+h.m_shift;
+    return Block<const VectorsType,1,Dynamic>(h.m_vectors, k, start, 1, h.rows()-start).transpose();
+  }
+};
+
+template<typename OtherScalarType, typename MatrixType> struct matrix_type_times_scalar_type
+{
+  typedef typename scalar_product_traits<OtherScalarType, typename MatrixType::Scalar>::ReturnType
+    ResultScalar;
+  typedef Matrix<ResultScalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime,
+                 0, MatrixType::MaxRowsAtCompileTime, MatrixType::MaxColsAtCompileTime> Type;
+};
+
+} // end namespace internal
+
+template<typename VectorsType, typename CoeffsType, int Side> class HouseholderSequence
+  : public EigenBase<HouseholderSequence<VectorsType,CoeffsType,Side> >
+{
+    typedef typename internal::hseq_side_dependent_impl<VectorsType,CoeffsType,Side>::EssentialVectorType EssentialVectorType;
+  
+  public:
+    enum {
+      RowsAtCompileTime = internal::traits<HouseholderSequence>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<HouseholderSequence>::ColsAtCompileTime,
+      MaxRowsAtCompileTime = internal::traits<HouseholderSequence>::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = internal::traits<HouseholderSequence>::MaxColsAtCompileTime
+    };
+    typedef typename internal::traits<HouseholderSequence>::Scalar Scalar;
+    typedef typename VectorsType::Index Index;
+
+    typedef HouseholderSequence<
+      typename internal::conditional<NumTraits<Scalar>::IsComplex,
+        typename internal::remove_all<typename VectorsType::ConjugateReturnType>::type,
+        VectorsType>::type,
+      typename internal::conditional<NumTraits<Scalar>::IsComplex,
+        typename internal::remove_all<typename CoeffsType::ConjugateReturnType>::type,
+        CoeffsType>::type,
+      Side
+    > ConjugateReturnType;
+
+    /** \brief Constructor.
+      * \param[in]  v      %Matrix containing the essential parts of the Householder vectors
+      * \param[in]  h      Vector containing the Householder coefficients
+      *
+      * Constructs the Householder sequence with coefficients given by \p h and vectors given by \p v. The
+      * i-th Householder coefficient \f$ h_i \f$ is given by \p h(i) and the essential part of the i-th
+      * Householder vector \f$ v_i \f$ is given by \p v(k,i) with \p k > \p i (the subdiagonal part of the
+      * i-th column). If \p v has fewer columns than rows, then the Householder sequence contains as many
+      * Householder reflections as there are columns.
+      *
+      * \note The %HouseholderSequence object stores \p v and \p h by reference.
+      *
+      * Example: \include HouseholderSequence_HouseholderSequence.cpp
+      * Output: \verbinclude HouseholderSequence_HouseholderSequence.out
+      *
+      * \sa setLength(), setShift()
+      */
+    HouseholderSequence(const VectorsType& v, const CoeffsType& h)
+      : m_vectors(v), m_coeffs(h), m_trans(false), m_length(v.diagonalSize()),
+        m_shift(0)
+    {
+    }
+
+    /** \brief Copy constructor. */
+    HouseholderSequence(const HouseholderSequence& other)
+      : m_vectors(other.m_vectors),
+        m_coeffs(other.m_coeffs),
+        m_trans(other.m_trans),
+        m_length(other.m_length),
+        m_shift(other.m_shift)
+    {
+    }
+
+    /** \brief Number of rows of transformation viewed as a matrix.
+      * \returns Number of rows 
+      * \details This equals the dimension of the space that the transformation acts on.
+      */
+    Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); }
+
+    /** \brief Number of columns of transformation viewed as a matrix.
+      * \returns Number of columns
+      * \details This equals the dimension of the space that the transformation acts on.
+      */
+    Index cols() const { return rows(); }
+
+    /** \brief Essential part of a Householder vector.
+      * \param[in]  k  Index of Householder reflection
+      * \returns    Vector containing non-trivial entries of k-th Householder vector
+      *
+      * This function returns the essential part of the Householder vector \f$ v_i \f$. This is a vector of
+      * length \f$ n-i \f$ containing the last \f$ n-i \f$ entries of the vector
+      * \f[ 
+      * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. 
+      * \f]
+      * The index \f$ i \f$ equals \p k + shift(), corresponding to the k-th column of the matrix \p v
+      * passed to the constructor.
+      *
+      * \sa setShift(), shift()
+      */
+    const EssentialVectorType essentialVector(Index k) const
+    {
+      eigen_assert(k >= 0 && k < m_length);
+      return internal::hseq_side_dependent_impl<VectorsType,CoeffsType,Side>::essentialVector(*this, k);
+    }
+
+    /** \brief %Transpose of the Householder sequence. */
+    HouseholderSequence transpose() const
+    {
+      return HouseholderSequence(*this).setTrans(!m_trans);
+    }
+
+    /** \brief Complex conjugate of the Householder sequence. */
+    ConjugateReturnType conjugate() const
+    {
+      return ConjugateReturnType(m_vectors.conjugate(), m_coeffs.conjugate())
+             .setTrans(m_trans)
+             .setLength(m_length)
+             .setShift(m_shift);
+    }
+
+    /** \brief Adjoint (conjugate transpose) of the Householder sequence. */
+    ConjugateReturnType adjoint() const
+    {
+      return conjugate().setTrans(!m_trans);
+    }
+
+    /** \brief Inverse of the Householder sequence (equals the adjoint). */
+    ConjugateReturnType inverse() const { return adjoint(); }
+
+    /** \internal */
+    template<typename DestType> inline void evalTo(DestType& dst) const
+    {
+      Matrix<Scalar, DestType::RowsAtCompileTime, 1,
+             AutoAlign|ColMajor, DestType::MaxRowsAtCompileTime, 1> workspace(rows());
+      evalTo(dst, workspace);
+    }
+
+    /** \internal */
+    template<typename Dest, typename Workspace>
+    void evalTo(Dest& dst, Workspace& workspace) const
+    {
+      workspace.resize(rows());
+      Index vecs = m_length;
+      if(    internal::is_same<typename internal::remove_all<VectorsType>::type,Dest>::value
+          && internal::extract_data(dst) == internal::extract_data(m_vectors))
+      {
+        // in-place
+        dst.diagonal().setOnes();
+        dst.template triangularView<StrictlyUpper>().setZero();
+        for(Index k = vecs-1; k >= 0; --k)
+        {
+          Index cornerSize = rows() - k - m_shift;
+          if(m_trans)
+            dst.bottomRightCorner(cornerSize, cornerSize)
+               .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data());
+          else
+            dst.bottomRightCorner(cornerSize, cornerSize)
+               .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data());
+
+          // clear the off diagonal vector
+          dst.col(k).tail(rows()-k-1).setZero();
+        }
+        // clear the remaining columns if needed
+        for(Index k = 0; k<cols()-vecs ; ++k)
+          dst.col(k).tail(rows()-k-1).setZero();
+      }
+      else
+      {
+        dst.setIdentity(rows(), rows());
+        for(Index k = vecs-1; k >= 0; --k)
+        {
+          Index cornerSize = rows() - k - m_shift;
+          if(m_trans)
+            dst.bottomRightCorner(cornerSize, cornerSize)
+               .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0));
+          else
+            dst.bottomRightCorner(cornerSize, cornerSize)
+               .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0));
+        }
+      }
+    }
+
+    /** \internal */
+    template<typename Dest> inline void applyThisOnTheRight(Dest& dst) const
+    {
+      Matrix<Scalar,1,Dest::RowsAtCompileTime,RowMajor,1,Dest::MaxRowsAtCompileTime> workspace(dst.rows());
+      applyThisOnTheRight(dst, workspace);
+    }
+
+    /** \internal */
+    template<typename Dest, typename Workspace>
+    inline void applyThisOnTheRight(Dest& dst, Workspace& workspace) const
+    {
+      workspace.resize(dst.rows());
+      for(Index k = 0; k < m_length; ++k)
+      {
+        Index actual_k = m_trans ? m_length-k-1 : k;
+        dst.rightCols(rows()-m_shift-actual_k)
+           .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
+      }
+    }
+
+    /** \internal */
+    template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
+    {
+      Matrix<Scalar,1,Dest::ColsAtCompileTime,RowMajor,1,Dest::MaxColsAtCompileTime> workspace(dst.cols());
+      applyThisOnTheLeft(dst, workspace);
+    }
+
+    /** \internal */
+    template<typename Dest, typename Workspace>
+    inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace) const
+    {
+      workspace.resize(dst.cols());
+      for(Index k = 0; k < m_length; ++k)
+      {
+        Index actual_k = m_trans ? k : m_length-k-1;
+        dst.bottomRows(rows()-m_shift-actual_k)
+           .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
+      }
+    }
+
+    /** \brief Computes the product of a Householder sequence with a matrix.
+      * \param[in]  other  %Matrix being multiplied.
+      * \returns    Expression object representing the product.
+      *
+      * This function computes \f$ HM \f$ where \f$ H \f$ is the Householder sequence represented by \p *this
+      * and \f$ M \f$ is the matrix \p other.
+      */
+    template<typename OtherDerived>
+    typename internal::matrix_type_times_scalar_type<Scalar, OtherDerived>::Type operator*(const MatrixBase<OtherDerived>& other) const
+    {
+      typename internal::matrix_type_times_scalar_type<Scalar, OtherDerived>::Type
+        res(other.template cast<typename internal::matrix_type_times_scalar_type<Scalar,OtherDerived>::ResultScalar>());
+      applyThisOnTheLeft(res);
+      return res;
+    }
+
+    template<typename _VectorsType, typename _CoeffsType, int _Side> friend struct internal::hseq_side_dependent_impl;
+
+    /** \brief Sets the length of the Householder sequence.
+      * \param [in]  length  New value for the length.
+      *
+      * By default, the length \f$ n \f$ of the Householder sequence \f$ H = H_0 H_1 \ldots H_{n-1} \f$ is set
+      * to the number of columns of the matrix \p v passed to the constructor, or the number of rows if that
+      * is smaller. After this function is called, the length equals \p length.
+      *
+      * \sa length()
+      */
+    HouseholderSequence& setLength(Index length)
+    {
+      m_length = length;
+      return *this;
+    }
+
+    /** \brief Sets the shift of the Householder sequence.
+      * \param [in]  shift  New value for the shift.
+      *
+      * By default, a %HouseholderSequence object represents \f$ H = H_0 H_1 \ldots H_{n-1} \f$ and the i-th
+      * column of the matrix \p v passed to the constructor corresponds to the i-th Householder
+      * reflection. After this function is called, the object represents \f$ H = H_{\mathrm{shift}}
+      * H_{\mathrm{shift}+1} \ldots H_{n-1} \f$ and the i-th column of \p v corresponds to the (shift+i)-th
+      * Householder reflection.
+      *
+      * \sa shift()
+      */
+    HouseholderSequence& setShift(Index shift)
+    {
+      m_shift = shift;
+      return *this;
+    }
+
+    Index length() const { return m_length; }  /**< \brief Returns the length of the Householder sequence. */
+    Index shift() const { return m_shift; }    /**< \brief Returns the shift of the Householder sequence. */
+
+    /* Necessary for .adjoint() and .conjugate() */
+    template <typename VectorsType2, typename CoeffsType2, int Side2> friend class HouseholderSequence;
+
+  protected:
+
+    /** \brief Sets the transpose flag.
+      * \param [in]  trans  New value of the transpose flag.
+      *
+      * By default, the transpose flag is not set. If the transpose flag is set, then this object represents 
+      * \f$ H^T = H_{n-1}^T \ldots H_1^T H_0^T \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$.
+      *
+      * \sa trans()
+      */
+    HouseholderSequence& setTrans(bool trans)
+    {
+      m_trans = trans;
+      return *this;
+    }
+
+    bool trans() const { return m_trans; }     /**< \brief Returns the transpose flag. */
+
+    typename VectorsType::Nested m_vectors;
+    typename CoeffsType::Nested m_coeffs;
+    bool m_trans;
+    Index m_length;
+    Index m_shift;
+};
+
+/** \brief Computes the product of a matrix with a Householder sequence.
+  * \param[in]  other  %Matrix being multiplied.
+  * \param[in]  h      %HouseholderSequence being multiplied.
+  * \returns    Expression object representing the product.
+  *
+  * This function computes \f$ MH \f$ where \f$ M \f$ is the matrix \p other and \f$ H \f$ is the
+  * Householder sequence represented by \p h.
+  */
+template<typename OtherDerived, typename VectorsType, typename CoeffsType, int Side>
+typename internal::matrix_type_times_scalar_type<typename VectorsType::Scalar,OtherDerived>::Type operator*(const MatrixBase<OtherDerived>& other, const HouseholderSequence<VectorsType,CoeffsType,Side>& h)
+{
+  typename internal::matrix_type_times_scalar_type<typename VectorsType::Scalar,OtherDerived>::Type
+    res(other.template cast<typename internal::matrix_type_times_scalar_type<typename VectorsType::Scalar,OtherDerived>::ResultScalar>());
+  h.applyThisOnTheRight(res);
+  return res;
+}
+
+/** \ingroup Householder_Module \householder_module
+  * \brief Convenience function for constructing a Householder sequence. 
+  * \returns A HouseholderSequence constructed from the specified arguments.
+  */
+template<typename VectorsType, typename CoeffsType>
+HouseholderSequence<VectorsType,CoeffsType> householderSequence(const VectorsType& v, const CoeffsType& h)
+{
+  return HouseholderSequence<VectorsType,CoeffsType,OnTheLeft>(v, h);
+}
+
+/** \ingroup Householder_Module \householder_module
+  * \brief Convenience function for constructing a Householder sequence. 
+  * \returns A HouseholderSequence constructed from the specified arguments.
+  * \details This function differs from householderSequence() in that the template argument \p OnTheSide of
+  * the constructed HouseholderSequence is set to OnTheRight, instead of the default OnTheLeft.
+  */
+template<typename VectorsType, typename CoeffsType>
+HouseholderSequence<VectorsType,CoeffsType,OnTheRight> rightHouseholderSequence(const VectorsType& v, const CoeffsType& h)
+{
+  return HouseholderSequence<VectorsType,CoeffsType,OnTheRight>(v, h);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_HOUSEHOLDER_SEQUENCE_H
diff --git a/include/eigen3/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/include/eigen3/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
new file mode 100644
index 0000000..73ca9bf
--- /dev/null
+++ b/include/eigen3/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
@@ -0,0 +1,149 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BASIC_PRECONDITIONERS_H
+#define EIGEN_BASIC_PRECONDITIONERS_H
+
+namespace Eigen { 
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief A preconditioner based on the digonal entries
+  *
+  * This class allows to approximately solve for A.x = b problems assuming A is a diagonal matrix.
+  * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for:
+  * \code
+  * A.diagonal().asDiagonal() . x = b
+  * \endcode
+  *
+  * \tparam _Scalar the type of the scalar.
+  *
+  * This preconditioner is suitable for both selfadjoint and general problems.
+  * The diagonal entries are pre-inverted and stored into a dense vector.
+  *
+  * \note A variant that has yet to be implemented would attempt to preserve the norm of each column.
+  *
+  */
+template <typename _Scalar>
+class DiagonalPreconditioner
+{
+    typedef _Scalar Scalar;
+    typedef Matrix<Scalar,Dynamic,1> Vector;
+    typedef typename Vector::Index Index;
+
+  public:
+    // this typedef is only to export the scalar type and compile-time dimensions to solve_retval
+    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+
+    DiagonalPreconditioner() : m_isInitialized(false) {}
+
+    template<typename MatType>
+    DiagonalPreconditioner(const MatType& mat) : m_invdiag(mat.cols())
+    {
+      compute(mat);
+    }
+
+    Index rows() const { return m_invdiag.size(); }
+    Index cols() const { return m_invdiag.size(); }
+    
+    template<typename MatType>
+    DiagonalPreconditioner& analyzePattern(const MatType& )
+    {
+      return *this;
+    }
+    
+    template<typename MatType>
+    DiagonalPreconditioner& factorize(const MatType& mat)
+    {
+      m_invdiag.resize(mat.cols());
+      for(int j=0; j<mat.outerSize(); ++j)
+      {
+        typename MatType::InnerIterator it(mat,j);
+        while(it && it.index()!=j) ++it;
+        if(it && it.index()==j)
+          m_invdiag(j) = Scalar(1)/it.value();
+        else
+          m_invdiag(j) = 0;
+      }
+      m_isInitialized = true;
+      return *this;
+    }
+    
+    template<typename MatType>
+    DiagonalPreconditioner& compute(const MatType& mat)
+    {
+      return factorize(mat);
+    }
+
+    template<typename Rhs, typename Dest>
+    void _solve(const Rhs& b, Dest& x) const
+    {
+      x = m_invdiag.array() * b.array() ;
+    }
+
+    template<typename Rhs> inline const internal::solve_retval<DiagonalPreconditioner, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "DiagonalPreconditioner is not initialized.");
+      eigen_assert(m_invdiag.size()==b.rows()
+                && "DiagonalPreconditioner::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::solve_retval<DiagonalPreconditioner, Rhs>(*this, b.derived());
+    }
+
+  protected:
+    Vector m_invdiag;
+    bool m_isInitialized;
+};
+
+namespace internal {
+
+template<typename _MatrixType, typename Rhs>
+struct solve_retval<DiagonalPreconditioner<_MatrixType>, Rhs>
+  : solve_retval_base<DiagonalPreconditioner<_MatrixType>, Rhs>
+{
+  typedef DiagonalPreconditioner<_MatrixType> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+}
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief A naive preconditioner which approximates any matrix as the identity matrix
+  *
+  * \sa class DiagonalPreconditioner
+  */
+class IdentityPreconditioner
+{
+  public:
+
+    IdentityPreconditioner() {}
+
+    template<typename MatrixType>
+    IdentityPreconditioner(const MatrixType& ) {}
+    
+    template<typename MatrixType>
+    IdentityPreconditioner& analyzePattern(const MatrixType& ) { return *this; }
+    
+    template<typename MatrixType>
+    IdentityPreconditioner& factorize(const MatrixType& ) { return *this; }
+
+    template<typename MatrixType>
+    IdentityPreconditioner& compute(const MatrixType& ) { return *this; }
+    
+    template<typename Rhs>
+    inline const Rhs& solve(const Rhs& b) const { return b; }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_BASIC_PRECONDITIONERS_H
diff --git a/include/eigen3/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/include/eigen3/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
new file mode 100644
index 0000000..dd135c2
--- /dev/null
+++ b/include/eigen3/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
@@ -0,0 +1,275 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BICGSTAB_H
+#define EIGEN_BICGSTAB_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal Low-level bi conjugate gradient stabilized algorithm
+  * \param mat The matrix A
+  * \param rhs The right hand side vector b
+  * \param x On input and initial solution, on output the computed solution.
+  * \param precond A preconditioner being able to efficiently solve for an
+  *                approximation of Ax=b (regardless of b)
+  * \param iters On input the max number of iteration, on output the number of performed iterations.
+  * \param tol_error On input the tolerance error, on output an estimation of the relative error.
+  * \return false in the case of numerical issue, for example a break down of BiCGSTAB. 
+  */
+template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x,
+              const Preconditioner& precond, int& iters,
+              typename Dest::RealScalar& tol_error)
+{
+  using std::sqrt;
+  using std::abs;
+  typedef typename Dest::RealScalar RealScalar;
+  typedef typename Dest::Scalar Scalar;
+  typedef Matrix<Scalar,Dynamic,1> VectorType;
+  RealScalar tol = tol_error;
+  int maxIters = iters;
+
+  int n = mat.cols();
+  VectorType r  = rhs - mat * x;
+  VectorType r0 = r;
+  
+  RealScalar r0_sqnorm = r0.squaredNorm();
+  RealScalar rhs_sqnorm = rhs.squaredNorm();
+  if(rhs_sqnorm == 0)
+  {
+    x.setZero();
+    return true;
+  }
+  Scalar rho    = 1;
+  Scalar alpha  = 1;
+  Scalar w      = 1;
+  
+  VectorType v = VectorType::Zero(n), p = VectorType::Zero(n);
+  VectorType y(n),  z(n);
+  VectorType kt(n), ks(n);
+
+  VectorType s(n), t(n);
+
+  RealScalar tol2 = tol*tol;
+  RealScalar eps2 = NumTraits<Scalar>::epsilon()*NumTraits<Scalar>::epsilon();
+  int i = 0;
+  int restarts = 0;
+
+  while ( r.squaredNorm()/rhs_sqnorm > tol2 && i<maxIters )
+  {
+    Scalar rho_old = rho;
+
+    rho = r0.dot(r);
+    if (abs(rho) < eps2*r0_sqnorm)
+    {
+      // The new residual vector became too orthogonal to the arbitrarily choosen direction r0
+      // Let's restart with a new r0:
+      r0 = r;
+      rho = r0_sqnorm = r.squaredNorm();
+      if(restarts++ == 0)
+        i = 0;
+    }
+    Scalar beta = (rho/rho_old) * (alpha / w);
+    p = r + beta * (p - w * v);
+    
+    y = precond.solve(p);
+    
+    v.noalias() = mat * y;
+
+    alpha = rho / r0.dot(v);
+    s = r - alpha * v;
+
+    z = precond.solve(s);
+    t.noalias() = mat * z;
+
+    RealScalar tmp = t.squaredNorm();
+    if(tmp>RealScalar(0))
+      w = t.dot(s) / tmp;
+    else
+      w = Scalar(0);
+    x += alpha * y + w * z;
+    r = s - w * t;
+    ++i;
+  }
+  tol_error = sqrt(r.squaredNorm()/rhs_sqnorm);
+  iters = i;
+  return true; 
+}
+
+}
+
+template< typename _MatrixType,
+          typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+class BiCGSTAB;
+
+namespace internal {
+
+template< typename _MatrixType, typename _Preconditioner>
+struct traits<BiCGSTAB<_MatrixType,_Preconditioner> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Preconditioner Preconditioner;
+};
+
+}
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief A bi conjugate gradient stabilized solver for sparse square problems
+  *
+  * This class allows to solve for A.x = b sparse linear problems using a bi conjugate gradient
+  * stabilized algorithm. The vectors x and b can be either dense or sparse.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+  * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+  *
+  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+  * and NumTraits<Scalar>::epsilon() for the tolerance.
+  * 
+  * This class can be used as the direct solver classes. Here is a typical usage example:
+  * \code
+  * int n = 10000;
+  * VectorXd x(n), b(n);
+  * SparseMatrix<double> A(n,n);
+  * // fill A and b
+  * BiCGSTAB<SparseMatrix<double> > solver;
+  * solver.compute(A);
+  * x = solver.solve(b);
+  * std::cout << "#iterations:     " << solver.iterations() << std::endl;
+  * std::cout << "estimated error: " << solver.error()      << std::endl;
+  * // update b, and solve again
+  * x = solver.solve(b);
+  * \endcode
+  * 
+  * By default the iterations start with x=0 as an initial guess of the solution.
+  * One can control the start using the solveWithGuess() method. Here is a step by
+  * step execution example starting with a random guess and printing the evolution
+  * of the estimated error:
+  * * \code
+  * x = VectorXd::Random(n);
+  * solver.setMaxIterations(1);
+  * int i = 0;
+  * do {
+  *   x = solver.solveWithGuess(b,x);
+  *   std::cout << i << " : " << solver.error() << std::endl;
+  *   ++i;
+  * } while (solver.info()!=Success && i<100);
+  * \endcode
+  * Note that such a step by step excution is slightly slower.
+  * 
+  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+  */
+template< typename _MatrixType, typename _Preconditioner>
+class BiCGSTAB : public IterativeSolverBase<BiCGSTAB<_MatrixType,_Preconditioner> >
+{
+  typedef IterativeSolverBase<BiCGSTAB> Base;
+  using Base::mp_matrix;
+  using Base::m_error;
+  using Base::m_iterations;
+  using Base::m_info;
+  using Base::m_isInitialized;
+public:
+  typedef _MatrixType MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef _Preconditioner Preconditioner;
+
+public:
+
+  /** Default constructor. */
+  BiCGSTAB() : Base() {}
+
+  /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+    * 
+    * This constructor is a shortcut for the default constructor followed
+    * by a call to compute().
+    * 
+    * \warning this class stores a reference to the matrix A as well as some
+    * precomputed values that depend on it. Therefore, if \a A is changed
+    * this class becomes invalid. Call compute() to update it with the new
+    * matrix A, or modify a copy of A.
+    */
+  BiCGSTAB(const MatrixType& A) : Base(A) {}
+
+  ~BiCGSTAB() {}
+  
+  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
+    * \a x0 as an initial solution.
+    *
+    * \sa compute()
+    */
+  template<typename Rhs,typename Guess>
+  inline const internal::solve_retval_with_guess<BiCGSTAB, Rhs, Guess>
+  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
+  {
+    eigen_assert(m_isInitialized && "BiCGSTAB is not initialized.");
+    eigen_assert(Base::rows()==b.rows()
+              && "BiCGSTAB::solve(): invalid number of rows of the right hand side matrix b");
+    return internal::solve_retval_with_guess
+            <BiCGSTAB, Rhs, Guess>(*this, b.derived(), x0);
+  }
+  
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solveWithGuess(const Rhs& b, Dest& x) const
+  {    
+    bool failed = false;
+    for(int j=0; j<b.cols(); ++j)
+    {
+      m_iterations = Base::maxIterations();
+      m_error = Base::m_tolerance;
+      
+      typename Dest::ColXpr xj(x,j);
+      if(!internal::bicgstab(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error))
+        failed = true;
+    }
+    m_info = failed ? NumericalIssue
+           : m_error <= Base::m_tolerance ? Success
+           : NoConvergence;
+    m_isInitialized = true;
+  }
+
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solve(const Rhs& b, Dest& x) const
+  {
+//     x.setZero();
+  x = b;
+    _solveWithGuess(b,x);
+  }
+
+protected:
+
+};
+
+
+namespace internal {
+
+  template<typename _MatrixType, typename _Preconditioner, typename Rhs>
+struct solve_retval<BiCGSTAB<_MatrixType, _Preconditioner>, Rhs>
+  : solve_retval_base<BiCGSTAB<_MatrixType, _Preconditioner>, Rhs>
+{
+  typedef BiCGSTAB<_MatrixType, _Preconditioner> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BICGSTAB_H
diff --git a/include/eigen3/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/include/eigen3/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
new file mode 100644
index 0000000..a74a815
--- /dev/null
+++ b/include/eigen3/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
@@ -0,0 +1,265 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONJUGATE_GRADIENT_H
+#define EIGEN_CONJUGATE_GRADIENT_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal Low-level conjugate gradient algorithm
+  * \param mat The matrix A
+  * \param rhs The right hand side vector b
+  * \param x On input and initial solution, on output the computed solution.
+  * \param precond A preconditioner being able to efficiently solve for an
+  *                approximation of Ax=b (regardless of b)
+  * \param iters On input the max number of iteration, on output the number of performed iterations.
+  * \param tol_error On input the tolerance error, on output an estimation of the relative error.
+  */
+template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+EIGEN_DONT_INLINE
+void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
+                        const Preconditioner& precond, int& iters,
+                        typename Dest::RealScalar& tol_error)
+{
+  using std::sqrt;
+  using std::abs;
+  typedef typename Dest::RealScalar RealScalar;
+  typedef typename Dest::Scalar Scalar;
+  typedef Matrix<Scalar,Dynamic,1> VectorType;
+  
+  RealScalar tol = tol_error;
+  int maxIters = iters;
+  
+  int n = mat.cols();
+
+  VectorType residual = rhs - mat * x; //initial residual
+
+  RealScalar rhsNorm2 = rhs.squaredNorm();
+  if(rhsNorm2 == 0) 
+  {
+    x.setZero();
+    iters = 0;
+    tol_error = 0;
+    return;
+  }
+  RealScalar threshold = tol*tol*rhsNorm2;
+  RealScalar residualNorm2 = residual.squaredNorm();
+  if (residualNorm2 < threshold)
+  {
+    iters = 0;
+    tol_error = sqrt(residualNorm2 / rhsNorm2);
+    return;
+  }
+  
+  VectorType p(n);
+  p = precond.solve(residual);      //initial search direction
+
+  VectorType z(n), tmp(n);
+  RealScalar absNew = numext::real(residual.dot(p));  // the square of the absolute value of r scaled by invM
+  int i = 0;
+  while(i < maxIters)
+  {
+    tmp.noalias() = mat * p;              // the bottleneck of the algorithm
+
+    Scalar alpha = absNew / p.dot(tmp);   // the amount we travel on dir
+    x += alpha * p;                       // update solution
+    residual -= alpha * tmp;              // update residue
+    
+    residualNorm2 = residual.squaredNorm();
+    if(residualNorm2 < threshold)
+      break;
+    
+    z = precond.solve(residual);          // approximately solve for "A z = residual"
+
+    RealScalar absOld = absNew;
+    absNew = numext::real(residual.dot(z));     // update the absolute value of r
+    RealScalar beta = absNew / absOld;            // calculate the Gram-Schmidt value used to create the new search direction
+    p = z + beta * p;                             // update search direction
+    i++;
+  }
+  tol_error = sqrt(residualNorm2 / rhsNorm2);
+  iters = i;
+}
+
+}
+
+template< typename _MatrixType, int _UpLo=Lower,
+          typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+class ConjugateGradient;
+
+namespace internal {
+
+template< typename _MatrixType, int _UpLo, typename _Preconditioner>
+struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Preconditioner Preconditioner;
+};
+
+}
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief A conjugate gradient solver for sparse self-adjoint problems
+  *
+  * This class allows to solve for A.x = b sparse linear problems using a conjugate gradient algorithm.
+  * The sparse matrix A must be selfadjoint. The vectors x and b can be either dense or sparse.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
+  *               or Upper. Default is Lower.
+  * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+  *
+  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+  * and NumTraits<Scalar>::epsilon() for the tolerance.
+  * 
+  * This class can be used as the direct solver classes. Here is a typical usage example:
+  * \code
+  * int n = 10000;
+  * VectorXd x(n), b(n);
+  * SparseMatrix<double> A(n,n);
+  * // fill A and b
+  * ConjugateGradient<SparseMatrix<double> > cg;
+  * cg.compute(A);
+  * x = cg.solve(b);
+  * std::cout << "#iterations:     " << cg.iterations() << std::endl;
+  * std::cout << "estimated error: " << cg.error()      << std::endl;
+  * // update b, and solve again
+  * x = cg.solve(b);
+  * \endcode
+  * 
+  * By default the iterations start with x=0 as an initial guess of the solution.
+  * One can control the start using the solveWithGuess() method. Here is a step by
+  * step execution example starting with a random guess and printing the evolution
+  * of the estimated error:
+  * * \code
+  * x = VectorXd::Random(n);
+  * cg.setMaxIterations(1);
+  * int i = 0;
+  * do {
+  *   x = cg.solveWithGuess(b,x);
+  *   std::cout << i << " : " << cg.error() << std::endl;
+  *   ++i;
+  * } while (cg.info()!=Success && i<100);
+  * \endcode
+  * Note that such a step by step excution is slightly slower.
+  * 
+  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+  */
+template< typename _MatrixType, int _UpLo, typename _Preconditioner>
+class ConjugateGradient : public IterativeSolverBase<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
+{
+  typedef IterativeSolverBase<ConjugateGradient> Base;
+  using Base::mp_matrix;
+  using Base::m_error;
+  using Base::m_iterations;
+  using Base::m_info;
+  using Base::m_isInitialized;
+public:
+  typedef _MatrixType MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef _Preconditioner Preconditioner;
+
+  enum {
+    UpLo = _UpLo
+  };
+
+public:
+
+  /** Default constructor. */
+  ConjugateGradient() : Base() {}
+
+  /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+    * 
+    * This constructor is a shortcut for the default constructor followed
+    * by a call to compute().
+    * 
+    * \warning this class stores a reference to the matrix A as well as some
+    * precomputed values that depend on it. Therefore, if \a A is changed
+    * this class becomes invalid. Call compute() to update it with the new
+    * matrix A, or modify a copy of A.
+    */
+  ConjugateGradient(const MatrixType& A) : Base(A) {}
+
+  ~ConjugateGradient() {}
+  
+  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
+    * \a x0 as an initial solution.
+    *
+    * \sa compute()
+    */
+  template<typename Rhs,typename Guess>
+  inline const internal::solve_retval_with_guess<ConjugateGradient, Rhs, Guess>
+  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
+  {
+    eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
+    eigen_assert(Base::rows()==b.rows()
+              && "ConjugateGradient::solve(): invalid number of rows of the right hand side matrix b");
+    return internal::solve_retval_with_guess
+            <ConjugateGradient, Rhs, Guess>(*this, b.derived(), x0);
+  }
+
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solveWithGuess(const Rhs& b, Dest& x) const
+  {
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+
+    for(int j=0; j<b.cols(); ++j)
+    {
+      m_iterations = Base::maxIterations();
+      m_error = Base::m_tolerance;
+
+      typename Dest::ColXpr xj(x,j);
+      internal::conjugate_gradient(mp_matrix->template selfadjointView<UpLo>(), b.col(j), xj,
+                                   Base::m_preconditioner, m_iterations, m_error);
+    }
+
+    m_isInitialized = true;
+    m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
+  }
+  
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solve(const Rhs& b, Dest& x) const
+  {
+    x.setOnes();
+    _solveWithGuess(b,x);
+  }
+
+protected:
+
+};
+
+
+namespace internal {
+
+template<typename _MatrixType, int _UpLo, typename _Preconditioner, typename Rhs>
+struct solve_retval<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner>, Rhs>
+  : solve_retval_base<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner>, Rhs>
+{
+  typedef ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CONJUGATE_GRADIENT_H
diff --git a/include/eigen3/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/include/eigen3/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
new file mode 100644
index 0000000..b55afc1
--- /dev/null
+++ b/include/eigen3/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -0,0 +1,467 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INCOMPLETE_LUT_H
+#define EIGEN_INCOMPLETE_LUT_H
+
+
+namespace Eigen { 
+
+namespace internal {
+    
+/** \internal
+  * Compute a quick-sort split of a vector 
+  * On output, the vector row is permuted such that its elements satisfy
+  * abs(row(i)) >= abs(row(ncut)) if i<ncut
+  * abs(row(i)) <= abs(row(ncut)) if i>ncut 
+  * \param row The vector of values
+  * \param ind The array of index for the elements in @p row
+  * \param ncut  The number of largest elements to keep
+  **/ 
+template <typename VectorV, typename VectorI, typename Index>
+Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
+{
+  typedef typename VectorV::RealScalar RealScalar;
+  using std::swap;
+  using std::abs;
+  Index mid;
+  Index n = row.size(); /* length of the vector */
+  Index first, last ;
+  
+  ncut--; /* to fit the zero-based indices */
+  first = 0; 
+  last = n-1; 
+  if (ncut < first || ncut > last ) return 0;
+  
+  do {
+    mid = first; 
+    RealScalar abskey = abs(row(mid)); 
+    for (Index j = first + 1; j <= last; j++) {
+      if ( abs(row(j)) > abskey) {
+        ++mid;
+        swap(row(mid), row(j));
+        swap(ind(mid), ind(j));
+      }
+    }
+    /* Interchange for the pivot element */
+    swap(row(mid), row(first));
+    swap(ind(mid), ind(first));
+    
+    if (mid > ncut) last = mid - 1;
+    else if (mid < ncut ) first = mid + 1; 
+  } while (mid != ncut );
+  
+  return 0; /* mid is equal to ncut */ 
+}
+
+}// end namespace internal
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \class IncompleteLUT
+  * \brief Incomplete LU factorization with dual-threshold strategy
+  *
+  * During the numerical factorization, two dropping rules are used :
+  *  1) any element whose magnitude is less than some tolerance is dropped.
+  *    This tolerance is obtained by multiplying the input tolerance @p droptol 
+  *    by the average magnitude of all the original elements in the current row.
+  *  2) After the elimination of the row, only the @p fill largest elements in 
+  *    the L part and the @p fill largest elements in the U part are kept 
+  *    (in addition to the diagonal element ). Note that @p fill is computed from 
+  *    the input parameter @p fillfactor which is used the ratio to control the fill_in 
+  *    relatively to the initial number of nonzero elements.
+  * 
+  * The two extreme cases are when @p droptol=0 (to keep all the @p fill*2 largest elements)
+  * and when @p fill=n/2 with @p droptol being different to zero. 
+  * 
+  * References : Yousef Saad, ILUT: A dual threshold incomplete LU factorization, 
+  *              Numerical Linear Algebra with Applications, 1(4), pp 387-402, 1994.
+  * 
+  * NOTE : The following implementation is derived from the ILUT implementation
+  * in the SPARSKIT package, Copyright (C) 2005, the Regents of the University of Minnesota 
+  *  released under the terms of the GNU LGPL: 
+  *    http://www-users.cs.umn.edu/~saad/software/SPARSKIT/README
+  * However, Yousef Saad gave us permission to relicense his ILUT code to MPL2.
+  * See the Eigen mailing list archive, thread: ILUT, date: July 8, 2012:
+  *   http://listengine.tuxfamily.org/lists.tuxfamily.org/eigen/2012/07/msg00064.html
+  * alternatively, on GMANE:
+  *   http://comments.gmane.org/gmane.comp.lib.eigen/3302
+  */
+template <typename _Scalar>
+class IncompleteLUT : internal::noncopyable
+{
+    typedef _Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef Matrix<Scalar,Dynamic,1> Vector;
+    typedef SparseMatrix<Scalar,RowMajor> FactorType;
+    typedef SparseMatrix<Scalar,ColMajor> PermutType;
+    typedef typename FactorType::Index Index;
+
+  public:
+    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+    
+    IncompleteLUT()
+      : m_droptol(NumTraits<Scalar>::dummy_precision()), m_fillfactor(10),
+        m_analysisIsOk(false), m_factorizationIsOk(false), m_isInitialized(false)
+    {}
+    
+    template<typename MatrixType>
+    IncompleteLUT(const MatrixType& mat, const RealScalar& droptol=NumTraits<Scalar>::dummy_precision(), int fillfactor = 10)
+      : m_droptol(droptol),m_fillfactor(fillfactor),
+        m_analysisIsOk(false),m_factorizationIsOk(false),m_isInitialized(false)
+    {
+      eigen_assert(fillfactor != 0);
+      compute(mat); 
+    }
+    
+    Index rows() const { return m_lu.rows(); }
+    
+    Index cols() const { return m_lu.cols(); }
+
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful,
+      *          \c NumericalIssue if the matrix.appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "IncompleteLUT is not initialized.");
+      return m_info;
+    }
+    
+    template<typename MatrixType>
+    void analyzePattern(const MatrixType& amat);
+    
+    template<typename MatrixType>
+    void factorize(const MatrixType& amat);
+    
+    /**
+      * Compute an incomplete LU factorization with dual threshold on the matrix mat
+      * No pivoting is done in this version
+      * 
+      **/
+    template<typename MatrixType>
+    IncompleteLUT<Scalar>& compute(const MatrixType& amat)
+    {
+      analyzePattern(amat); 
+      factorize(amat);
+      m_isInitialized = m_factorizationIsOk;
+      return *this;
+    }
+
+    void setDroptol(const RealScalar& droptol); 
+    void setFillfactor(int fillfactor); 
+    
+    template<typename Rhs, typename Dest>
+    void _solve(const Rhs& b, Dest& x) const
+    {
+      x = m_Pinv * b;  
+      x = m_lu.template triangularView<UnitLower>().solve(x);
+      x = m_lu.template triangularView<Upper>().solve(x);
+      x = m_P * x; 
+    }
+
+    template<typename Rhs> inline const internal::solve_retval<IncompleteLUT, Rhs>
+     solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "IncompleteLUT is not initialized.");
+      eigen_assert(cols()==b.rows()
+                && "IncompleteLUT::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::solve_retval<IncompleteLUT, Rhs>(*this, b.derived());
+    }
+
+protected:
+
+    /** keeps off-diagonal entries; drops diagonal entries */
+    struct keep_diag {
+      inline bool operator() (const Index& row, const Index& col, const Scalar&) const
+      {
+        return row!=col;
+      }
+    };
+
+protected:
+
+    FactorType m_lu;
+    RealScalar m_droptol;
+    int m_fillfactor;
+    bool m_analysisIsOk;
+    bool m_factorizationIsOk;
+    bool m_isInitialized;
+    ComputationInfo m_info;
+    PermutationMatrix<Dynamic,Dynamic,Index> m_P;     // Fill-reducing permutation
+    PermutationMatrix<Dynamic,Dynamic,Index> m_Pinv;  // Inverse permutation
+};
+
+/**
+ * Set control parameter droptol
+ *  \param droptol   Drop any element whose magnitude is less than this tolerance 
+ **/ 
+template<typename Scalar>
+void IncompleteLUT<Scalar>::setDroptol(const RealScalar& droptol)
+{
+  this->m_droptol = droptol;   
+}
+
+/**
+ * Set control parameter fillfactor
+ * \param fillfactor  This is used to compute the  number @p fill_in of largest elements to keep on each row. 
+ **/ 
+template<typename Scalar>
+void IncompleteLUT<Scalar>::setFillfactor(int fillfactor)
+{
+  this->m_fillfactor = fillfactor;   
+}
+
+template <typename Scalar>
+template<typename _MatrixType>
+void IncompleteLUT<Scalar>::analyzePattern(const _MatrixType& amat)
+{
+  // Compute the Fill-reducing permutation
+  SparseMatrix<Scalar,ColMajor, Index> mat1 = amat;
+  SparseMatrix<Scalar,ColMajor, Index> mat2 = amat.transpose();
+  // Symmetrize the pattern
+  // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice.
+  //       on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered...
+  SparseMatrix<Scalar,ColMajor, Index> AtA = mat2 + mat1;
+  AtA.prune(keep_diag());
+  internal::minimum_degree_ordering<Scalar, Index>(AtA, m_P);  // Then compute the AMD ordering...
+
+  m_Pinv  = m_P.inverse(); // ... and the inverse permutation
+
+  m_analysisIsOk = true;
+}
+
+template <typename Scalar>
+template<typename _MatrixType>
+void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
+{
+  using std::sqrt;
+  using std::swap;
+  using std::abs;
+
+  eigen_assert((amat.rows() == amat.cols()) && "The factorization should be done on a square matrix");
+  Index n = amat.cols();  // Size of the matrix
+  m_lu.resize(n,n);
+  // Declare Working vectors and variables
+  Vector u(n) ;     // real values of the row -- maximum size is n --
+  VectorXi ju(n);   // column position of the values in u -- maximum size  is n
+  VectorXi jr(n);   // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1
+
+  // Apply the fill-reducing permutation
+  eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
+  SparseMatrix<Scalar,RowMajor, Index> mat;
+  mat = amat.twistedBy(m_Pinv);
+
+  // Initialization
+  jr.fill(-1);
+  ju.fill(0);
+  u.fill(0);
+
+  // number of largest elements to keep in each row:
+  Index fill_in =   static_cast<Index> (amat.nonZeros()*m_fillfactor)/n+1;
+  if (fill_in > n) fill_in = n;
+
+  // number of largest nonzero elements to keep in the L and the U part of the current row:
+  Index nnzL = fill_in/2;
+  Index nnzU = nnzL;
+  m_lu.reserve(n * (nnzL + nnzU + 1));
+
+  // global loop over the rows of the sparse matrix
+  for (Index ii = 0; ii < n; ii++)
+  {
+    // 1 - copy the lower and the upper part of the row i of mat in the working vector u
+
+    Index sizeu = 1; // number of nonzero elements in the upper part of the current row
+    Index sizel = 0; // number of nonzero elements in the lower part of the current row
+    ju(ii)    = ii;
+    u(ii)     = 0;
+    jr(ii)    = ii;
+    RealScalar rownorm = 0;
+
+    typename FactorType::InnerIterator j_it(mat, ii); // Iterate through the current row ii
+    for (; j_it; ++j_it)
+    {
+      Index k = j_it.index();
+      if (k < ii)
+      {
+        // copy the lower part
+        ju(sizel) = k;
+        u(sizel) = j_it.value();
+        jr(k) = sizel;
+        ++sizel;
+      }
+      else if (k == ii)
+      {
+        u(ii) = j_it.value();
+      }
+      else
+      {
+        // copy the upper part
+        Index jpos = ii + sizeu;
+        ju(jpos) = k;
+        u(jpos) = j_it.value();
+        jr(k) = jpos;
+        ++sizeu;
+      }
+      rownorm += numext::abs2(j_it.value());
+    }
+
+    // 2 - detect possible zero row
+    if(rownorm==0)
+    {
+      m_info = NumericalIssue;
+      return;
+    }
+    // Take the 2-norm of the current row as a relative tolerance
+    rownorm = sqrt(rownorm);
+
+    // 3 - eliminate the previous nonzero rows
+    Index jj = 0;
+    Index len = 0;
+    while (jj < sizel)
+    {
+      // In order to eliminate in the correct order,
+      // we must select first the smallest column index among  ju(jj:sizel)
+      Index k;
+      Index minrow = ju.segment(jj,sizel-jj).minCoeff(&k); // k is relative to the segment
+      k += jj;
+      if (minrow != ju(jj))
+      {
+        // swap the two locations
+        Index j = ju(jj);
+        swap(ju(jj), ju(k));
+        jr(minrow) = jj;   jr(j) = k;
+        swap(u(jj), u(k));
+      }
+      // Reset this location
+      jr(minrow) = -1;
+
+      // Start elimination
+      typename FactorType::InnerIterator ki_it(m_lu, minrow);
+      while (ki_it && ki_it.index() < minrow) ++ki_it;
+      eigen_internal_assert(ki_it && ki_it.col()==minrow);
+      Scalar fact = u(jj) / ki_it.value();
+
+      // drop too small elements
+      if(abs(fact) <= m_droptol)
+      {
+        jj++;
+        continue;
+      }
+
+      // linear combination of the current row ii and the row minrow
+      ++ki_it;
+      for (; ki_it; ++ki_it)
+      {
+        Scalar prod = fact * ki_it.value();
+        Index j       = ki_it.index();
+        Index jpos    = jr(j);
+        if (jpos == -1) // fill-in element
+        {
+          Index newpos;
+          if (j >= ii) // dealing with the upper part
+          {
+            newpos = ii + sizeu;
+            sizeu++;
+            eigen_internal_assert(sizeu<=n);
+          }
+          else // dealing with the lower part
+          {
+            newpos = sizel;
+            sizel++;
+            eigen_internal_assert(sizel<=ii);
+          }
+          ju(newpos) = j;
+          u(newpos) = -prod;
+          jr(j) = newpos;
+        }
+        else
+          u(jpos) -= prod;
+      }
+      // store the pivot element
+      u(len) = fact;
+      ju(len) = minrow;
+      ++len;
+
+      jj++;
+    } // end of the elimination on the row ii
+
+    // reset the upper part of the pointer jr to zero
+    for(Index k = 0; k <sizeu; k++) jr(ju(ii+k)) = -1;
+
+    // 4 - partially sort and insert the elements in the m_lu matrix
+
+    // sort the L-part of the row
+    sizel = len;
+    len = (std::min)(sizel, nnzL);
+    typename Vector::SegmentReturnType ul(u.segment(0, sizel));
+    typename VectorXi::SegmentReturnType jul(ju.segment(0, sizel));
+    internal::QuickSplit(ul, jul, len);
+
+    // store the largest m_fill elements of the L part
+    m_lu.startVec(ii);
+    for(Index k = 0; k < len; k++)
+      m_lu.insertBackByOuterInnerUnordered(ii,ju(k)) = u(k);
+
+    // store the diagonal element
+    // apply a shifting rule to avoid zero pivots (we are doing an incomplete factorization)
+    if (u(ii) == Scalar(0))
+      u(ii) = sqrt(m_droptol) * rownorm;
+    m_lu.insertBackByOuterInnerUnordered(ii, ii) = u(ii);
+
+    // sort the U-part of the row
+    // apply the dropping rule first
+    len = 0;
+    for(Index k = 1; k < sizeu; k++)
+    {
+      if(abs(u(ii+k)) > m_droptol * rownorm )
+      {
+        ++len;
+        u(ii + len)  = u(ii + k);
+        ju(ii + len) = ju(ii + k);
+      }
+    }
+    sizeu = len + 1; // +1 to take into account the diagonal element
+    len = (std::min)(sizeu, nnzU);
+    typename Vector::SegmentReturnType uu(u.segment(ii+1, sizeu-1));
+    typename VectorXi::SegmentReturnType juu(ju.segment(ii+1, sizeu-1));
+    internal::QuickSplit(uu, juu, len);
+
+    // store the largest elements of the U part
+    for(Index k = ii + 1; k < ii + len; k++)
+      m_lu.insertBackByOuterInnerUnordered(ii,ju(k)) = u(k);
+  }
+
+  m_lu.finalize();
+  m_lu.makeCompressed();
+
+  m_factorizationIsOk = true;
+  m_info = Success;
+}
+
+namespace internal {
+
+template<typename _MatrixType, typename Rhs>
+struct solve_retval<IncompleteLUT<_MatrixType>, Rhs>
+  : solve_retval_base<IncompleteLUT<_MatrixType>, Rhs>
+{
+  typedef IncompleteLUT<_MatrixType> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_INCOMPLETE_LUT_H
diff --git a/include/eigen3/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/include/eigen3/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
new file mode 100644
index 0000000..2036922
--- /dev/null
+++ b/include/eigen3/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -0,0 +1,254 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ITERATIVE_SOLVER_BASE_H
+#define EIGEN_ITERATIVE_SOLVER_BASE_H
+
+namespace Eigen { 
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief Base class for linear iterative solvers
+  *
+  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+  */
+template< typename Derived>
+class IterativeSolverBase : internal::noncopyable
+{
+public:
+  typedef typename internal::traits<Derived>::MatrixType MatrixType;
+  typedef typename internal::traits<Derived>::Preconditioner Preconditioner;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::RealScalar RealScalar;
+
+public:
+
+  Derived& derived() { return *static_cast<Derived*>(this); }
+  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+  /** Default constructor. */
+  IterativeSolverBase()
+    : mp_matrix(0)
+  {
+    init();
+  }
+
+  /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+    * 
+    * This constructor is a shortcut for the default constructor followed
+    * by a call to compute().
+    * 
+    * \warning this class stores a reference to the matrix A as well as some
+    * precomputed values that depend on it. Therefore, if \a A is changed
+    * this class becomes invalid. Call compute() to update it with the new
+    * matrix A, or modify a copy of A.
+    */
+  IterativeSolverBase(const MatrixType& A)
+  {
+    init();
+    compute(A);
+  }
+
+  ~IterativeSolverBase() {}
+  
+  /** Initializes the iterative solver for the sparcity pattern of the matrix \a A for further solving \c Ax=b problems.
+    *
+    * Currently, this function mostly call analyzePattern on the preconditioner. In the future
+    * we might, for instance, implement column reodering for faster matrix vector products.
+    */
+  Derived& analyzePattern(const MatrixType& A)
+  {
+    m_preconditioner.analyzePattern(A);
+    m_isInitialized = true;
+    m_analysisIsOk = true;
+    m_info = Success;
+    return derived();
+  }
+  
+  /** Initializes the iterative solver with the numerical values of the matrix \a A for further solving \c Ax=b problems.
+    *
+    * Currently, this function mostly call factorize on the preconditioner.
+    *
+    * \warning this class stores a reference to the matrix A as well as some
+    * precomputed values that depend on it. Therefore, if \a A is changed
+    * this class becomes invalid. Call compute() to update it with the new
+    * matrix A, or modify a copy of A.
+    */
+  Derived& factorize(const MatrixType& A)
+  {
+    eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); 
+    mp_matrix = &A;
+    m_preconditioner.factorize(A);
+    m_factorizationIsOk = true;
+    m_info = Success;
+    return derived();
+  }
+
+  /** Initializes the iterative solver with the matrix \a A for further solving \c Ax=b problems.
+    *
+    * Currently, this function mostly initialized/compute the preconditioner. In the future
+    * we might, for instance, implement column reodering for faster matrix vector products.
+    *
+    * \warning this class stores a reference to the matrix A as well as some
+    * precomputed values that depend on it. Therefore, if \a A is changed
+    * this class becomes invalid. Call compute() to update it with the new
+    * matrix A, or modify a copy of A.
+    */
+  Derived& compute(const MatrixType& A)
+  {
+    mp_matrix = &A;
+    m_preconditioner.compute(A);
+    m_isInitialized = true;
+    m_analysisIsOk = true;
+    m_factorizationIsOk = true;
+    m_info = Success;
+    return derived();
+  }
+
+  /** \internal */
+  Index rows() const { return mp_matrix ? mp_matrix->rows() : 0; }
+  /** \internal */
+  Index cols() const { return mp_matrix ? mp_matrix->cols() : 0; }
+
+  /** \returns the tolerance threshold used by the stopping criteria */
+  RealScalar tolerance() const { return m_tolerance; }
+  
+  /** Sets the tolerance threshold used by the stopping criteria */
+  Derived& setTolerance(const RealScalar& tolerance)
+  {
+    m_tolerance = tolerance;
+    return derived();
+  }
+
+  /** \returns a read-write reference to the preconditioner for custom configuration. */
+  Preconditioner& preconditioner() { return m_preconditioner; }
+  
+  /** \returns a read-only reference to the preconditioner. */
+  const Preconditioner& preconditioner() const { return m_preconditioner; }
+
+  /** \returns the max number of iterations */
+  int maxIterations() const
+  {
+    return (mp_matrix && m_maxIterations<0) ? mp_matrix->cols() : m_maxIterations;
+  }
+  
+  /** Sets the max number of iterations */
+  Derived& setMaxIterations(int maxIters)
+  {
+    m_maxIterations = maxIters;
+    return derived();
+  }
+
+  /** \returns the number of iterations performed during the last solve */
+  int iterations() const
+  {
+    eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
+    return m_iterations;
+  }
+
+  /** \returns the tolerance error reached during the last solve */
+  RealScalar error() const
+  {
+    eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
+    return m_error;
+  }
+
+  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+    *
+    * \sa compute()
+    */
+  template<typename Rhs> inline const internal::solve_retval<Derived, Rhs>
+  solve(const MatrixBase<Rhs>& b) const
+  {
+    eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
+    eigen_assert(rows()==b.rows()
+              && "IterativeSolverBase::solve(): invalid number of rows of the right hand side matrix b");
+    return internal::solve_retval<Derived, Rhs>(derived(), b.derived());
+  }
+  
+  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+    *
+    * \sa compute()
+    */
+  template<typename Rhs>
+  inline const internal::sparse_solve_retval<IterativeSolverBase, Rhs>
+  solve(const SparseMatrixBase<Rhs>& b) const
+  {
+    eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
+    eigen_assert(rows()==b.rows()
+              && "IterativeSolverBase::solve(): invalid number of rows of the right hand side matrix b");
+    return internal::sparse_solve_retval<IterativeSolverBase, Rhs>(*this, b.derived());
+  }
+
+  /** \returns Success if the iterations converged, and NoConvergence otherwise. */
+  ComputationInfo info() const
+  {
+    eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
+    return m_info;
+  }
+  
+  /** \internal */
+  template<typename Rhs, typename DestScalar, int DestOptions, typename DestIndex>
+  void _solve_sparse(const Rhs& b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
+  {
+    eigen_assert(rows()==b.rows());
+    
+    int rhsCols = b.cols();
+    int size = b.rows();
+    Eigen::Matrix<DestScalar,Dynamic,1> tb(size);
+    Eigen::Matrix<DestScalar,Dynamic,1> tx(size);
+    for(int k=0; k<rhsCols; ++k)
+    {
+      tb = b.col(k);
+      tx = derived().solve(tb);
+      dest.col(k) = tx.sparseView(0);
+    }
+  }
+
+protected:
+  void init()
+  {
+    m_isInitialized = false;
+    m_analysisIsOk = false;
+    m_factorizationIsOk = false;
+    m_maxIterations = -1;
+    m_tolerance = NumTraits<Scalar>::epsilon();
+  }
+  const MatrixType* mp_matrix;
+  Preconditioner m_preconditioner;
+
+  int m_maxIterations;
+  RealScalar m_tolerance;
+  
+  mutable RealScalar m_error;
+  mutable int m_iterations;
+  mutable ComputationInfo m_info;
+  mutable bool m_isInitialized, m_analysisIsOk, m_factorizationIsOk;
+};
+
+namespace internal {
+ 
+template<typename Derived, typename Rhs>
+struct sparse_solve_retval<IterativeSolverBase<Derived>, Rhs>
+  : sparse_solve_retval_base<IterativeSolverBase<Derived>, Rhs>
+{
+  typedef IterativeSolverBase<Derived> Dec;
+  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec().derived()._solve_sparse(rhs(),dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_ITERATIVE_SOLVER_BASE_H
diff --git a/include/eigen3/Eigen/src/Jacobi/Jacobi.h b/include/eigen3/Eigen/src/Jacobi/Jacobi.h
new file mode 100644
index 0000000..956f72d
--- /dev/null
+++ b/include/eigen3/Eigen/src/Jacobi/Jacobi.h
@@ -0,0 +1,433 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_JACOBI_H
+#define EIGEN_JACOBI_H
+
+namespace Eigen { 
+
+/** \ingroup Jacobi_Module
+  * \jacobi_module
+  * \class JacobiRotation
+  * \brief Rotation given by a cosine-sine pair.
+  *
+  * This class represents a Jacobi or Givens rotation.
+  * This is a 2D rotation in the plane \c J of angle \f$ \theta \f$ defined by
+  * its cosine \c c and sine \c s as follow:
+  * \f$ J = \left ( \begin{array}{cc} c & \overline s \\ -s  & \overline c \end{array} \right ) \f$
+  *
+  * You can apply the respective counter-clockwise rotation to a column vector \c v by
+  * applying its adjoint on the left: \f$ v = J^* v \f$ that translates to the following Eigen code:
+  * \code
+  * v.applyOnTheLeft(J.adjoint());
+  * \endcode
+  *
+  * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
+  */
+template<typename Scalar> class JacobiRotation
+{
+  public:
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    /** Default constructor without any initialization. */
+    JacobiRotation() {}
+
+    /** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */
+    JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {}
+
+    Scalar& c() { return m_c; }
+    Scalar c() const { return m_c; }
+    Scalar& s() { return m_s; }
+    Scalar s() const { return m_s; }
+
+    /** Concatenates two planar rotation */
+    JacobiRotation operator*(const JacobiRotation& other)
+    {
+      using numext::conj;
+      return JacobiRotation(m_c * other.m_c - conj(m_s) * other.m_s,
+                            conj(m_c * conj(other.m_s) + conj(m_s) * conj(other.m_c)));
+    }
+
+    /** Returns the transposed transformation */
+    JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); }
+
+    /** Returns the adjoint transformation */
+    JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); }
+
+    template<typename Derived>
+    bool makeJacobi(const MatrixBase<Derived>&, typename Derived::Index p, typename Derived::Index q);
+    bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z);
+
+    void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0);
+
+  protected:
+    void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type);
+    void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type);
+
+    Scalar m_c, m_s;
+};
+
+/** Makes \c *this as a Jacobi rotation \a J such that applying \a J on both the right and left sides of the selfadjoint 2x2 matrix
+  * \f$ B = \left ( \begin{array}{cc} x & y \\ \overline y & z \end{array} \right )\f$ yields a diagonal matrix \f$ A = J^* B J \f$
+  *
+  * \sa MatrixBase::makeJacobi(const MatrixBase<Derived>&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
+  */
+template<typename Scalar>
+bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z)
+{
+  using std::sqrt;
+  using std::abs;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  if(y == Scalar(0))
+  {
+    m_c = Scalar(1);
+    m_s = Scalar(0);
+    return false;
+  }
+  else
+  {
+    RealScalar tau = (x-z)/(RealScalar(2)*abs(y));
+    RealScalar w = sqrt(numext::abs2(tau) + RealScalar(1));
+    RealScalar t;
+    if(tau>RealScalar(0))
+    {
+      t = RealScalar(1) / (tau + w);
+    }
+    else
+    {
+      t = RealScalar(1) / (tau - w);
+    }
+    RealScalar sign_t = t > RealScalar(0) ? RealScalar(1) : RealScalar(-1);
+    RealScalar n = RealScalar(1) / sqrt(numext::abs2(t)+RealScalar(1));
+    m_s = - sign_t * (numext::conj(y) / abs(y)) * abs(t) * n;
+    m_c = n;
+    return true;
+  }
+}
+
+/** Makes \c *this as a Jacobi rotation \c J such that applying \a J on both the right and left sides of the 2x2 selfadjoint matrix
+  * \f$ B = \left ( \begin{array}{cc} \text{this}_{pp} & \text{this}_{pq} \\ (\text{this}_{pq})^* & \text{this}_{qq} \end{array} \right )\f$ yields
+  * a diagonal matrix \f$ A = J^* B J \f$
+  *
+  * Example: \include Jacobi_makeJacobi.cpp
+  * Output: \verbinclude Jacobi_makeJacobi.out
+  *
+  * \sa JacobiRotation::makeJacobi(RealScalar, Scalar, RealScalar), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
+  */
+template<typename Scalar>
+template<typename Derived>
+inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, typename Derived::Index p, typename Derived::Index q)
+{
+  return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q)));
+}
+
+/** Makes \c *this as a Givens rotation \c G such that applying \f$ G^* \f$ to the left of the vector
+  * \f$ V = \left ( \begin{array}{c} p \\ q \end{array} \right )\f$ yields:
+  * \f$ G^* V = \left ( \begin{array}{c} r \\ 0 \end{array} \right )\f$.
+  *
+  * The value of \a z is returned if \a z is not null (the default is null).
+  * Also note that G is built such that the cosine is always real.
+  *
+  * Example: \include Jacobi_makeGivens.cpp
+  * Output: \verbinclude Jacobi_makeGivens.out
+  *
+  * This function implements the continuous Givens rotation generation algorithm
+  * found in Anderson (2000), Discontinuous Plane Rotations and the Symmetric Eigenvalue Problem.
+  * LAPACK Working Note 150, University of Tennessee, UT-CS-00-454, December 4, 2000.
+  *
+  * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
+  */
+template<typename Scalar>
+void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* z)
+{
+  makeGivens(p, q, z, typename internal::conditional<NumTraits<Scalar>::IsComplex, internal::true_type, internal::false_type>::type());
+}
+
+
+// specialization for complexes
+template<typename Scalar>
+void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type)
+{
+  using std::sqrt;
+  using std::abs;
+  using numext::conj;
+  
+  if(q==Scalar(0))
+  {
+    m_c = numext::real(p)<0 ? Scalar(-1) : Scalar(1);
+    m_s = 0;
+    if(r) *r = m_c * p;
+  }
+  else if(p==Scalar(0))
+  {
+    m_c = 0;
+    m_s = -q/abs(q);
+    if(r) *r = abs(q);
+  }
+  else
+  {
+    RealScalar p1 = numext::norm1(p);
+    RealScalar q1 = numext::norm1(q);
+    if(p1>=q1)
+    {
+      Scalar ps = p / p1;
+      RealScalar p2 = numext::abs2(ps);
+      Scalar qs = q / p1;
+      RealScalar q2 = numext::abs2(qs);
+
+      RealScalar u = sqrt(RealScalar(1) + q2/p2);
+      if(numext::real(p)<RealScalar(0))
+        u = -u;
+
+      m_c = Scalar(1)/u;
+      m_s = -qs*conj(ps)*(m_c/p2);
+      if(r) *r = p * u;
+    }
+    else
+    {
+      Scalar ps = p / q1;
+      RealScalar p2 = numext::abs2(ps);
+      Scalar qs = q / q1;
+      RealScalar q2 = numext::abs2(qs);
+
+      RealScalar u = q1 * sqrt(p2 + q2);
+      if(numext::real(p)<RealScalar(0))
+        u = -u;
+
+      p1 = abs(p);
+      ps = p/p1;
+      m_c = p1/u;
+      m_s = -conj(ps) * (q/u);
+      if(r) *r = ps * u;
+    }
+  }
+}
+
+// specialization for reals
+template<typename Scalar>
+void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type)
+{
+  using std::sqrt;
+  using std::abs;
+  if(q==Scalar(0))
+  {
+    m_c = p<Scalar(0) ? Scalar(-1) : Scalar(1);
+    m_s = Scalar(0);
+    if(r) *r = abs(p);
+  }
+  else if(p==Scalar(0))
+  {
+    m_c = Scalar(0);
+    m_s = q<Scalar(0) ? Scalar(1) : Scalar(-1);
+    if(r) *r = abs(q);
+  }
+  else if(abs(p) > abs(q))
+  {
+    Scalar t = q/p;
+    Scalar u = sqrt(Scalar(1) + numext::abs2(t));
+    if(p<Scalar(0))
+      u = -u;
+    m_c = Scalar(1)/u;
+    m_s = -t * m_c;
+    if(r) *r = p * u;
+  }
+  else
+  {
+    Scalar t = p/q;
+    Scalar u = sqrt(Scalar(1) + numext::abs2(t));
+    if(q<Scalar(0))
+      u = -u;
+    m_s = -Scalar(1)/u;
+    m_c = -t * m_s;
+    if(r) *r = q * u;
+  }
+
+}
+
+/****************************************************************************************
+*   Implementation of MatrixBase methods
+****************************************************************************************/
+
+/** \jacobi_module
+  * Applies the clock wise 2D rotation \a j to the set of 2D vectors of cordinates \a x and \a y:
+  * \f$ \left ( \begin{array}{cc} x \\ y \end{array} \right )  =  J \left ( \begin{array}{cc} x \\ y \end{array} \right ) \f$
+  *
+  * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
+  */
+namespace internal {
+template<typename VectorX, typename VectorY, typename OtherScalar>
+void apply_rotation_in_the_plane(VectorX& _x, VectorY& _y, const JacobiRotation<OtherScalar>& j);
+}
+
+/** \jacobi_module
+  * Applies the rotation in the plane \a j to the rows \a p and \a q of \c *this, i.e., it computes B = J * B,
+  * with \f$ B = \left ( \begin{array}{cc} \text{*this.row}(p) \\ \text{*this.row}(q) \end{array} \right ) \f$.
+  *
+  * \sa class JacobiRotation, MatrixBase::applyOnTheRight(), internal::apply_rotation_in_the_plane()
+  */
+template<typename Derived>
+template<typename OtherScalar>
+inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j)
+{
+  RowXpr x(this->row(p));
+  RowXpr y(this->row(q));
+  internal::apply_rotation_in_the_plane(x, y, j);
+}
+
+/** \ingroup Jacobi_Module
+  * Applies the rotation in the plane \a j to the columns \a p and \a q of \c *this, i.e., it computes B = B * J
+  * with \f$ B = \left ( \begin{array}{cc} \text{*this.col}(p) & \text{*this.col}(q) \end{array} \right ) \f$.
+  *
+  * \sa class JacobiRotation, MatrixBase::applyOnTheLeft(), internal::apply_rotation_in_the_plane()
+  */
+template<typename Derived>
+template<typename OtherScalar>
+inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j)
+{
+  ColXpr x(this->col(p));
+  ColXpr y(this->col(q));
+  internal::apply_rotation_in_the_plane(x, y, j.transpose());
+}
+
+namespace internal {
+template<typename VectorX, typename VectorY, typename OtherScalar>
+void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(VectorX& _x, VectorY& _y, const JacobiRotation<OtherScalar>& j)
+{
+  typedef typename VectorX::Index Index;
+  typedef typename VectorX::Scalar Scalar;
+  enum { PacketSize = packet_traits<Scalar>::size };
+  typedef typename packet_traits<Scalar>::type Packet;
+  eigen_assert(_x.size() == _y.size());
+  Index size = _x.size();
+  Index incrx = _x.innerStride();
+  Index incry = _y.innerStride();
+
+  Scalar* EIGEN_RESTRICT x = &_x.coeffRef(0);
+  Scalar* EIGEN_RESTRICT y = &_y.coeffRef(0);
+  
+  OtherScalar c = j.c();
+  OtherScalar s = j.s();
+  if (c==OtherScalar(1) && s==OtherScalar(0))
+    return;
+
+  /*** dynamic-size vectorized paths ***/
+
+  if(VectorX::SizeAtCompileTime == Dynamic &&
+    (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
+    ((incrx==1 && incry==1) || PacketSize == 1))
+  {
+    // both vectors are sequentially stored in memory => vectorization
+    enum { Peeling = 2 };
+
+    Index alignedStart = internal::first_aligned(y, size);
+    Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
+
+    const Packet pc = pset1<Packet>(c);
+    const Packet ps = pset1<Packet>(s);
+    conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex,false> pcj;
+
+    for(Index i=0; i<alignedStart; ++i)
+    {
+      Scalar xi = x[i];
+      Scalar yi = y[i];
+      x[i] =  c * xi + numext::conj(s) * yi;
+      y[i] = -s * xi + numext::conj(c) * yi;
+    }
+
+    Scalar* EIGEN_RESTRICT px = x + alignedStart;
+    Scalar* EIGEN_RESTRICT py = y + alignedStart;
+
+    if(internal::first_aligned(x, size)==alignedStart)
+    {
+      for(Index i=alignedStart; i<alignedEnd; i+=PacketSize)
+      {
+        Packet xi = pload<Packet>(px);
+        Packet yi = pload<Packet>(py);
+        pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
+        pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
+        px += PacketSize;
+        py += PacketSize;
+      }
+    }
+    else
+    {
+      Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize);
+      for(Index i=alignedStart; i<peelingEnd; i+=Peeling*PacketSize)
+      {
+        Packet xi   = ploadu<Packet>(px);
+        Packet xi1  = ploadu<Packet>(px+PacketSize);
+        Packet yi   = pload <Packet>(py);
+        Packet yi1  = pload <Packet>(py+PacketSize);
+        pstoreu(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
+        pstoreu(px+PacketSize, padd(pmul(pc,xi1),pcj.pmul(ps,yi1)));
+        pstore (py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
+        pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pmul(ps,xi1)));
+        px += Peeling*PacketSize;
+        py += Peeling*PacketSize;
+      }
+      if(alignedEnd!=peelingEnd)
+      {
+        Packet xi = ploadu<Packet>(x+peelingEnd);
+        Packet yi = pload <Packet>(y+peelingEnd);
+        pstoreu(x+peelingEnd, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
+        pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
+      }
+    }
+
+    for(Index i=alignedEnd; i<size; ++i)
+    {
+      Scalar xi = x[i];
+      Scalar yi = y[i];
+      x[i] =  c * xi + numext::conj(s) * yi;
+      y[i] = -s * xi + numext::conj(c) * yi;
+    }
+  }
+
+  /*** fixed-size vectorized path ***/
+  else if(VectorX::SizeAtCompileTime != Dynamic &&
+          (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
+          (VectorX::Flags & VectorY::Flags & AlignedBit))
+  {
+    const Packet pc = pset1<Packet>(c);
+    const Packet ps = pset1<Packet>(s);
+    conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex,false> pcj;
+    Scalar* EIGEN_RESTRICT px = x;
+    Scalar* EIGEN_RESTRICT py = y;
+    for(Index i=0; i<size; i+=PacketSize)
+    {
+      Packet xi = pload<Packet>(px);
+      Packet yi = pload<Packet>(py);
+      pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
+      pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
+      px += PacketSize;
+      py += PacketSize;
+    }
+  }
+
+  /*** non-vectorized path ***/
+  else
+  {
+    for(Index i=0; i<size; ++i)
+    {
+      Scalar xi = *x;
+      Scalar yi = *y;
+      *x =  c * xi + numext::conj(s) * yi;
+      *y = -s * xi + numext::conj(c) * yi;
+      x += incrx;
+      y += incry;
+    }
+  }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_JACOBI_H
diff --git a/include/eigen3/Eigen/src/LU/Determinant.h b/include/eigen3/Eigen/src/LU/Determinant.h
new file mode 100644
index 0000000..bb8e78a
--- /dev/null
+++ b/include/eigen3/Eigen/src/LU/Determinant.h
@@ -0,0 +1,101 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DETERMINANT_H
+#define EIGEN_DETERMINANT_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Derived>
+inline const typename Derived::Scalar bruteforce_det3_helper
+(const MatrixBase<Derived>& matrix, int a, int b, int c)
+{
+  return matrix.coeff(0,a)
+         * (matrix.coeff(1,b) * matrix.coeff(2,c) - matrix.coeff(1,c) * matrix.coeff(2,b));
+}
+
+template<typename Derived>
+const typename Derived::Scalar bruteforce_det4_helper
+(const MatrixBase<Derived>& matrix, int j, int k, int m, int n)
+{
+  return (matrix.coeff(j,0) * matrix.coeff(k,1) - matrix.coeff(k,0) * matrix.coeff(j,1))
+       * (matrix.coeff(m,2) * matrix.coeff(n,3) - matrix.coeff(n,2) * matrix.coeff(m,3));
+}
+
+template<typename Derived,
+         int DeterminantType = Derived::RowsAtCompileTime
+> struct determinant_impl
+{
+  static inline typename traits<Derived>::Scalar run(const Derived& m)
+  {
+    if(Derived::ColsAtCompileTime==Dynamic && m.rows()==0)
+      return typename traits<Derived>::Scalar(1);
+    return m.partialPivLu().determinant();
+  }
+};
+
+template<typename Derived> struct determinant_impl<Derived, 1>
+{
+  static inline typename traits<Derived>::Scalar run(const Derived& m)
+  {
+    return m.coeff(0,0);
+  }
+};
+
+template<typename Derived> struct determinant_impl<Derived, 2>
+{
+  static inline typename traits<Derived>::Scalar run(const Derived& m)
+  {
+    return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1);
+  }
+};
+
+template<typename Derived> struct determinant_impl<Derived, 3>
+{
+  static inline typename traits<Derived>::Scalar run(const Derived& m)
+  {
+    return bruteforce_det3_helper(m,0,1,2)
+          - bruteforce_det3_helper(m,1,0,2)
+          + bruteforce_det3_helper(m,2,0,1);
+  }
+};
+
+template<typename Derived> struct determinant_impl<Derived, 4>
+{
+  static typename traits<Derived>::Scalar run(const Derived& m)
+  {
+    // trick by Martin Costabel to compute 4x4 det with only 30 muls
+    return bruteforce_det4_helper(m,0,1,2,3)
+          - bruteforce_det4_helper(m,0,2,1,3)
+          + bruteforce_det4_helper(m,0,3,1,2)
+          + bruteforce_det4_helper(m,1,2,0,3)
+          - bruteforce_det4_helper(m,1,3,0,2)
+          + bruteforce_det4_helper(m,2,3,0,1);
+  }
+};
+
+} // end namespace internal
+
+/** \lu_module
+  *
+  * \returns the determinant of this matrix
+  */
+template<typename Derived>
+inline typename internal::traits<Derived>::Scalar MatrixBase<Derived>::determinant() const
+{
+  eigen_assert(rows() == cols());
+  typedef typename internal::nested<Derived,Base::RowsAtCompileTime>::type Nested;
+  return internal::determinant_impl<typename internal::remove_all<Nested>::type>::run(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_DETERMINANT_H
diff --git a/include/eigen3/Eigen/src/LU/FullPivLU.h b/include/eigen3/Eigen/src/LU/FullPivLU.h
new file mode 100644
index 0000000..79ab6a8
--- /dev/null
+++ b/include/eigen3/Eigen/src/LU/FullPivLU.h
@@ -0,0 +1,743 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LU_H
+#define EIGEN_LU_H
+
+namespace Eigen { 
+
+/** \ingroup LU_Module
+  *
+  * \class FullPivLU
+  *
+  * \brief LU decomposition of a matrix with complete pivoting, and related features
+  *
+  * \param MatrixType the type of the matrix of which we are computing the LU decomposition
+  *
+  * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is
+  * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is
+  * upper-triangular, and P and Q are permutation matrices. This is a rank-revealing LU
+  * decomposition. The eigenvalues (diagonal coefficients) of U are sorted in such a way that any
+  * zeros are at the end.
+  *
+  * This decomposition provides the generic approach to solving systems of linear equations, computing
+  * the rank, invertibility, inverse, kernel, and determinant.
+  *
+  * This LU decomposition is very stable and well tested with large matrices. However there are use cases where the SVD
+  * decomposition is inherently more stable and/or flexible. For example, when computing the kernel of a matrix,
+  * working with the SVD allows to select the smallest singular values of the matrix, something that
+  * the LU decomposition doesn't see.
+  *
+  * The data of the LU decomposition can be directly accessed through the methods matrixLU(),
+  * permutationP(), permutationQ().
+  *
+  * As an exemple, here is how the original matrix can be retrieved:
+  * \include class_FullPivLU.cpp
+  * Output: \verbinclude class_FullPivLU.out
+  *
+  * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
+  */
+template<typename _MatrixType> class FullPivLU
+{
+  public:
+    typedef _MatrixType MatrixType;
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      Options = MatrixType::Options,
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+    typedef typename internal::traits<MatrixType>::StorageKind StorageKind;
+    typedef typename MatrixType::Index Index;
+    typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
+    typedef typename internal::plain_col_type<MatrixType, Index>::type IntColVectorType;
+    typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationQType;
+    typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationPType;
+
+    /**
+      * \brief Default Constructor.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via LU::compute(const MatrixType&).
+      */
+    FullPivLU();
+
+    /** \brief Default Constructor with memory preallocation
+      *
+      * Like the default constructor but with preallocation of the internal data
+      * according to the specified problem \a size.
+      * \sa FullPivLU()
+      */
+    FullPivLU(Index rows, Index cols);
+
+    /** Constructor.
+      *
+      * \param matrix the matrix of which to compute the LU decomposition.
+      *               It is required to be nonzero.
+      */
+    FullPivLU(const MatrixType& matrix);
+
+    /** Computes the LU decomposition of the given matrix.
+      *
+      * \param matrix the matrix of which to compute the LU decomposition.
+      *               It is required to be nonzero.
+      *
+      * \returns a reference to *this
+      */
+    FullPivLU& compute(const MatrixType& matrix);
+
+    /** \returns the LU decomposition matrix: the upper-triangular part is U, the
+      * unit-lower-triangular part is L (at least for square matrices; in the non-square
+      * case, special care is needed, see the documentation of class FullPivLU).
+      *
+      * \sa matrixL(), matrixU()
+      */
+    inline const MatrixType& matrixLU() const
+    {
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      return m_lu;
+    }
+
+    /** \returns the number of nonzero pivots in the LU decomposition.
+      * Here nonzero is meant in the exact sense, not in a fuzzy sense.
+      * So that notion isn't really intrinsically interesting, but it is
+      * still useful when implementing algorithms.
+      *
+      * \sa rank()
+      */
+    inline Index nonzeroPivots() const
+    {
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      return m_nonzero_pivots;
+    }
+
+    /** \returns the absolute value of the biggest pivot, i.e. the biggest
+      *          diagonal coefficient of U.
+      */
+    RealScalar maxPivot() const { return m_maxpivot; }
+
+    /** \returns the permutation matrix P
+      *
+      * \sa permutationQ()
+      */
+    inline const PermutationPType& permutationP() const
+    {
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      return m_p;
+    }
+
+    /** \returns the permutation matrix Q
+      *
+      * \sa permutationP()
+      */
+    inline const PermutationQType& permutationQ() const
+    {
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      return m_q;
+    }
+
+    /** \returns the kernel of the matrix, also called its null-space. The columns of the returned matrix
+      * will form a basis of the kernel.
+      *
+      * \note If the kernel has dimension zero, then the returned matrix is a column-vector filled with zeros.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      *
+      * Example: \include FullPivLU_kernel.cpp
+      * Output: \verbinclude FullPivLU_kernel.out
+      *
+      * \sa image()
+      */
+    inline const internal::kernel_retval<FullPivLU> kernel() const
+    {
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      return internal::kernel_retval<FullPivLU>(*this);
+    }
+
+    /** \returns the image of the matrix, also called its column-space. The columns of the returned matrix
+      * will form a basis of the kernel.
+      *
+      * \param originalMatrix the original matrix, of which *this is the LU decomposition.
+      *                       The reason why it is needed to pass it here, is that this allows
+      *                       a large optimization, as otherwise this method would need to reconstruct it
+      *                       from the LU decomposition.
+      *
+      * \note If the image has dimension zero, then the returned matrix is a column-vector filled with zeros.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      *
+      * Example: \include FullPivLU_image.cpp
+      * Output: \verbinclude FullPivLU_image.out
+      *
+      * \sa kernel()
+      */
+    inline const internal::image_retval<FullPivLU>
+      image(const MatrixType& originalMatrix) const
+    {
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      return internal::image_retval<FullPivLU>(*this, originalMatrix);
+    }
+
+    /** \return a solution x to the equation Ax=b, where A is the matrix of which
+      * *this is the LU decomposition.
+      *
+      * \param b the right-hand-side of the equation to solve. Can be a vector or a matrix,
+      *          the only requirement in order for the equation to make sense is that
+      *          b.rows()==A.rows(), where A is the matrix of which *this is the LU decomposition.
+      *
+      * \returns a solution.
+      *
+      * \note_about_checking_solutions
+      *
+      * \note_about_arbitrary_choice_of_solution
+      * \note_about_using_kernel_to_study_multiple_solutions
+      *
+      * Example: \include FullPivLU_solve.cpp
+      * Output: \verbinclude FullPivLU_solve.out
+      *
+      * \sa TriangularView::solve(), kernel(), inverse()
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<FullPivLU, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      return internal::solve_retval<FullPivLU, Rhs>(*this, b.derived());
+    }
+
+    /** \returns the determinant of the matrix of which
+      * *this is the LU decomposition. It has only linear complexity
+      * (that is, O(n) where n is the dimension of the square matrix)
+      * as the LU decomposition has already been computed.
+      *
+      * \note This is only for square matrices.
+      *
+      * \note For fixed-size matrices of size up to 4, MatrixBase::determinant() offers
+      *       optimized paths.
+      *
+      * \warning a determinant can be very big or small, so for matrices
+      * of large enough dimension, there is a risk of overflow/underflow.
+      *
+      * \sa MatrixBase::determinant()
+      */
+    typename internal::traits<MatrixType>::Scalar determinant() const;
+
+    /** Allows to prescribe a threshold to be used by certain methods, such as rank(),
+      * who need to determine when pivots are to be considered nonzero. This is not used for the
+      * LU decomposition itself.
+      *
+      * When it needs to get the threshold value, Eigen calls threshold(). By default, this
+      * uses a formula to automatically determine a reasonable threshold.
+      * Once you have called the present method setThreshold(const RealScalar&),
+      * your value is used instead.
+      *
+      * \param threshold The new value to use as the threshold.
+      *
+      * A pivot will be considered nonzero if its absolute value is strictly greater than
+      *  \f$ \vert pivot \vert \leqslant threshold \times \vert maxpivot \vert \f$
+      * where maxpivot is the biggest pivot.
+      *
+      * If you want to come back to the default behavior, call setThreshold(Default_t)
+      */
+    FullPivLU& setThreshold(const RealScalar& threshold)
+    {
+      m_usePrescribedThreshold = true;
+      m_prescribedThreshold = threshold;
+      return *this;
+    }
+
+    /** Allows to come back to the default behavior, letting Eigen use its default formula for
+      * determining the threshold.
+      *
+      * You should pass the special object Eigen::Default as parameter here.
+      * \code lu.setThreshold(Eigen::Default); \endcode
+      *
+      * See the documentation of setThreshold(const RealScalar&).
+      */
+    FullPivLU& setThreshold(Default_t)
+    {
+      m_usePrescribedThreshold = false;
+      return *this;
+    }
+
+    /** Returns the threshold that will be used by certain methods such as rank().
+      *
+      * See the documentation of setThreshold(const RealScalar&).
+      */
+    RealScalar threshold() const
+    {
+      eigen_assert(m_isInitialized || m_usePrescribedThreshold);
+      return m_usePrescribedThreshold ? m_prescribedThreshold
+      // this formula comes from experimenting (see "LU precision tuning" thread on the list)
+      // and turns out to be identical to Higham's formula used already in LDLt.
+                                      : NumTraits<Scalar>::epsilon() * m_lu.diagonalSize();
+    }
+
+    /** \returns the rank of the matrix of which *this is the LU decomposition.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline Index rank() const
+    {
+      using std::abs;
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold();
+      Index result = 0;
+      for(Index i = 0; i < m_nonzero_pivots; ++i)
+        result += (abs(m_lu.coeff(i,i)) > premultiplied_threshold);
+      return result;
+    }
+
+    /** \returns the dimension of the kernel of the matrix of which *this is the LU decomposition.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline Index dimensionOfKernel() const
+    {
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      return cols() - rank();
+    }
+
+    /** \returns true if the matrix of which *this is the LU decomposition represents an injective
+      *          linear map, i.e. has trivial kernel; false otherwise.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline bool isInjective() const
+    {
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      return rank() == cols();
+    }
+
+    /** \returns true if the matrix of which *this is the LU decomposition represents a surjective
+      *          linear map; false otherwise.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline bool isSurjective() const
+    {
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      return rank() == rows();
+    }
+
+    /** \returns true if the matrix of which *this is the LU decomposition is invertible.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline bool isInvertible() const
+    {
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      return isInjective() && (m_lu.rows() == m_lu.cols());
+    }
+
+    /** \returns the inverse of the matrix of which *this is the LU decomposition.
+      *
+      * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
+      *       Use isInvertible() to first determine whether this matrix is invertible.
+      *
+      * \sa MatrixBase::inverse()
+      */
+    inline const internal::solve_retval<FullPivLU,typename MatrixType::IdentityReturnType> inverse() const
+    {
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      eigen_assert(m_lu.rows() == m_lu.cols() && "You can't take the inverse of a non-square matrix!");
+      return internal::solve_retval<FullPivLU,typename MatrixType::IdentityReturnType>
+               (*this, MatrixType::Identity(m_lu.rows(), m_lu.cols()));
+    }
+
+    MatrixType reconstructedMatrix() const;
+
+    inline Index rows() const { return m_lu.rows(); }
+    inline Index cols() const { return m_lu.cols(); }
+
+  protected:
+    MatrixType m_lu;
+    PermutationPType m_p;
+    PermutationQType m_q;
+    IntColVectorType m_rowsTranspositions;
+    IntRowVectorType m_colsTranspositions;
+    Index m_det_pq, m_nonzero_pivots;
+    RealScalar m_maxpivot, m_prescribedThreshold;
+    bool m_isInitialized, m_usePrescribedThreshold;
+};
+
+template<typename MatrixType>
+FullPivLU<MatrixType>::FullPivLU()
+  : m_isInitialized(false), m_usePrescribedThreshold(false)
+{
+}
+
+template<typename MatrixType>
+FullPivLU<MatrixType>::FullPivLU(Index rows, Index cols)
+  : m_lu(rows, cols),
+    m_p(rows),
+    m_q(cols),
+    m_rowsTranspositions(rows),
+    m_colsTranspositions(cols),
+    m_isInitialized(false),
+    m_usePrescribedThreshold(false)
+{
+}
+
+template<typename MatrixType>
+FullPivLU<MatrixType>::FullPivLU(const MatrixType& matrix)
+  : m_lu(matrix.rows(), matrix.cols()),
+    m_p(matrix.rows()),
+    m_q(matrix.cols()),
+    m_rowsTranspositions(matrix.rows()),
+    m_colsTranspositions(matrix.cols()),
+    m_isInitialized(false),
+    m_usePrescribedThreshold(false)
+{
+  compute(matrix);
+}
+
+template<typename MatrixType>
+FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
+{
+  // the permutations are stored as int indices, so just to be sure:
+  eigen_assert(matrix.rows()<=NumTraits<int>::highest() && matrix.cols()<=NumTraits<int>::highest());
+  
+  m_isInitialized = true;
+  m_lu = matrix;
+
+  const Index size = matrix.diagonalSize();
+  const Index rows = matrix.rows();
+  const Index cols = matrix.cols();
+
+  // will store the transpositions, before we accumulate them at the end.
+  // can't accumulate on-the-fly because that will be done in reverse order for the rows.
+  m_rowsTranspositions.resize(matrix.rows());
+  m_colsTranspositions.resize(matrix.cols());
+  Index number_of_transpositions = 0; // number of NONTRIVIAL transpositions, i.e. m_rowsTranspositions[i]!=i
+
+  m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
+  m_maxpivot = RealScalar(0);
+
+  for(Index k = 0; k < size; ++k)
+  {
+    // First, we need to find the pivot.
+
+    // biggest coefficient in the remaining bottom-right corner (starting at row k, col k)
+    Index row_of_biggest_in_corner, col_of_biggest_in_corner;
+    RealScalar biggest_in_corner;
+    biggest_in_corner = m_lu.bottomRightCorner(rows-k, cols-k)
+                        .cwiseAbs()
+                        .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
+    row_of_biggest_in_corner += k; // correct the values! since they were computed in the corner,
+    col_of_biggest_in_corner += k; // need to add k to them.
+
+    if(biggest_in_corner==RealScalar(0))
+    {
+      // before exiting, make sure to initialize the still uninitialized transpositions
+      // in a sane state without destroying what we already have.
+      m_nonzero_pivots = k;
+      for(Index i = k; i < size; ++i)
+      {
+        m_rowsTranspositions.coeffRef(i) = i;
+        m_colsTranspositions.coeffRef(i) = i;
+      }
+      break;
+    }
+
+    if(biggest_in_corner > m_maxpivot) m_maxpivot = biggest_in_corner;
+
+    // Now that we've found the pivot, we need to apply the row/col swaps to
+    // bring it to the location (k,k).
+
+    m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner;
+    m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner;
+    if(k != row_of_biggest_in_corner) {
+      m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner));
+      ++number_of_transpositions;
+    }
+    if(k != col_of_biggest_in_corner) {
+      m_lu.col(k).swap(m_lu.col(col_of_biggest_in_corner));
+      ++number_of_transpositions;
+    }
+
+    // Now that the pivot is at the right location, we update the remaining
+    // bottom-right corner by Gaussian elimination.
+
+    if(k<rows-1)
+      m_lu.col(k).tail(rows-k-1) /= m_lu.coeff(k,k);
+    if(k<size-1)
+      m_lu.block(k+1,k+1,rows-k-1,cols-k-1).noalias() -= m_lu.col(k).tail(rows-k-1) * m_lu.row(k).tail(cols-k-1);
+  }
+
+  // the main loop is over, we still have to accumulate the transpositions to find the
+  // permutations P and Q
+
+  m_p.setIdentity(rows);
+  for(Index k = size-1; k >= 0; --k)
+    m_p.applyTranspositionOnTheRight(k, m_rowsTranspositions.coeff(k));
+
+  m_q.setIdentity(cols);
+  for(Index k = 0; k < size; ++k)
+    m_q.applyTranspositionOnTheRight(k, m_colsTranspositions.coeff(k));
+
+  m_det_pq = (number_of_transpositions%2) ? -1 : 1;
+  return *this;
+}
+
+template<typename MatrixType>
+typename internal::traits<MatrixType>::Scalar FullPivLU<MatrixType>::determinant() const
+{
+  eigen_assert(m_isInitialized && "LU is not initialized.");
+  eigen_assert(m_lu.rows() == m_lu.cols() && "You can't take the determinant of a non-square matrix!");
+  return Scalar(m_det_pq) * Scalar(m_lu.diagonal().prod());
+}
+
+/** \returns the matrix represented by the decomposition,
+ * i.e., it returns the product: \f$ P^{-1} L U Q^{-1} \f$.
+ * This function is provided for debug purposes. */
+template<typename MatrixType>
+MatrixType FullPivLU<MatrixType>::reconstructedMatrix() const
+{
+  eigen_assert(m_isInitialized && "LU is not initialized.");
+  const Index smalldim = (std::min)(m_lu.rows(), m_lu.cols());
+  // LU
+  MatrixType res(m_lu.rows(),m_lu.cols());
+  // FIXME the .toDenseMatrix() should not be needed...
+  res = m_lu.leftCols(smalldim)
+            .template triangularView<UnitLower>().toDenseMatrix()
+      * m_lu.topRows(smalldim)
+            .template triangularView<Upper>().toDenseMatrix();
+
+  // P^{-1}(LU)
+  res = m_p.inverse() * res;
+
+  // (P^{-1}LU)Q^{-1}
+  res = res * m_q.inverse();
+
+  return res;
+}
+
+/********* Implementation of kernel() **************************************************/
+
+namespace internal {
+template<typename _MatrixType>
+struct kernel_retval<FullPivLU<_MatrixType> >
+  : kernel_retval_base<FullPivLU<_MatrixType> >
+{
+  EIGEN_MAKE_KERNEL_HELPERS(FullPivLU<_MatrixType>)
+
+  enum { MaxSmallDimAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(
+            MatrixType::MaxColsAtCompileTime,
+            MatrixType::MaxRowsAtCompileTime)
+  };
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    using std::abs;
+    const Index cols = dec().matrixLU().cols(), dimker = cols - rank();
+    if(dimker == 0)
+    {
+      // The Kernel is just {0}, so it doesn't have a basis properly speaking, but let's
+      // avoid crashing/asserting as that depends on floating point calculations. Let's
+      // just return a single column vector filled with zeros.
+      dst.setZero();
+      return;
+    }
+
+    /* Let us use the following lemma:
+      *
+      * Lemma: If the matrix A has the LU decomposition PAQ = LU,
+      * then Ker A = Q(Ker U).
+      *
+      * Proof: trivial: just keep in mind that P, Q, L are invertible.
+      */
+
+    /* Thus, all we need to do is to compute Ker U, and then apply Q.
+      *
+      * U is upper triangular, with eigenvalues sorted so that any zeros appear at the end.
+      * Thus, the diagonal of U ends with exactly
+      * dimKer zero's. Let us use that to construct dimKer linearly
+      * independent vectors in Ker U.
+      */
+
+    Matrix<Index, Dynamic, 1, 0, MaxSmallDimAtCompileTime, 1> pivots(rank());
+    RealScalar premultiplied_threshold = dec().maxPivot() * dec().threshold();
+    Index p = 0;
+    for(Index i = 0; i < dec().nonzeroPivots(); ++i)
+      if(abs(dec().matrixLU().coeff(i,i)) > premultiplied_threshold)
+        pivots.coeffRef(p++) = i;
+    eigen_internal_assert(p == rank());
+
+    // we construct a temporaty trapezoid matrix m, by taking the U matrix and
+    // permuting the rows and cols to bring the nonnegligible pivots to the top of
+    // the main diagonal. We need that to be able to apply our triangular solvers.
+    // FIXME when we get triangularView-for-rectangular-matrices, this can be simplified
+    Matrix<typename MatrixType::Scalar, Dynamic, Dynamic, MatrixType::Options,
+           MaxSmallDimAtCompileTime, MatrixType::MaxColsAtCompileTime>
+      m(dec().matrixLU().block(0, 0, rank(), cols));
+    for(Index i = 0; i < rank(); ++i)
+    {
+      if(i) m.row(i).head(i).setZero();
+      m.row(i).tail(cols-i) = dec().matrixLU().row(pivots.coeff(i)).tail(cols-i);
+    }
+    m.block(0, 0, rank(), rank());
+    m.block(0, 0, rank(), rank()).template triangularView<StrictlyLower>().setZero();
+    for(Index i = 0; i < rank(); ++i)
+      m.col(i).swap(m.col(pivots.coeff(i)));
+
+    // ok, we have our trapezoid matrix, we can apply the triangular solver.
+    // notice that the math behind this suggests that we should apply this to the
+    // negative of the RHS, but for performance we just put the negative sign elsewhere, see below.
+    m.topLeftCorner(rank(), rank())
+     .template triangularView<Upper>().solveInPlace(
+        m.topRightCorner(rank(), dimker)
+      );
+
+    // now we must undo the column permutation that we had applied!
+    for(Index i = rank()-1; i >= 0; --i)
+      m.col(i).swap(m.col(pivots.coeff(i)));
+
+    // see the negative sign in the next line, that's what we were talking about above.
+    for(Index i = 0; i < rank(); ++i) dst.row(dec().permutationQ().indices().coeff(i)) = -m.row(i).tail(dimker);
+    for(Index i = rank(); i < cols; ++i) dst.row(dec().permutationQ().indices().coeff(i)).setZero();
+    for(Index k = 0; k < dimker; ++k) dst.coeffRef(dec().permutationQ().indices().coeff(rank()+k), k) = Scalar(1);
+  }
+};
+
+/***** Implementation of image() *****************************************************/
+
+template<typename _MatrixType>
+struct image_retval<FullPivLU<_MatrixType> >
+  : image_retval_base<FullPivLU<_MatrixType> >
+{
+  EIGEN_MAKE_IMAGE_HELPERS(FullPivLU<_MatrixType>)
+
+  enum { MaxSmallDimAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(
+            MatrixType::MaxColsAtCompileTime,
+            MatrixType::MaxRowsAtCompileTime)
+  };
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    using std::abs;
+    if(rank() == 0)
+    {
+      // The Image is just {0}, so it doesn't have a basis properly speaking, but let's
+      // avoid crashing/asserting as that depends on floating point calculations. Let's
+      // just return a single column vector filled with zeros.
+      dst.setZero();
+      return;
+    }
+
+    Matrix<Index, Dynamic, 1, 0, MaxSmallDimAtCompileTime, 1> pivots(rank());
+    RealScalar premultiplied_threshold = dec().maxPivot() * dec().threshold();
+    Index p = 0;
+    for(Index i = 0; i < dec().nonzeroPivots(); ++i)
+      if(abs(dec().matrixLU().coeff(i,i)) > premultiplied_threshold)
+        pivots.coeffRef(p++) = i;
+    eigen_internal_assert(p == rank());
+
+    for(Index i = 0; i < rank(); ++i)
+      dst.col(i) = originalMatrix().col(dec().permutationQ().indices().coeff(pivots.coeff(i)));
+  }
+};
+
+/***** Implementation of solve() *****************************************************/
+
+template<typename _MatrixType, typename Rhs>
+struct solve_retval<FullPivLU<_MatrixType>, Rhs>
+  : solve_retval_base<FullPivLU<_MatrixType>, Rhs>
+{
+  EIGEN_MAKE_SOLVE_HELPERS(FullPivLU<_MatrixType>,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}.
+     * So we proceed as follows:
+     * Step 1: compute c = P * rhs.
+     * Step 2: replace c by the solution x to Lx = c. Exists because L is invertible.
+     * Step 3: replace c by the solution x to Ux = c. May or may not exist.
+     * Step 4: result = Q * c;
+     */
+
+    const Index rows = dec().rows(), cols = dec().cols(),
+              nonzero_pivots = dec().nonzeroPivots();
+    eigen_assert(rhs().rows() == rows);
+    const Index smalldim = (std::min)(rows, cols);
+
+    if(nonzero_pivots == 0)
+    {
+      dst.setZero();
+      return;
+    }
+
+    typename Rhs::PlainObject c(rhs().rows(), rhs().cols());
+
+    // Step 1
+    c = dec().permutationP() * rhs();
+
+    // Step 2
+    dec().matrixLU()
+        .topLeftCorner(smalldim,smalldim)
+        .template triangularView<UnitLower>()
+        .solveInPlace(c.topRows(smalldim));
+    if(rows>cols)
+    {
+      c.bottomRows(rows-cols)
+        -= dec().matrixLU().bottomRows(rows-cols)
+         * c.topRows(cols);
+    }
+
+    // Step 3
+    dec().matrixLU()
+        .topLeftCorner(nonzero_pivots, nonzero_pivots)
+        .template triangularView<Upper>()
+        .solveInPlace(c.topRows(nonzero_pivots));
+
+    // Step 4
+    for(Index i = 0; i < nonzero_pivots; ++i)
+      dst.row(dec().permutationQ().indices().coeff(i)) = c.row(i);
+    for(Index i = nonzero_pivots; i < dec().matrixLU().cols(); ++i)
+      dst.row(dec().permutationQ().indices().coeff(i)).setZero();
+  }
+};
+
+} // end namespace internal
+
+/******* MatrixBase methods *****************************************************************/
+
+/** \lu_module
+  *
+  * \return the full-pivoting LU decomposition of \c *this.
+  *
+  * \sa class FullPivLU
+  */
+template<typename Derived>
+inline const FullPivLU<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::fullPivLu() const
+{
+  return FullPivLU<PlainObject>(eval());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_LU_H
diff --git a/include/eigen3/Eigen/src/LU/Inverse.h b/include/eigen3/Eigen/src/LU/Inverse.h
new file mode 100644
index 0000000..3cf8871
--- /dev/null
+++ b/include/eigen3/Eigen/src/LU/Inverse.h
@@ -0,0 +1,400 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INVERSE_H
+#define EIGEN_INVERSE_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**********************************
+*** General case implementation ***
+**********************************/
+
+template<typename MatrixType, typename ResultType, int Size = MatrixType::RowsAtCompileTime>
+struct compute_inverse
+{
+  static inline void run(const MatrixType& matrix, ResultType& result)
+  {
+    result = matrix.partialPivLu().inverse();
+  }
+};
+
+template<typename MatrixType, typename ResultType, int Size = MatrixType::RowsAtCompileTime>
+struct compute_inverse_and_det_with_check { /* nothing! general case not supported. */ };
+
+/****************************
+*** Size 1 implementation ***
+****************************/
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse<MatrixType, ResultType, 1>
+{
+  static inline void run(const MatrixType& matrix, ResultType& result)
+  {
+    typedef typename MatrixType::Scalar Scalar;
+    result.coeffRef(0,0) = Scalar(1) / matrix.coeff(0,0);
+  }
+};
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse_and_det_with_check<MatrixType, ResultType, 1>
+{
+  static inline void run(
+    const MatrixType& matrix,
+    const typename MatrixType::RealScalar& absDeterminantThreshold,
+    ResultType& result,
+    typename ResultType::Scalar& determinant,
+    bool& invertible
+  )
+  {
+    using std::abs;
+    determinant = matrix.coeff(0,0);
+    invertible = abs(determinant) > absDeterminantThreshold;
+    if(invertible) result.coeffRef(0,0) = typename ResultType::Scalar(1) / determinant;
+  }
+};
+
+/****************************
+*** Size 2 implementation ***
+****************************/
+
+template<typename MatrixType, typename ResultType>
+inline void compute_inverse_size2_helper(
+    const MatrixType& matrix, const typename ResultType::Scalar& invdet,
+    ResultType& result)
+{
+  result.coeffRef(0,0) = matrix.coeff(1,1) * invdet;
+  result.coeffRef(1,0) = -matrix.coeff(1,0) * invdet;
+  result.coeffRef(0,1) = -matrix.coeff(0,1) * invdet;
+  result.coeffRef(1,1) = matrix.coeff(0,0) * invdet;
+}
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse<MatrixType, ResultType, 2>
+{
+  static inline void run(const MatrixType& matrix, ResultType& result)
+  {
+    typedef typename ResultType::Scalar Scalar;
+    const Scalar invdet = typename MatrixType::Scalar(1) / matrix.determinant();
+    compute_inverse_size2_helper(matrix, invdet, result);
+  }
+};
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse_and_det_with_check<MatrixType, ResultType, 2>
+{
+  static inline void run(
+    const MatrixType& matrix,
+    const typename MatrixType::RealScalar& absDeterminantThreshold,
+    ResultType& inverse,
+    typename ResultType::Scalar& determinant,
+    bool& invertible
+  )
+  {
+    using std::abs;
+    typedef typename ResultType::Scalar Scalar;
+    determinant = matrix.determinant();
+    invertible = abs(determinant) > absDeterminantThreshold;
+    if(!invertible) return;
+    const Scalar invdet = Scalar(1) / determinant;
+    compute_inverse_size2_helper(matrix, invdet, inverse);
+  }
+};
+
+/****************************
+*** Size 3 implementation ***
+****************************/
+
+template<typename MatrixType, int i, int j>
+inline typename MatrixType::Scalar cofactor_3x3(const MatrixType& m)
+{
+  enum {
+    i1 = (i+1) % 3,
+    i2 = (i+2) % 3,
+    j1 = (j+1) % 3,
+    j2 = (j+2) % 3
+  };
+  return m.coeff(i1, j1) * m.coeff(i2, j2)
+       - m.coeff(i1, j2) * m.coeff(i2, j1);
+}
+
+template<typename MatrixType, typename ResultType>
+inline void compute_inverse_size3_helper(
+    const MatrixType& matrix,
+    const typename ResultType::Scalar& invdet,
+    const Matrix<typename ResultType::Scalar,3,1>& cofactors_col0,
+    ResultType& result)
+{
+  result.row(0) = cofactors_col0 * invdet;
+  result.coeffRef(1,0) =  cofactor_3x3<MatrixType,0,1>(matrix) * invdet;
+  result.coeffRef(1,1) =  cofactor_3x3<MatrixType,1,1>(matrix) * invdet;
+  result.coeffRef(1,2) =  cofactor_3x3<MatrixType,2,1>(matrix) * invdet;
+  result.coeffRef(2,0) =  cofactor_3x3<MatrixType,0,2>(matrix) * invdet;
+  result.coeffRef(2,1) =  cofactor_3x3<MatrixType,1,2>(matrix) * invdet;
+  result.coeffRef(2,2) =  cofactor_3x3<MatrixType,2,2>(matrix) * invdet;
+}
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse<MatrixType, ResultType, 3>
+{
+  static inline void run(const MatrixType& matrix, ResultType& result)
+  {
+    typedef typename ResultType::Scalar Scalar;
+    Matrix<typename MatrixType::Scalar,3,1> cofactors_col0;
+    cofactors_col0.coeffRef(0) =  cofactor_3x3<MatrixType,0,0>(matrix);
+    cofactors_col0.coeffRef(1) =  cofactor_3x3<MatrixType,1,0>(matrix);
+    cofactors_col0.coeffRef(2) =  cofactor_3x3<MatrixType,2,0>(matrix);
+    const Scalar det = (cofactors_col0.cwiseProduct(matrix.col(0))).sum();
+    const Scalar invdet = Scalar(1) / det;
+    compute_inverse_size3_helper(matrix, invdet, cofactors_col0, result);
+  }
+};
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse_and_det_with_check<MatrixType, ResultType, 3>
+{
+  static inline void run(
+    const MatrixType& matrix,
+    const typename MatrixType::RealScalar& absDeterminantThreshold,
+    ResultType& inverse,
+    typename ResultType::Scalar& determinant,
+    bool& invertible
+  )
+  {
+    using std::abs;
+    typedef typename ResultType::Scalar Scalar;
+    Matrix<Scalar,3,1> cofactors_col0;
+    cofactors_col0.coeffRef(0) =  cofactor_3x3<MatrixType,0,0>(matrix);
+    cofactors_col0.coeffRef(1) =  cofactor_3x3<MatrixType,1,0>(matrix);
+    cofactors_col0.coeffRef(2) =  cofactor_3x3<MatrixType,2,0>(matrix);
+    determinant = (cofactors_col0.cwiseProduct(matrix.col(0))).sum();
+    invertible = abs(determinant) > absDeterminantThreshold;
+    if(!invertible) return;
+    const Scalar invdet = Scalar(1) / determinant;
+    compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse);
+  }
+};
+
+/****************************
+*** Size 4 implementation ***
+****************************/
+
+template<typename Derived>
+inline const typename Derived::Scalar general_det3_helper
+(const MatrixBase<Derived>& matrix, int i1, int i2, int i3, int j1, int j2, int j3)
+{
+  return matrix.coeff(i1,j1)
+         * (matrix.coeff(i2,j2) * matrix.coeff(i3,j3) - matrix.coeff(i2,j3) * matrix.coeff(i3,j2));
+}
+
+template<typename MatrixType, int i, int j>
+inline typename MatrixType::Scalar cofactor_4x4(const MatrixType& matrix)
+{
+  enum {
+    i1 = (i+1) % 4,
+    i2 = (i+2) % 4,
+    i3 = (i+3) % 4,
+    j1 = (j+1) % 4,
+    j2 = (j+2) % 4,
+    j3 = (j+3) % 4
+  };
+  return general_det3_helper(matrix, i1, i2, i3, j1, j2, j3)
+       + general_det3_helper(matrix, i2, i3, i1, j1, j2, j3)
+       + general_det3_helper(matrix, i3, i1, i2, j1, j2, j3);
+}
+
+template<int Arch, typename Scalar, typename MatrixType, typename ResultType>
+struct compute_inverse_size4
+{
+  static void run(const MatrixType& matrix, ResultType& result)
+  {
+    result.coeffRef(0,0) =  cofactor_4x4<MatrixType,0,0>(matrix);
+    result.coeffRef(1,0) = -cofactor_4x4<MatrixType,0,1>(matrix);
+    result.coeffRef(2,0) =  cofactor_4x4<MatrixType,0,2>(matrix);
+    result.coeffRef(3,0) = -cofactor_4x4<MatrixType,0,3>(matrix);
+    result.coeffRef(0,2) =  cofactor_4x4<MatrixType,2,0>(matrix);
+    result.coeffRef(1,2) = -cofactor_4x4<MatrixType,2,1>(matrix);
+    result.coeffRef(2,2) =  cofactor_4x4<MatrixType,2,2>(matrix);
+    result.coeffRef(3,2) = -cofactor_4x4<MatrixType,2,3>(matrix);
+    result.coeffRef(0,1) = -cofactor_4x4<MatrixType,1,0>(matrix);
+    result.coeffRef(1,1) =  cofactor_4x4<MatrixType,1,1>(matrix);
+    result.coeffRef(2,1) = -cofactor_4x4<MatrixType,1,2>(matrix);
+    result.coeffRef(3,1) =  cofactor_4x4<MatrixType,1,3>(matrix);
+    result.coeffRef(0,3) = -cofactor_4x4<MatrixType,3,0>(matrix);
+    result.coeffRef(1,3) =  cofactor_4x4<MatrixType,3,1>(matrix);
+    result.coeffRef(2,3) = -cofactor_4x4<MatrixType,3,2>(matrix);
+    result.coeffRef(3,3) =  cofactor_4x4<MatrixType,3,3>(matrix);
+    result /= (matrix.col(0).cwiseProduct(result.row(0).transpose())).sum();
+  }
+};
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse<MatrixType, ResultType, 4>
+ : compute_inverse_size4<Architecture::Target, typename MatrixType::Scalar,
+                            MatrixType, ResultType>
+{
+};
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse_and_det_with_check<MatrixType, ResultType, 4>
+{
+  static inline void run(
+    const MatrixType& matrix,
+    const typename MatrixType::RealScalar& absDeterminantThreshold,
+    ResultType& inverse,
+    typename ResultType::Scalar& determinant,
+    bool& invertible
+  )
+  {
+    using std::abs;
+    determinant = matrix.determinant();
+    invertible = abs(determinant) > absDeterminantThreshold;
+    if(invertible) compute_inverse<MatrixType, ResultType>::run(matrix, inverse);
+  }
+};
+
+/*************************
+*** MatrixBase methods ***
+*************************/
+
+template<typename MatrixType>
+struct traits<inverse_impl<MatrixType> >
+{
+  typedef typename MatrixType::PlainObject ReturnType;
+};
+
+template<typename MatrixType>
+struct inverse_impl : public ReturnByValue<inverse_impl<MatrixType> >
+{
+  typedef typename MatrixType::Index Index;
+  typedef typename internal::eval<MatrixType>::type MatrixTypeNested;
+  typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
+  MatrixTypeNested m_matrix;
+
+  inverse_impl(const MatrixType& matrix)
+    : m_matrix(matrix)
+  {}
+
+  inline Index rows() const { return m_matrix.rows(); }
+  inline Index cols() const { return m_matrix.cols(); }
+
+  template<typename Dest> inline void evalTo(Dest& dst) const
+  {
+    const int Size = EIGEN_PLAIN_ENUM_MIN(MatrixType::ColsAtCompileTime,Dest::ColsAtCompileTime);
+    EIGEN_ONLY_USED_FOR_DEBUG(Size);
+    eigen_assert(( (Size<=1) || (Size>4) || (extract_data(m_matrix)!=extract_data(dst)))
+              && "Aliasing problem detected in inverse(), you need to do inverse().eval() here.");
+
+    compute_inverse<MatrixTypeNestedCleaned, Dest>::run(m_matrix, dst);
+  }
+};
+
+} // end namespace internal
+
+/** \lu_module
+  *
+  * \returns the matrix inverse of this matrix.
+  *
+  * For small fixed sizes up to 4x4, this method uses cofactors.
+  * In the general case, this method uses class PartialPivLU.
+  *
+  * \note This matrix must be invertible, otherwise the result is undefined. If you need an
+  * invertibility check, do the following:
+  * \li for fixed sizes up to 4x4, use computeInverseAndDetWithCheck().
+  * \li for the general case, use class FullPivLU.
+  *
+  * Example: \include MatrixBase_inverse.cpp
+  * Output: \verbinclude MatrixBase_inverse.out
+  *
+  * \sa computeInverseAndDetWithCheck()
+  */
+template<typename Derived>
+inline const internal::inverse_impl<Derived> MatrixBase<Derived>::inverse() const
+{
+  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
+  eigen_assert(rows() == cols());
+  return internal::inverse_impl<Derived>(derived());
+}
+
+/** \lu_module
+  *
+  * Computation of matrix inverse and determinant, with invertibility check.
+  *
+  * This is only for fixed-size square matrices of size up to 4x4.
+  *
+  * \param inverse Reference to the matrix in which to store the inverse.
+  * \param determinant Reference to the variable in which to store the determinant.
+  * \param invertible Reference to the bool variable in which to store whether the matrix is invertible.
+  * \param absDeterminantThreshold Optional parameter controlling the invertibility check.
+  *                                The matrix will be declared invertible if the absolute value of its
+  *                                determinant is greater than this threshold.
+  *
+  * Example: \include MatrixBase_computeInverseAndDetWithCheck.cpp
+  * Output: \verbinclude MatrixBase_computeInverseAndDetWithCheck.out
+  *
+  * \sa inverse(), computeInverseWithCheck()
+  */
+template<typename Derived>
+template<typename ResultType>
+inline void MatrixBase<Derived>::computeInverseAndDetWithCheck(
+    ResultType& inverse,
+    typename ResultType::Scalar& determinant,
+    bool& invertible,
+    const RealScalar& absDeterminantThreshold
+  ) const
+{
+  // i'd love to put some static assertions there, but SFINAE means that they have no effect...
+  eigen_assert(rows() == cols());
+  // for 2x2, it's worth giving a chance to avoid evaluating.
+  // for larger sizes, evaluating has negligible cost and limits code size.
+  typedef typename internal::conditional<
+    RowsAtCompileTime == 2,
+    typename internal::remove_all<typename internal::nested<Derived, 2>::type>::type,
+    PlainObject
+  >::type MatrixType;
+  internal::compute_inverse_and_det_with_check<MatrixType, ResultType>::run
+    (derived(), absDeterminantThreshold, inverse, determinant, invertible);
+}
+
+/** \lu_module
+  *
+  * Computation of matrix inverse, with invertibility check.
+  *
+  * This is only for fixed-size square matrices of size up to 4x4.
+  *
+  * \param inverse Reference to the matrix in which to store the inverse.
+  * \param invertible Reference to the bool variable in which to store whether the matrix is invertible.
+  * \param absDeterminantThreshold Optional parameter controlling the invertibility check.
+  *                                The matrix will be declared invertible if the absolute value of its
+  *                                determinant is greater than this threshold.
+  *
+  * Example: \include MatrixBase_computeInverseWithCheck.cpp
+  * Output: \verbinclude MatrixBase_computeInverseWithCheck.out
+  *
+  * \sa inverse(), computeInverseAndDetWithCheck()
+  */
+template<typename Derived>
+template<typename ResultType>
+inline void MatrixBase<Derived>::computeInverseWithCheck(
+    ResultType& inverse,
+    bool& invertible,
+    const RealScalar& absDeterminantThreshold
+  ) const
+{
+  RealScalar determinant;
+  // i'd love to put some static assertions there, but SFINAE means that they have no effect...
+  eigen_assert(rows() == cols());
+  computeInverseAndDetWithCheck(inverse,determinant,invertible,absDeterminantThreshold);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_INVERSE_H
diff --git a/include/eigen3/Eigen/src/LU/PartialPivLU.h b/include/eigen3/Eigen/src/LU/PartialPivLU.h
new file mode 100644
index 0000000..740ee69
--- /dev/null
+++ b/include/eigen3/Eigen/src/LU/PartialPivLU.h
@@ -0,0 +1,501 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARTIALLU_H
+#define EIGEN_PARTIALLU_H
+
+namespace Eigen { 
+
+/** \ingroup LU_Module
+  *
+  * \class PartialPivLU
+  *
+  * \brief LU decomposition of a matrix with partial pivoting, and related features
+  *
+  * \param MatrixType the type of the matrix of which we are computing the LU decomposition
+  *
+  * This class represents a LU decomposition of a \b square \b invertible matrix, with partial pivoting: the matrix A
+  * is decomposed as A = PLU where L is unit-lower-triangular, U is upper-triangular, and P
+  * is a permutation matrix.
+  *
+  * Typically, partial pivoting LU decomposition is only considered numerically stable for square invertible
+  * matrices. Thus LAPACK's dgesv and dgesvx require the matrix to be square and invertible. The present class
+  * does the same. It will assert that the matrix is square, but it won't (actually it can't) check that the
+  * matrix is invertible: it is your task to check that you only use this decomposition on invertible matrices.
+  *
+  * The guaranteed safe alternative, working for all matrices, is the full pivoting LU decomposition, provided
+  * by class FullPivLU.
+  *
+  * This is \b not a rank-revealing LU decomposition. Many features are intentionally absent from this class,
+  * such as rank computation. If you need these features, use class FullPivLU.
+  *
+  * This LU decomposition is suitable to invert invertible matrices. It is what MatrixBase::inverse() uses
+  * in the general case.
+  * On the other hand, it is \b not suitable to determine whether a given matrix is invertible.
+  *
+  * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP().
+  *
+  * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU
+  */
+template<typename _MatrixType> class PartialPivLU
+{
+  public:
+
+    typedef _MatrixType MatrixType;
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      Options = MatrixType::Options,
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+    typedef typename internal::traits<MatrixType>::StorageKind StorageKind;
+    typedef typename MatrixType::Index Index;
+    typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationType;
+    typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
+
+
+    /**
+    * \brief Default Constructor.
+    *
+    * The default constructor is useful in cases in which the user intends to
+    * perform decompositions via PartialPivLU::compute(const MatrixType&).
+    */
+    PartialPivLU();
+
+    /** \brief Default Constructor with memory preallocation
+      *
+      * Like the default constructor but with preallocation of the internal data
+      * according to the specified problem \a size.
+      * \sa PartialPivLU()
+      */
+    PartialPivLU(Index size);
+
+    /** Constructor.
+      *
+      * \param matrix the matrix of which to compute the LU decomposition.
+      *
+      * \warning The matrix should have full rank (e.g. if it's square, it should be invertible).
+      * If you need to deal with non-full rank, use class FullPivLU instead.
+      */
+    PartialPivLU(const MatrixType& matrix);
+
+    PartialPivLU& compute(const MatrixType& matrix);
+
+    /** \returns the LU decomposition matrix: the upper-triangular part is U, the
+      * unit-lower-triangular part is L (at least for square matrices; in the non-square
+      * case, special care is needed, see the documentation of class FullPivLU).
+      *
+      * \sa matrixL(), matrixU()
+      */
+    inline const MatrixType& matrixLU() const
+    {
+      eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+      return m_lu;
+    }
+
+    /** \returns the permutation matrix P.
+      */
+    inline const PermutationType& permutationP() const
+    {
+      eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+      return m_p;
+    }
+
+    /** This method returns the solution x to the equation Ax=b, where A is the matrix of which
+      * *this is the LU decomposition.
+      *
+      * \param b the right-hand-side of the equation to solve. Can be a vector or a matrix,
+      *          the only requirement in order for the equation to make sense is that
+      *          b.rows()==A.rows(), where A is the matrix of which *this is the LU decomposition.
+      *
+      * \returns the solution.
+      *
+      * Example: \include PartialPivLU_solve.cpp
+      * Output: \verbinclude PartialPivLU_solve.out
+      *
+      * Since this PartialPivLU class assumes anyway that the matrix A is invertible, the solution
+      * theoretically exists and is unique regardless of b.
+      *
+      * \sa TriangularView::solve(), inverse(), computeInverse()
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<PartialPivLU, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+      return internal::solve_retval<PartialPivLU, Rhs>(*this, b.derived());
+    }
+
+    /** \returns the inverse of the matrix of which *this is the LU decomposition.
+      *
+      * \warning The matrix being decomposed here is assumed to be invertible. If you need to check for
+      *          invertibility, use class FullPivLU instead.
+      *
+      * \sa MatrixBase::inverse(), LU::inverse()
+      */
+    inline const internal::solve_retval<PartialPivLU,typename MatrixType::IdentityReturnType> inverse() const
+    {
+      eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+      return internal::solve_retval<PartialPivLU,typename MatrixType::IdentityReturnType>
+               (*this, MatrixType::Identity(m_lu.rows(), m_lu.cols()));
+    }
+
+    /** \returns the determinant of the matrix of which
+      * *this is the LU decomposition. It has only linear complexity
+      * (that is, O(n) where n is the dimension of the square matrix)
+      * as the LU decomposition has already been computed.
+      *
+      * \note For fixed-size matrices of size up to 4, MatrixBase::determinant() offers
+      *       optimized paths.
+      *
+      * \warning a determinant can be very big or small, so for matrices
+      * of large enough dimension, there is a risk of overflow/underflow.
+      *
+      * \sa MatrixBase::determinant()
+      */
+    typename internal::traits<MatrixType>::Scalar determinant() const;
+
+    MatrixType reconstructedMatrix() const;
+
+    inline Index rows() const { return m_lu.rows(); }
+    inline Index cols() const { return m_lu.cols(); }
+
+  protected:
+    MatrixType m_lu;
+    PermutationType m_p;
+    TranspositionType m_rowsTranspositions;
+    Index m_det_p;
+    bool m_isInitialized;
+};
+
+template<typename MatrixType>
+PartialPivLU<MatrixType>::PartialPivLU()
+  : m_lu(),
+    m_p(),
+    m_rowsTranspositions(),
+    m_det_p(0),
+    m_isInitialized(false)
+{
+}
+
+template<typename MatrixType>
+PartialPivLU<MatrixType>::PartialPivLU(Index size)
+  : m_lu(size, size),
+    m_p(size),
+    m_rowsTranspositions(size),
+    m_det_p(0),
+    m_isInitialized(false)
+{
+}
+
+template<typename MatrixType>
+PartialPivLU<MatrixType>::PartialPivLU(const MatrixType& matrix)
+  : m_lu(matrix.rows(), matrix.rows()),
+    m_p(matrix.rows()),
+    m_rowsTranspositions(matrix.rows()),
+    m_det_p(0),
+    m_isInitialized(false)
+{
+  compute(matrix);
+}
+
+namespace internal {
+
+/** \internal This is the blocked version of fullpivlu_unblocked() */
+template<typename Scalar, int StorageOrder, typename PivIndex>
+struct partial_lu_impl
+{
+  // FIXME add a stride to Map, so that the following mapping becomes easier,
+  // another option would be to create an expression being able to automatically
+  // warp any Map, Matrix, and Block expressions as a unique type, but since that's exactly
+  // a Map + stride, why not adding a stride to Map, and convenient ctors from a Matrix,
+  // and Block.
+  typedef Map<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > MapLU;
+  typedef Block<MapLU, Dynamic, Dynamic> MatrixType;
+  typedef Block<MatrixType,Dynamic,Dynamic> BlockType;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::Index Index;
+
+  /** \internal performs the LU decomposition in-place of the matrix \a lu
+    * using an unblocked algorithm.
+    *
+    * In addition, this function returns the row transpositions in the
+    * vector \a row_transpositions which must have a size equal to the number
+    * of columns of the matrix \a lu, and an integer \a nb_transpositions
+    * which returns the actual number of transpositions.
+    *
+    * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
+    */
+  static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
+  {
+    const Index rows = lu.rows();
+    const Index cols = lu.cols();
+    const Index size = (std::min)(rows,cols);
+    nb_transpositions = 0;
+    Index first_zero_pivot = -1;
+    for(Index k = 0; k < size; ++k)
+    {
+      Index rrows = rows-k-1;
+      Index rcols = cols-k-1;
+        
+      Index row_of_biggest_in_col;
+      RealScalar biggest_in_corner
+        = lu.col(k).tail(rows-k).cwiseAbs().maxCoeff(&row_of_biggest_in_col);
+      row_of_biggest_in_col += k;
+
+      row_transpositions[k] = PivIndex(row_of_biggest_in_col);
+
+      if(biggest_in_corner != RealScalar(0))
+      {
+        if(k != row_of_biggest_in_col)
+        {
+          lu.row(k).swap(lu.row(row_of_biggest_in_col));
+          ++nb_transpositions;
+        }
+
+        // FIXME shall we introduce a safe quotient expression in cas 1/lu.coeff(k,k)
+        // overflow but not the actual quotient?
+        lu.col(k).tail(rrows) /= lu.coeff(k,k);
+      }
+      else if(first_zero_pivot==-1)
+      {
+        // the pivot is exactly zero, we record the index of the first pivot which is exactly 0,
+        // and continue the factorization such we still have A = PLU
+        first_zero_pivot = k;
+      }
+
+      if(k<rows-1)
+        lu.bottomRightCorner(rrows,rcols).noalias() -= lu.col(k).tail(rrows) * lu.row(k).tail(rcols);
+    }
+    return first_zero_pivot;
+  }
+
+  /** \internal performs the LU decomposition in-place of the matrix represented
+    * by the variables \a rows, \a cols, \a lu_data, and \a lu_stride using a
+    * recursive, blocked algorithm.
+    *
+    * In addition, this function returns the row transpositions in the
+    * vector \a row_transpositions which must have a size equal to the number
+    * of columns of the matrix \a lu, and an integer \a nb_transpositions
+    * which returns the actual number of transpositions.
+    *
+    * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
+    *
+    * \note This very low level interface using pointers, etc. is to:
+    *   1 - reduce the number of instanciations to the strict minimum
+    *   2 - avoid infinite recursion of the instanciations with Block<Block<Block<...> > >
+    */
+  static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256)
+  {
+    MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols);
+    MatrixType lu(lu1,0,0,rows,cols);
+
+    const Index size = (std::min)(rows,cols);
+
+    // if the matrix is too small, no blocking:
+    if(size<=16)
+    {
+      return unblocked_lu(lu, row_transpositions, nb_transpositions);
+    }
+
+    // automatically adjust the number of subdivisions to the size
+    // of the matrix so that there is enough sub blocks:
+    Index blockSize;
+    {
+      blockSize = size/8;
+      blockSize = (blockSize/16)*16;
+      blockSize = (std::min)((std::max)(blockSize,Index(8)), maxBlockSize);
+    }
+
+    nb_transpositions = 0;
+    Index first_zero_pivot = -1;
+    for(Index k = 0; k < size; k+=blockSize)
+    {
+      Index bs = (std::min)(size-k,blockSize); // actual size of the block
+      Index trows = rows - k - bs; // trailing rows
+      Index tsize = size - k - bs; // trailing size
+
+      // partition the matrix:
+      //                          A00 | A01 | A02
+      // lu  = A_0 | A_1 | A_2 =  A10 | A11 | A12
+      //                          A20 | A21 | A22
+      BlockType A_0(lu,0,0,rows,k);
+      BlockType A_2(lu,0,k+bs,rows,tsize);
+      BlockType A11(lu,k,k,bs,bs);
+      BlockType A12(lu,k,k+bs,bs,tsize);
+      BlockType A21(lu,k+bs,k,trows,bs);
+      BlockType A22(lu,k+bs,k+bs,trows,tsize);
+
+      PivIndex nb_transpositions_in_panel;
+      // recursively call the blocked LU algorithm on [A11^T A21^T]^T
+      // with a very small blocking size:
+      Index ret = blocked_lu(trows+bs, bs, &lu.coeffRef(k,k), luStride,
+                   row_transpositions+k, nb_transpositions_in_panel, 16);
+      if(ret>=0 && first_zero_pivot==-1)
+        first_zero_pivot = k+ret;
+
+      nb_transpositions += nb_transpositions_in_panel;
+      // update permutations and apply them to A_0
+      for(Index i=k; i<k+bs; ++i)
+      {
+        Index piv = (row_transpositions[i] += k);
+        A_0.row(i).swap(A_0.row(piv));
+      }
+
+      if(trows)
+      {
+        // apply permutations to A_2
+        for(Index i=k;i<k+bs; ++i)
+          A_2.row(i).swap(A_2.row(row_transpositions[i]));
+
+        // A12 = A11^-1 A12
+        A11.template triangularView<UnitLower>().solveInPlace(A12);
+
+        A22.noalias() -= A21 * A12;
+      }
+    }
+    return first_zero_pivot;
+  }
+};
+
+/** \internal performs the LU decomposition with partial pivoting in-place.
+  */
+template<typename MatrixType, typename TranspositionType>
+void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::Index& nb_transpositions)
+{
+  eigen_assert(lu.cols() == row_transpositions.size());
+  eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1);
+
+  partial_lu_impl
+    <typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, typename TranspositionType::Index>
+    ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions);
+}
+
+} // end namespace internal
+
+template<typename MatrixType>
+PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const MatrixType& matrix)
+{
+  // the row permutation is stored as int indices, so just to be sure:
+  eigen_assert(matrix.rows()<NumTraits<int>::highest());
+  
+  m_lu = matrix;
+
+  eigen_assert(matrix.rows() == matrix.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
+  const Index size = matrix.rows();
+
+  m_rowsTranspositions.resize(size);
+
+  typename TranspositionType::Index nb_transpositions;
+  internal::partial_lu_inplace(m_lu, m_rowsTranspositions, nb_transpositions);
+  m_det_p = (nb_transpositions%2) ? -1 : 1;
+
+  m_p = m_rowsTranspositions;
+
+  m_isInitialized = true;
+  return *this;
+}
+
+template<typename MatrixType>
+typename internal::traits<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() const
+{
+  eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+  return Scalar(m_det_p) * m_lu.diagonal().prod();
+}
+
+/** \returns the matrix represented by the decomposition,
+ * i.e., it returns the product: P^{-1} L U.
+ * This function is provided for debug purpose. */
+template<typename MatrixType>
+MatrixType PartialPivLU<MatrixType>::reconstructedMatrix() const
+{
+  eigen_assert(m_isInitialized && "LU is not initialized.");
+  // LU
+  MatrixType res = m_lu.template triangularView<UnitLower>().toDenseMatrix()
+                 * m_lu.template triangularView<Upper>();
+
+  // P^{-1}(LU)
+  res = m_p.inverse() * res;
+
+  return res;
+}
+
+/***** Implementation of solve() *****************************************************/
+
+namespace internal {
+
+template<typename _MatrixType, typename Rhs>
+struct solve_retval<PartialPivLU<_MatrixType>, Rhs>
+  : solve_retval_base<PartialPivLU<_MatrixType>, Rhs>
+{
+  EIGEN_MAKE_SOLVE_HELPERS(PartialPivLU<_MatrixType>,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
+    * So we proceed as follows:
+    * Step 1: compute c = Pb.
+    * Step 2: replace c by the solution x to Lx = c.
+    * Step 3: replace c by the solution x to Ux = c.
+    */
+
+    eigen_assert(rhs().rows() == dec().matrixLU().rows());
+
+    // Step 1
+    dst = dec().permutationP() * rhs();
+
+    // Step 2
+    dec().matrixLU().template triangularView<UnitLower>().solveInPlace(dst);
+
+    // Step 3
+    dec().matrixLU().template triangularView<Upper>().solveInPlace(dst);
+  }
+};
+
+} // end namespace internal
+
+/******** MatrixBase methods *******/
+
+/** \lu_module
+  *
+  * \return the partial-pivoting LU decomposition of \c *this.
+  *
+  * \sa class PartialPivLU
+  */
+template<typename Derived>
+inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::partialPivLu() const
+{
+  return PartialPivLU<PlainObject>(eval());
+}
+
+#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
+/** \lu_module
+  *
+  * Synonym of partialPivLu().
+  *
+  * \return the partial-pivoting LU decomposition of \c *this.
+  *
+  * \sa class PartialPivLU
+  */
+template<typename Derived>
+inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::lu() const
+{
+  return PartialPivLU<PlainObject>(eval());
+}
+#endif
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARTIALLU_H
diff --git a/include/eigen3/Eigen/src/LU/PartialPivLU_MKL.h b/include/eigen3/Eigen/src/LU/PartialPivLU_MKL.h
new file mode 100644
index 0000000..9035953
--- /dev/null
+++ b/include/eigen3/Eigen/src/LU/PartialPivLU_MKL.h
@@ -0,0 +1,85 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *     LU decomposition with partial pivoting based on LAPACKE_?getrf function.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_PARTIALLU_LAPACK_H
+#define EIGEN_PARTIALLU_LAPACK_H
+
+#include "Eigen/src/Core/util/MKL_support.h"
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal Specialization for the data types supported by MKL */
+
+#define EIGEN_MKL_LU_PARTPIV(EIGTYPE, MKLTYPE, MKLPREFIX) \
+template<int StorageOrder> \
+struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \
+{ \
+  /* \internal performs the LU decomposition in-place of the matrix represented */ \
+  static lapack_int blocked_lu(lapack_int rows, lapack_int cols, EIGTYPE* lu_data, lapack_int luStride, lapack_int* row_transpositions, lapack_int& nb_transpositions, lapack_int maxBlockSize=256) \
+  { \
+    EIGEN_UNUSED_VARIABLE(maxBlockSize);\
+    lapack_int matrix_order, first_zero_pivot; \
+    lapack_int m, n, lda, *ipiv, info; \
+    EIGTYPE* a; \
+/* Set up parameters for ?getrf */ \
+    matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
+    lda = luStride; \
+    a = lu_data; \
+    ipiv = row_transpositions; \
+    m = rows; \
+    n = cols; \
+    nb_transpositions = 0; \
+\
+    info = LAPACKE_##MKLPREFIX##getrf( matrix_order, m, n, (MKLTYPE*)a, lda, ipiv ); \
+\
+    for(int i=0;i<m;i++) { ipiv[i]--; if (ipiv[i]!=i) nb_transpositions++; } \
+\
+    eigen_assert(info >= 0); \
+/* something should be done with nb_transpositions */ \
+\
+    first_zero_pivot = info; \
+    return first_zero_pivot; \
+  } \
+};
+
+EIGEN_MKL_LU_PARTPIV(double, double, d)
+EIGEN_MKL_LU_PARTPIV(float, float, s)
+EIGEN_MKL_LU_PARTPIV(dcomplex, MKL_Complex16, z)
+EIGEN_MKL_LU_PARTPIV(scomplex, MKL_Complex8, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARTIALLU_LAPACK_H
diff --git a/include/eigen3/Eigen/src/LU/arch/Inverse_SSE.h b/include/eigen3/Eigen/src/LU/arch/Inverse_SSE.h
new file mode 100644
index 0000000..60b7a23
--- /dev/null
+++ b/include/eigen3/Eigen/src/LU/arch/Inverse_SSE.h
@@ -0,0 +1,329 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2001 Intel Corporation
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// The SSE code for the 4x4 float and double matrix inverse in this file
+// comes from the following Intel's library:
+// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
+//
+// Here is the respective copyright and license statement:
+//
+//   Copyright (c) 2001 Intel Corporation.
+//
+// Permition is granted to use, copy, distribute and prepare derivative works
+// of this library for any purpose and without fee, provided, that the above
+// copyright notice and this statement appear in all copies.
+// Intel makes no representations about the suitability of this software for
+// any purpose, and specifically disclaims all warranties.
+// See LEGAL.TXT for all the legal information.
+
+#ifndef EIGEN_INVERSE_SSE_H
+#define EIGEN_INVERSE_SSE_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
+{
+  enum {
+    MatrixAlignment     = bool(MatrixType::Flags&AlignedBit),
+    ResultAlignment     = bool(ResultType::Flags&AlignedBit),
+    StorageOrdersMatch  = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
+  };
+  
+  static void run(const MatrixType& matrix, ResultType& result)
+  {
+    EIGEN_ALIGN16 const unsigned int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 };
+
+    // Load the full matrix into registers
+    __m128 _L1 = matrix.template packet<MatrixAlignment>( 0);
+    __m128 _L2 = matrix.template packet<MatrixAlignment>( 4);
+    __m128 _L3 = matrix.template packet<MatrixAlignment>( 8);
+    __m128 _L4 = matrix.template packet<MatrixAlignment>(12);
+
+    // The inverse is calculated using "Divide and Conquer" technique. The
+    // original matrix is divide into four 2x2 sub-matrices. Since each
+    // register holds four matrix element, the smaller matrices are
+    // represented as a registers. Hence we get a better locality of the
+    // calculations.
+
+    __m128 A, B, C, D; // the four sub-matrices
+    if(!StorageOrdersMatch)
+    {
+      A = _mm_unpacklo_ps(_L1, _L2);
+      B = _mm_unpacklo_ps(_L3, _L4);
+      C = _mm_unpackhi_ps(_L1, _L2);
+      D = _mm_unpackhi_ps(_L3, _L4);
+    }
+    else
+    {
+      A = _mm_movelh_ps(_L1, _L2);
+      B = _mm_movehl_ps(_L2, _L1);
+      C = _mm_movelh_ps(_L3, _L4);
+      D = _mm_movehl_ps(_L4, _L3);
+    }
+
+    __m128 iA, iB, iC, iD,                 // partial inverse of the sub-matrices
+            DC, AB;
+    __m128 dA, dB, dC, dD;                 // determinant of the sub-matrices
+    __m128 det, d, d1, d2;
+    __m128 rd;                             // reciprocal of the determinant
+
+    //  AB = A# * B
+    AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B);
+    AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E)));
+    //  DC = D# * C
+    DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C);
+    DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E)));
+
+    //  dA = |A|
+    dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A);
+    dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA));
+    //  dB = |B|
+    dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B);
+    dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB));
+
+    //  dC = |C|
+    dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C);
+    dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC));
+    //  dD = |D|
+    dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D);
+    dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD));
+
+    //  d = trace(AB*DC) = trace(A#*B*D#*C)
+    d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB);
+
+    //  iD = C*A#*B
+    iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB));
+    iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB)));
+    //  iA = B*D#*C
+    iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC));
+    iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC)));
+
+    //  d = trace(AB*DC) = trace(A#*B*D#*C) [continue]
+    d  = _mm_add_ps(d, _mm_movehl_ps(d, d));
+    d  = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1));
+    d1 = _mm_mul_ss(dA,dD);
+    d2 = _mm_mul_ss(dB,dC);
+
+    //  iD = D*|A| - C*A#*B
+    iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD);
+
+    //  iA = A*|D| - B*D#*C;
+    iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA);
+
+    //  det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
+    det = _mm_sub_ss(_mm_add_ss(d1,d2),d);
+    rd  = _mm_div_ss(_mm_set_ss(1.0f), det);
+
+//     #ifdef ZERO_SINGULAR
+//         rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd);
+//     #endif
+
+    //  iB = D * (A#B)# = D*B#*A
+    iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33));
+    iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66)));
+    //  iC = A * (D#C)# = A*C#*D
+    iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33));
+    iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66)));
+
+    rd = _mm_shuffle_ps(rd,rd,0);
+    rd = _mm_xor_ps(rd, _mm_load_ps((float*)_Sign_PNNP));
+
+    //  iB = C*|B| - D*B#*A
+    iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB);
+
+    //  iC = B*|C| - A*C#*D;
+    iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC);
+
+    //  iX = iX / det
+    iA = _mm_mul_ps(rd,iA);
+    iB = _mm_mul_ps(rd,iB);
+    iC = _mm_mul_ps(rd,iC);
+    iD = _mm_mul_ps(rd,iD);
+
+    result.template writePacket<ResultAlignment>( 0, _mm_shuffle_ps(iA,iB,0x77));
+    result.template writePacket<ResultAlignment>( 4, _mm_shuffle_ps(iA,iB,0x22));
+    result.template writePacket<ResultAlignment>( 8, _mm_shuffle_ps(iC,iD,0x77));
+    result.template writePacket<ResultAlignment>(12, _mm_shuffle_ps(iC,iD,0x22));
+  }
+
+};
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType>
+{
+  enum {
+    MatrixAlignment = bool(MatrixType::Flags&AlignedBit),
+    ResultAlignment = bool(ResultType::Flags&AlignedBit),
+    StorageOrdersMatch  = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
+  };
+  static void run(const MatrixType& matrix, ResultType& result)
+  {
+    const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
+    const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
+
+    // The inverse is calculated using "Divide and Conquer" technique. The
+    // original matrix is divide into four 2x2 sub-matrices. Since each
+    // register of the matrix holds two element, the smaller matrices are
+    // consisted of two registers. Hence we get a better locality of the
+    // calculations.
+
+    // the four sub-matrices
+    __m128d A1, A2, B1, B2, C1, C2, D1, D2;
+    
+    if(StorageOrdersMatch)
+    {
+      A1 = matrix.template packet<MatrixAlignment>( 0); B1 = matrix.template packet<MatrixAlignment>( 2);
+      A2 = matrix.template packet<MatrixAlignment>( 4); B2 = matrix.template packet<MatrixAlignment>( 6);
+      C1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
+      C2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
+    }
+    else
+    {
+      __m128d tmp;
+      A1 = matrix.template packet<MatrixAlignment>( 0); C1 = matrix.template packet<MatrixAlignment>( 2);
+      A2 = matrix.template packet<MatrixAlignment>( 4); C2 = matrix.template packet<MatrixAlignment>( 6);
+      tmp = A1;
+      A1 = _mm_unpacklo_pd(A1,A2);
+      A2 = _mm_unpackhi_pd(tmp,A2);
+      tmp = C1;
+      C1 = _mm_unpacklo_pd(C1,C2);
+      C2 = _mm_unpackhi_pd(tmp,C2);
+      
+      B1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
+      B2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
+      tmp = B1;
+      B1 = _mm_unpacklo_pd(B1,B2);
+      B2 = _mm_unpackhi_pd(tmp,B2);
+      tmp = D1;
+      D1 = _mm_unpacklo_pd(D1,D2);
+      D2 = _mm_unpackhi_pd(tmp,D2);
+    }
+    
+    __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2,     // partial invese of the sub-matrices
+            DC1, DC2, AB1, AB2;
+    __m128d dA, dB, dC, dD;     // determinant of the sub-matrices
+    __m128d det, d1, d2, rd;
+
+    //  dA = |A|
+    dA = _mm_shuffle_pd(A2, A2, 1);
+    dA = _mm_mul_pd(A1, dA);
+    dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3));
+    //  dB = |B|
+    dB = _mm_shuffle_pd(B2, B2, 1);
+    dB = _mm_mul_pd(B1, dB);
+    dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3));
+
+    //  AB = A# * B
+    AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3));
+    AB2 = _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,0));
+    AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3)));
+    AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0)));
+
+    //  dC = |C|
+    dC = _mm_shuffle_pd(C2, C2, 1);
+    dC = _mm_mul_pd(C1, dC);
+    dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3));
+    //  dD = |D|
+    dD = _mm_shuffle_pd(D2, D2, 1);
+    dD = _mm_mul_pd(D1, dD);
+    dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3));
+
+    //  DC = D# * C
+    DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3));
+    DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0));
+    DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3)));
+    DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0)));
+
+    //  rd = trace(AB*DC) = trace(A#*B*D#*C)
+    d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0));
+    d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3));
+    rd = _mm_add_pd(d1, d2);
+    rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3));
+
+    //  iD = C*A#*B
+    iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0));
+    iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0));
+    iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3)));
+    iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3)));
+
+    //  iA = B*D#*C
+    iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0));
+    iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0));
+    iA1 = _mm_add_pd(iA1, _mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3)));
+    iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3)));
+
+    //  iD = D*|A| - C*A#*B
+    dA = _mm_shuffle_pd(dA,dA,0);
+    iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1);
+    iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2);
+
+    //  iA = A*|D| - B*D#*C;
+    dD = _mm_shuffle_pd(dD,dD,0);
+    iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1);
+    iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2);
+
+    d1 = _mm_mul_sd(dA, dD);
+    d2 = _mm_mul_sd(dB, dC);
+
+    //  iB = D * (A#B)# = D*B#*A
+    iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1));
+    iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1));
+    iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2)));
+    iB2 = _mm_sub_pd(iB2, _mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2)));
+
+    //  det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
+    det = _mm_add_sd(d1, d2);
+    det = _mm_sub_sd(det, rd);
+
+    //  iC = A * (D#C)# = A*C#*D
+    iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1));
+    iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1));
+    iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2)));
+    iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2)));
+
+    rd = _mm_div_sd(_mm_set_sd(1.0), det);
+//     #ifdef ZERO_SINGULAR
+//         rd = _mm_and_pd(_mm_cmpneq_sd(det,_mm_setzero_pd()), rd);
+//     #endif
+    rd = _mm_shuffle_pd(rd,rd,0);
+
+    //  iB = C*|B| - D*B#*A
+    dB = _mm_shuffle_pd(dB,dB,0);
+    iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1);
+    iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2);
+
+    d1 = _mm_xor_pd(rd, _Sign_PN);
+    d2 = _mm_xor_pd(rd, _Sign_NP);
+
+    //  iC = B*|C| - A*C#*D;
+    dC = _mm_shuffle_pd(dC,dC,0);
+    iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1);
+    iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2);
+
+    result.template writePacket<ResultAlignment>( 0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));     // iA# / det
+    result.template writePacket<ResultAlignment>( 4, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
+    result.template writePacket<ResultAlignment>( 2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));     // iB# / det
+    result.template writePacket<ResultAlignment>( 6, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
+    result.template writePacket<ResultAlignment>( 8, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));     // iC# / det
+    result.template writePacket<ResultAlignment>(12, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
+    result.template writePacket<ResultAlignment>(10, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));     // iD# / det
+    result.template writePacket<ResultAlignment>(14, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_INVERSE_SSE_H
diff --git a/include/eigen3/Eigen/src/MetisSupport/MetisSupport.h b/include/eigen3/Eigen/src/MetisSupport/MetisSupport.h
new file mode 100644
index 0000000..f2bbef2
--- /dev/null
+++ b/include/eigen3/Eigen/src/MetisSupport/MetisSupport.h
@@ -0,0 +1,137 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef METIS_SUPPORT_H
+#define METIS_SUPPORT_H
+
+namespace Eigen {
+/**
+ * Get the fill-reducing ordering from the METIS package
+ * 
+ * If A is the original matrix and Ap is the permuted matrix, 
+ * the fill-reducing permutation is defined as follows :
+ * Row (column) i of A is the matperm(i) row (column) of Ap. 
+ * WARNING: As computed by METIS, this corresponds to the vector iperm (instead of perm)
+ */
+template <typename Index>
+class MetisOrdering
+{
+public:
+  typedef PermutationMatrix<Dynamic,Dynamic,Index> PermutationType;
+  typedef Matrix<Index,Dynamic,1> IndexVector; 
+  
+  template <typename MatrixType>
+  void get_symmetrized_graph(const MatrixType& A)
+  {
+    Index m = A.cols(); 
+    eigen_assert((A.rows() == A.cols()) && "ONLY FOR SQUARED MATRICES");
+    // Get the transpose of the input matrix 
+    MatrixType At = A.transpose(); 
+    // Get the number of nonzeros elements in each row/col of At+A
+    Index TotNz = 0; 
+    IndexVector visited(m); 
+    visited.setConstant(-1); 
+    for (int j = 0; j < m; j++)
+    {
+      // Compute the union structure of of A(j,:) and At(j,:)
+      visited(j) = j; // Do not include the diagonal element
+      // Get the nonzeros in row/column j of A
+      for (typename MatrixType::InnerIterator it(A, j); it; ++it)
+      {
+        Index idx = it.index(); // Get the row index (for column major) or column index (for row major)
+        if (visited(idx) != j ) 
+        {
+          visited(idx) = j; 
+          ++TotNz; 
+        }
+      }
+      //Get the nonzeros in row/column j of At
+      for (typename MatrixType::InnerIterator it(At, j); it; ++it)
+      {
+        Index idx = it.index(); 
+        if(visited(idx) != j)
+        {
+          visited(idx) = j; 
+          ++TotNz; 
+        }
+      }
+    }
+    // Reserve place for A + At
+    m_indexPtr.resize(m+1);
+    m_innerIndices.resize(TotNz); 
+
+    // Now compute the real adjacency list of each column/row 
+    visited.setConstant(-1); 
+    Index CurNz = 0; 
+    for (int j = 0; j < m; j++)
+    {
+      m_indexPtr(j) = CurNz; 
+      
+      visited(j) = j; // Do not include the diagonal element
+      // Add the pattern of row/column j of A to A+At
+      for (typename MatrixType::InnerIterator it(A,j); it; ++it)
+      {
+        Index idx = it.index(); // Get the row index (for column major) or column index (for row major)
+        if (visited(idx) != j ) 
+        {
+          visited(idx) = j; 
+          m_innerIndices(CurNz) = idx; 
+          CurNz++; 
+        }
+      }
+      //Add the pattern of row/column j of At to A+At
+      for (typename MatrixType::InnerIterator it(At, j); it; ++it)
+      {
+        Index idx = it.index(); 
+        if(visited(idx) != j)
+        {
+          visited(idx) = j; 
+          m_innerIndices(CurNz) = idx; 
+          ++CurNz; 
+        }
+      }
+    }
+    m_indexPtr(m) = CurNz;    
+  }
+  
+  template <typename MatrixType>
+  void operator() (const MatrixType& A, PermutationType& matperm)
+  {
+     Index m = A.cols();
+     IndexVector perm(m),iperm(m); 
+    // First, symmetrize the matrix graph. 
+     get_symmetrized_graph(A); 
+     int output_error;
+     
+     // Call the fill-reducing routine from METIS 
+     output_error = METIS_NodeND(&m, m_indexPtr.data(), m_innerIndices.data(), NULL, NULL, perm.data(), iperm.data());
+     
+    if(output_error != METIS_OK) 
+    {
+      //FIXME The ordering interface should define a class of possible errors 
+     std::cerr << "ERROR WHILE CALLING THE METIS PACKAGE \n"; 
+     return; 
+    }
+    
+    // Get the fill-reducing permutation 
+    //NOTE:  If Ap is the permuted matrix then perm and iperm vectors are defined as follows 
+    // Row (column) i of Ap is the perm(i) row(column) of A, and row (column) i of A is the iperm(i) row(column) of Ap
+    
+     matperm.resize(m);
+     for (int j = 0; j < m; j++)
+       matperm.indices()(iperm(j)) = j;
+   
+  }
+  
+  protected:
+    IndexVector m_indexPtr; // Pointer to the adjacenccy list of each row/column
+    IndexVector m_innerIndices; // Adjacency list 
+};
+
+}// end namespace eigen 
+#endif
diff --git a/include/eigen3/Eigen/src/OrderingMethods/Amd.h b/include/eigen3/Eigen/src/OrderingMethods/Amd.h
new file mode 100644
index 0000000..41b4fd7
--- /dev/null
+++ b/include/eigen3/Eigen/src/OrderingMethods/Amd.h
@@ -0,0 +1,435 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+
+/*
+
+NOTE: this routine has been adapted from the CSparse library:
+
+Copyright (c) 2006, Timothy A. Davis.
+http://www.cise.ufl.edu/research/sparse/CSparse
+
+CSparse is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+CSparse is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this Module; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+*/
+
+#include "../Core/util/NonMPL2.h"
+
+#ifndef EIGEN_SPARSE_AMD_H
+#define EIGEN_SPARSE_AMD_H
+
+namespace Eigen { 
+
+namespace internal {
+  
+template<typename T> inline T amd_flip(const T& i) { return -i-2; }
+template<typename T> inline T amd_unflip(const T& i) { return i<0 ? amd_flip(i) : i; }
+template<typename T0, typename T1> inline bool amd_marked(const T0* w, const T1& j) { return w[j]<0; }
+template<typename T0, typename T1> inline void amd_mark(const T0* w, const T1& j) { return w[j] = amd_flip(w[j]); }
+
+/* clear w */
+template<typename Index>
+static int cs_wclear (Index mark, Index lemax, Index *w, Index n)
+{
+  Index k;
+  if(mark < 2 || (mark + lemax < 0))
+  {
+    for(k = 0; k < n; k++)
+      if(w[k] != 0)
+        w[k] = 1;
+    mark = 2;
+  }
+  return (mark);     /* at this point, w[0..n-1] < mark holds */
+}
+
+/* depth-first search and postorder of a tree rooted at node j */
+template<typename Index>
+Index cs_tdfs(Index j, Index k, Index *head, const Index *next, Index *post, Index *stack)
+{
+  int i, p, top = 0;
+  if(!head || !next || !post || !stack) return (-1);    /* check inputs */
+  stack[0] = j;                 /* place j on the stack */
+  while (top >= 0)                /* while (stack is not empty) */
+  {
+    p = stack[top];           /* p = top of stack */
+    i = head[p];              /* i = youngest child of p */
+    if(i == -1)
+    {
+      top--;                 /* p has no unordered children left */
+      post[k++] = p;        /* node p is the kth postordered node */
+    }
+    else
+    {
+      head[p] = next[i];   /* remove i from children of p */
+      stack[++top] = i;     /* start dfs on child node i */
+    }
+  }
+  return k;
+}
+
+
+/** \internal
+  * \ingroup OrderingMethods_Module 
+  * Approximate minimum degree ordering algorithm.
+  * \returns the permutation P reducing the fill-in of the input matrix \a C
+  * The input matrix \a C must be a selfadjoint compressed column major SparseMatrix object. Both the upper and lower parts have to be stored, but the diagonal entries are optional.
+  * On exit the values of C are destroyed */
+template<typename Scalar, typename Index>
+void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, PermutationMatrix<Dynamic,Dynamic,Index>& perm)
+{
+  using std::sqrt;
+  
+  int d, dk, dext, lemax = 0, e, elenk, eln, i, j, k, k1,
+      k2, k3, jlast, ln, dense, nzmax, mindeg = 0, nvi, nvj, nvk, mark, wnvi,
+      ok, nel = 0, p, p1, p2, p3, p4, pj, pk, pk1, pk2, pn, q, t;
+  unsigned int h;
+  
+  Index n = C.cols();
+  dense = std::max<Index> (16, Index(10 * sqrt(double(n))));   /* find dense threshold */
+  dense = std::min<Index> (n-2, dense);
+  
+  Index cnz = C.nonZeros();
+  perm.resize(n+1);
+  t = cnz + cnz/5 + 2*n;                 /* add elbow room to C */
+  C.resizeNonZeros(t);
+  
+  Index* W       = new Index[8*(n+1)]; /* get workspace */
+  Index* len     = W;
+  Index* nv      = W +   (n+1);
+  Index* next    = W + 2*(n+1);
+  Index* head    = W + 3*(n+1);
+  Index* elen    = W + 4*(n+1);
+  Index* degree  = W + 5*(n+1);
+  Index* w       = W + 6*(n+1);
+  Index* hhead   = W + 7*(n+1);
+  Index* last    = perm.indices().data();                              /* use P as workspace for last */
+  
+  /* --- Initialize quotient graph ---------------------------------------- */
+  Index* Cp = C.outerIndexPtr();
+  Index* Ci = C.innerIndexPtr();
+  for(k = 0; k < n; k++)
+    len[k] = Cp[k+1] - Cp[k];
+  len[n] = 0;
+  nzmax = t;
+  
+  for(i = 0; i <= n; i++)
+  {
+    head[i]   = -1;                     // degree list i is empty
+    last[i]   = -1;
+    next[i]   = -1;
+    hhead[i]  = -1;                     // hash list i is empty 
+    nv[i]     = 1;                      // node i is just one node
+    w[i]      = 1;                      // node i is alive
+    elen[i]   = 0;                      // Ek of node i is empty
+    degree[i] = len[i];                 // degree of node i
+  }
+  mark = internal::cs_wclear<Index>(0, 0, w, n);         /* clear w */
+  elen[n] = -2;                         /* n is a dead element */
+  Cp[n] = -1;                           /* n is a root of assembly tree */
+  w[n] = 0;                             /* n is a dead element */
+  
+  /* --- Initialize degree lists ------------------------------------------ */
+  for(i = 0; i < n; i++)
+  {
+    d = degree[i];
+    if(d == 0)                         /* node i is empty */
+    {
+      elen[i] = -2;                 /* element i is dead */
+      nel++;
+      Cp[i] = -1;                   /* i is a root of assembly tree */
+      w[i] = 0;
+    }
+    else if(d > dense)                 /* node i is dense */
+    {
+      nv[i] = 0;                    /* absorb i into element n */
+      elen[i] = -1;                 /* node i is dead */
+      nel++;
+      Cp[i] = amd_flip (n);
+      nv[n]++;
+    }
+    else
+    {
+      if(head[d] != -1) last[head[d]] = i;
+      next[i] = head[d];           /* put node i in degree list d */
+      head[d] = i;
+    }
+  }
+  
+  while (nel < n)                         /* while (selecting pivots) do */
+  {
+    /* --- Select node of minimum approximate degree -------------------- */
+    for(k = -1; mindeg < n && (k = head[mindeg]) == -1; mindeg++) {}
+    if(next[k] != -1) last[next[k]] = -1;
+    head[mindeg] = next[k];          /* remove k from degree list */
+    elenk = elen[k];                  /* elenk = |Ek| */
+    nvk = nv[k];                      /* # of nodes k represents */
+    nel += nvk;                        /* nv[k] nodes of A eliminated */
+    
+    /* --- Garbage collection ------------------------------------------- */
+    if(elenk > 0 && cnz + mindeg >= nzmax)
+    {
+      for(j = 0; j < n; j++)
+      {
+        if((p = Cp[j]) >= 0)      /* j is a live node or element */
+        {
+          Cp[j] = Ci[p];          /* save first entry of object */
+          Ci[p] = amd_flip (j);    /* first entry is now amd_flip(j) */
+        }
+      }
+      for(q = 0, p = 0; p < cnz; ) /* scan all of memory */
+      {
+        if((j = amd_flip (Ci[p++])) >= 0)  /* found object j */
+        {
+          Ci[q] = Cp[j];       /* restore first entry of object */
+          Cp[j] = q++;          /* new pointer to object j */
+          for(k3 = 0; k3 < len[j]-1; k3++) Ci[q++] = Ci[p++];
+        }
+      }
+      cnz = q;                       /* Ci[cnz...nzmax-1] now free */
+    }
+    
+    /* --- Construct new element ---------------------------------------- */
+    dk = 0;
+    nv[k] = -nvk;                     /* flag k as in Lk */
+    p = Cp[k];
+    pk1 = (elenk == 0) ? p : cnz;      /* do in place if elen[k] == 0 */
+    pk2 = pk1;
+    for(k1 = 1; k1 <= elenk + 1; k1++)
+    {
+      if(k1 > elenk)
+      {
+        e = k;                     /* search the nodes in k */
+        pj = p;                    /* list of nodes starts at Ci[pj]*/
+        ln = len[k] - elenk;      /* length of list of nodes in k */
+      }
+      else
+      {
+        e = Ci[p++];              /* search the nodes in e */
+        pj = Cp[e];
+        ln = len[e];              /* length of list of nodes in e */
+      }
+      for(k2 = 1; k2 <= ln; k2++)
+      {
+        i = Ci[pj++];
+        if((nvi = nv[i]) <= 0) continue; /* node i dead, or seen */
+        dk += nvi;                 /* degree[Lk] += size of node i */
+        nv[i] = -nvi;             /* negate nv[i] to denote i in Lk*/
+        Ci[pk2++] = i;            /* place i in Lk */
+        if(next[i] != -1) last[next[i]] = last[i];
+        if(last[i] != -1)         /* remove i from degree list */
+        {
+          next[last[i]] = next[i];
+        }
+        else
+        {
+          head[degree[i]] = next[i];
+        }
+      }
+      if(e != k)
+      {
+        Cp[e] = amd_flip (k);      /* absorb e into k */
+        w[e] = 0;                 /* e is now a dead element */
+      }
+    }
+    if(elenk != 0) cnz = pk2;         /* Ci[cnz...nzmax] is free */
+    degree[k] = dk;                   /* external degree of k - |Lk\i| */
+    Cp[k] = pk1;                      /* element k is in Ci[pk1..pk2-1] */
+    len[k] = pk2 - pk1;
+    elen[k] = -2;                     /* k is now an element */
+    
+    /* --- Find set differences ----------------------------------------- */
+    mark = internal::cs_wclear<Index>(mark, lemax, w, n);  /* clear w if necessary */
+    for(pk = pk1; pk < pk2; pk++)    /* scan 1: find |Le\Lk| */
+    {
+      i = Ci[pk];
+      if((eln = elen[i]) <= 0) continue;/* skip if elen[i] empty */
+      nvi = -nv[i];                      /* nv[i] was negated */
+      wnvi = mark - nvi;
+      for(p = Cp[i]; p <= Cp[i] + eln - 1; p++)  /* scan Ei */
+      {
+        e = Ci[p];
+        if(w[e] >= mark)
+        {
+          w[e] -= nvi;          /* decrement |Le\Lk| */
+        }
+        else if(w[e] != 0)        /* ensure e is a live element */
+        {
+          w[e] = degree[e] + wnvi; /* 1st time e seen in scan 1 */
+        }
+      }
+    }
+    
+    /* --- Degree update ------------------------------------------------ */
+    for(pk = pk1; pk < pk2; pk++)    /* scan2: degree update */
+    {
+      i = Ci[pk];                   /* consider node i in Lk */
+      p1 = Cp[i];
+      p2 = p1 + elen[i] - 1;
+      pn = p1;
+      for(h = 0, d = 0, p = p1; p <= p2; p++)    /* scan Ei */
+      {
+        e = Ci[p];
+        if(w[e] != 0)             /* e is an unabsorbed element */
+        {
+          dext = w[e] - mark;   /* dext = |Le\Lk| */
+          if(dext > 0)
+          {
+            d += dext;         /* sum up the set differences */
+            Ci[pn++] = e;     /* keep e in Ei */
+            h += e;            /* compute the hash of node i */
+          }
+          else
+          {
+            Cp[e] = amd_flip (k);  /* aggressive absorb. e->k */
+            w[e] = 0;             /* e is a dead element */
+          }
+        }
+      }
+      elen[i] = pn - p1 + 1;        /* elen[i] = |Ei| */
+      p3 = pn;
+      p4 = p1 + len[i];
+      for(p = p2 + 1; p < p4; p++) /* prune edges in Ai */
+      {
+        j = Ci[p];
+        if((nvj = nv[j]) <= 0) continue; /* node j dead or in Lk */
+        d += nvj;                  /* degree(i) += |j| */
+        Ci[pn++] = j;             /* place j in node list of i */
+        h += j;                    /* compute hash for node i */
+      }
+      if(d == 0)                     /* check for mass elimination */
+      {
+        Cp[i] = amd_flip (k);      /* absorb i into k */
+        nvi = -nv[i];
+        dk -= nvi;                 /* |Lk| -= |i| */
+        nvk += nvi;                /* |k| += nv[i] */
+        nel += nvi;
+        nv[i] = 0;
+        elen[i] = -1;             /* node i is dead */
+      }
+      else
+      {
+        degree[i] = std::min<Index> (degree[i], d);   /* update degree(i) */
+        Ci[pn] = Ci[p3];         /* move first node to end */
+        Ci[p3] = Ci[p1];         /* move 1st el. to end of Ei */
+        Ci[p1] = k;               /* add k as 1st element in of Ei */
+        len[i] = pn - p1 + 1;     /* new len of adj. list of node i */
+        h %= n;                    /* finalize hash of i */
+        next[i] = hhead[h];      /* place i in hash bucket */
+        hhead[h] = i;
+        last[i] = h;              /* save hash of i in last[i] */
+      }
+    }                                   /* scan2 is done */
+    degree[k] = dk;                   /* finalize |Lk| */
+    lemax = std::max<Index>(lemax, dk);
+    mark = internal::cs_wclear<Index>(mark+lemax, lemax, w, n);    /* clear w */
+    
+    /* --- Supernode detection ------------------------------------------ */
+    for(pk = pk1; pk < pk2; pk++)
+    {
+      i = Ci[pk];
+      if(nv[i] >= 0) continue;         /* skip if i is dead */
+      h = last[i];                      /* scan hash bucket of node i */
+      i = hhead[h];
+      hhead[h] = -1;                    /* hash bucket will be empty */
+      for(; i != -1 && next[i] != -1; i = next[i], mark++)
+      {
+        ln = len[i];
+        eln = elen[i];
+        for(p = Cp[i]+1; p <= Cp[i] + ln-1; p++) w[Ci[p]] = mark;
+        jlast = i;
+        for(j = next[i]; j != -1; ) /* compare i with all j */
+        {
+          ok = (len[j] == ln) && (elen[j] == eln);
+          for(p = Cp[j] + 1; ok && p <= Cp[j] + ln - 1; p++)
+          {
+            if(w[Ci[p]] != mark) ok = 0;    /* compare i and j*/
+          }
+          if(ok)                     /* i and j are identical */
+          {
+            Cp[j] = amd_flip (i);  /* absorb j into i */
+            nv[i] += nv[j];
+            nv[j] = 0;
+            elen[j] = -1;         /* node j is dead */
+            j = next[j];          /* delete j from hash bucket */
+            next[jlast] = j;
+          }
+          else
+          {
+            jlast = j;             /* j and i are different */
+            j = next[j];
+          }
+        }
+      }
+    }
+    
+    /* --- Finalize new element------------------------------------------ */
+    for(p = pk1, pk = pk1; pk < pk2; pk++)   /* finalize Lk */
+    {
+      i = Ci[pk];
+      if((nvi = -nv[i]) <= 0) continue;/* skip if i is dead */
+      nv[i] = nvi;                      /* restore nv[i] */
+      d = degree[i] + dk - nvi;         /* compute external degree(i) */
+      d = std::min<Index> (d, n - nel - nvi);
+      if(head[d] != -1) last[head[d]] = i;
+      next[i] = head[d];               /* put i back in degree list */
+      last[i] = -1;
+      head[d] = i;
+      mindeg = std::min<Index> (mindeg, d);       /* find new minimum degree */
+      degree[i] = d;
+      Ci[p++] = i;                      /* place i in Lk */
+    }
+    nv[k] = nvk;                      /* # nodes absorbed into k */
+    if((len[k] = p-pk1) == 0)         /* length of adj list of element k*/
+    {
+      Cp[k] = -1;                   /* k is a root of the tree */
+      w[k] = 0;                     /* k is now a dead element */
+    }
+    if(elenk != 0) cnz = p;           /* free unused space in Lk */
+  }
+  
+  /* --- Postordering ----------------------------------------------------- */
+  for(i = 0; i < n; i++) Cp[i] = amd_flip (Cp[i]);/* fix assembly tree */
+  for(j = 0; j <= n; j++) head[j] = -1;
+  for(j = n; j >= 0; j--)              /* place unordered nodes in lists */
+  {
+    if(nv[j] > 0) continue;          /* skip if j is an element */
+    next[j] = head[Cp[j]];          /* place j in list of its parent */
+    head[Cp[j]] = j;
+  }
+  for(e = n; e >= 0; e--)              /* place elements in lists */
+  {
+    if(nv[e] <= 0) continue;         /* skip unless e is an element */
+    if(Cp[e] != -1)
+    {
+      next[e] = head[Cp[e]];      /* place e in list of its parent */
+      head[Cp[e]] = e;
+    }
+  }
+  for(k = 0, i = 0; i <= n; i++)       /* postorder the assembly tree */
+  {
+    if(Cp[i] == -1) k = internal::cs_tdfs<Index>(i, k, head, next, perm.indices().data(), w);
+  }
+  
+  perm.indices().conservativeResize(n);
+
+  delete[] W;
+}
+
+} // namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_AMD_H
diff --git a/include/eigen3/Eigen/src/OrderingMethods/Eigen_Colamd.h b/include/eigen3/Eigen/src/OrderingMethods/Eigen_Colamd.h
new file mode 100644
index 0000000..44548f6
--- /dev/null
+++ b/include/eigen3/Eigen/src/OrderingMethods/Eigen_Colamd.h
@@ -0,0 +1,1850 @@
+// // This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Desire Nuentsa Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This file is modified from the colamd/symamd library. The copyright is below
+
+//   The authors of the code itself are Stefan I. Larimore and Timothy A.
+//   Davis (davis at cise.ufl.edu), University of Florida.  The algorithm was
+//   developed in collaboration with John Gilbert, Xerox PARC, and Esmond
+//   Ng, Oak Ridge National Laboratory.
+// 
+//     Date:
+// 
+//   September 8, 2003.  Version 2.3.
+// 
+//     Acknowledgements:
+// 
+//   This work was supported by the National Science Foundation, under
+//   grants DMS-9504974 and DMS-9803599.
+// 
+//     Notice:
+// 
+//   Copyright (c) 1998-2003 by the University of Florida.
+//   All Rights Reserved.
+// 
+//   THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+//   EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+// 
+//   Permission is hereby granted to use, copy, modify, and/or distribute
+//   this program, provided that the Copyright, this License, and the
+//   Availability of the original version is retained on all copies and made
+//   accessible to the end-user of any code or package that includes COLAMD
+//   or any modified version of COLAMD. 
+// 
+//     Availability:
+// 
+//   The colamd/symamd library is available at
+// 
+//       http://www.cise.ufl.edu/research/sparse/colamd/
+
+//   This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h
+//   file.  It is required by the colamd.c, colamdmex.c, and symamdmex.c
+//   files, and by any C code that calls the routines whose prototypes are
+//   listed below, or that uses the colamd/symamd definitions listed below.
+  
+#ifndef EIGEN_COLAMD_H
+#define EIGEN_COLAMD_H
+
+namespace internal {
+/* Ensure that debugging is turned off: */
+#ifndef COLAMD_NDEBUG
+#define COLAMD_NDEBUG
+#endif /* NDEBUG */
+/* ========================================================================== */
+/* === Knob and statistics definitions ====================================== */
+/* ========================================================================== */
+
+/* size of the knobs [ ] array.  Only knobs [0..1] are currently used. */
+#define COLAMD_KNOBS 20
+
+/* number of output statistics.  Only stats [0..6] are currently used. */
+#define COLAMD_STATS 20 
+
+/* knobs [0] and stats [0]: dense row knob and output statistic. */
+#define COLAMD_DENSE_ROW 0
+
+/* knobs [1] and stats [1]: dense column knob and output statistic. */
+#define COLAMD_DENSE_COL 1
+
+/* stats [2]: memory defragmentation count output statistic */
+#define COLAMD_DEFRAG_COUNT 2
+
+/* stats [3]: colamd status:  zero OK, > 0 warning or notice, < 0 error */
+#define COLAMD_STATUS 3
+
+/* stats [4..6]: error info, or info on jumbled columns */ 
+#define COLAMD_INFO1 4
+#define COLAMD_INFO2 5
+#define COLAMD_INFO3 6
+
+/* error codes returned in stats [3]: */
+#define COLAMD_OK       (0)
+#define COLAMD_OK_BUT_JUMBLED     (1)
+#define COLAMD_ERROR_A_not_present    (-1)
+#define COLAMD_ERROR_p_not_present    (-2)
+#define COLAMD_ERROR_nrow_negative    (-3)
+#define COLAMD_ERROR_ncol_negative    (-4)
+#define COLAMD_ERROR_nnz_negative   (-5)
+#define COLAMD_ERROR_p0_nonzero     (-6)
+#define COLAMD_ERROR_A_too_small    (-7)
+#define COLAMD_ERROR_col_length_negative  (-8)
+#define COLAMD_ERROR_row_index_out_of_bounds  (-9)
+#define COLAMD_ERROR_out_of_memory    (-10)
+#define COLAMD_ERROR_internal_error   (-999)
+
+/* ========================================================================== */
+/* === Definitions ========================================================== */
+/* ========================================================================== */
+
+#define COLAMD_MAX(a,b) (((a) > (b)) ? (a) : (b))
+#define COLAMD_MIN(a,b) (((a) < (b)) ? (a) : (b))
+
+#define ONES_COMPLEMENT(r) (-(r)-1)
+
+/* -------------------------------------------------------------------------- */
+
+#define COLAMD_EMPTY (-1)
+
+/* Row and column status */
+#define ALIVE (0)
+#define DEAD  (-1)
+
+/* Column status */
+#define DEAD_PRINCIPAL    (-1)
+#define DEAD_NON_PRINCIPAL  (-2)
+
+/* Macros for row and column status update and checking. */
+#define ROW_IS_DEAD(r)      ROW_IS_MARKED_DEAD (Row[r].shared2.mark)
+#define ROW_IS_MARKED_DEAD(row_mark)  (row_mark < ALIVE)
+#define ROW_IS_ALIVE(r)     (Row [r].shared2.mark >= ALIVE)
+#define COL_IS_DEAD(c)      (Col [c].start < ALIVE)
+#define COL_IS_ALIVE(c)     (Col [c].start >= ALIVE)
+#define COL_IS_DEAD_PRINCIPAL(c)  (Col [c].start == DEAD_PRINCIPAL)
+#define KILL_ROW(r)     { Row [r].shared2.mark = DEAD ; }
+#define KILL_PRINCIPAL_COL(c)   { Col [c].start = DEAD_PRINCIPAL ; }
+#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; }
+
+/* ========================================================================== */
+/* === Colamd reporting mechanism =========================================== */
+/* ========================================================================== */
+
+// == Row and Column structures ==
+template <typename Index>
+struct colamd_col
+{
+  Index start ;   /* index for A of first row in this column, or DEAD */
+  /* if column is dead */
+  Index length ;  /* number of rows in this column */
+  union
+  {
+    Index thickness ; /* number of original columns represented by this */
+    /* col, if the column is alive */
+    Index parent ;  /* parent in parent tree super-column structure, if */
+    /* the column is dead */
+  } shared1 ;
+  union
+  {
+    Index score ; /* the score used to maintain heap, if col is alive */
+    Index order ; /* pivot ordering of this column, if col is dead */
+  } shared2 ;
+  union
+  {
+    Index headhash ;  /* head of a hash bucket, if col is at the head of */
+    /* a degree list */
+    Index hash ;  /* hash value, if col is not in a degree list */
+    Index prev ;  /* previous column in degree list, if col is in a */
+    /* degree list (but not at the head of a degree list) */
+  } shared3 ;
+  union
+  {
+    Index degree_next ; /* next column, if col is in a degree list */
+    Index hash_next ;   /* next column, if col is in a hash list */
+  } shared4 ;
+  
+};
+ 
+template <typename Index>
+struct Colamd_Row
+{
+  Index start ;   /* index for A of first col in this row */
+  Index length ;  /* number of principal columns in this row */
+  union
+  {
+    Index degree ;  /* number of principal & non-principal columns in row */
+    Index p ;   /* used as a row pointer in init_rows_cols () */
+  } shared1 ;
+  union
+  {
+    Index mark ;  /* for computing set differences and marking dead rows*/
+    Index first_column ;/* first column in row (used in garbage collection) */
+  } shared2 ;
+  
+};
+ 
+/* ========================================================================== */
+/* === Colamd recommended memory size ======================================= */
+/* ========================================================================== */
+ 
+/*
+  The recommended length Alen of the array A passed to colamd is given by
+  the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro.  It returns -1 if any
+  argument is negative.  2*nnz space is required for the row and column
+  indices of the matrix. colamd_c (n_col) + colamd_r (n_row) space is
+  required for the Col and Row arrays, respectively, which are internal to
+  colamd.  An additional n_col space is the minimal amount of "elbow room",
+  and nnz/5 more space is recommended for run time efficiency.
+  
+  This macro is not needed when using symamd.
+  
+  Explicit typecast to Index added Sept. 23, 2002, COLAMD version 2.2, to avoid
+  gcc -pedantic warning messages.
+*/
+template <typename Index>
+inline Index colamd_c(Index n_col) 
+{ return Index( ((n_col) + 1) * sizeof (colamd_col<Index>) / sizeof (Index) ) ; }
+
+template <typename Index>
+inline Index  colamd_r(Index n_row)
+{ return Index(((n_row) + 1) * sizeof (Colamd_Row<Index>) / sizeof (Index)); }
+
+// Prototypes of non-user callable routines
+template <typename Index>
+static Index init_rows_cols (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> col [], Index A [], Index p [], Index stats[COLAMD_STATS] ); 
+
+template <typename Index>
+static void init_scoring (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index head [], double knobs[COLAMD_KNOBS], Index *p_n_row2, Index *p_n_col2, Index *p_max_deg);
+
+template <typename Index>
+static Index find_ordering (Index n_row, Index n_col, Index Alen, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index head [], Index n_col2, Index max_deg, Index pfree);
+
+template <typename Index>
+static void order_children (Index n_col, colamd_col<Index> Col [], Index p []);
+
+template <typename Index>
+static void detect_super_cols (colamd_col<Index> Col [], Index A [], Index head [], Index row_start, Index row_length ) ;
+
+template <typename Index>
+static Index garbage_collection (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index *pfree) ;
+
+template <typename Index>
+static inline  Index clear_mark (Index n_row, Colamd_Row<Index> Row [] ) ;
+
+/* === No debugging ========================================================= */
+
+#define COLAMD_DEBUG0(params) ;
+#define COLAMD_DEBUG1(params) ;
+#define COLAMD_DEBUG2(params) ;
+#define COLAMD_DEBUG3(params) ;
+#define COLAMD_DEBUG4(params) ;
+
+#define COLAMD_ASSERT(expression) ((void) 0)
+
+
+/**
+ * \brief Returns the recommended value of Alen 
+ * 
+ * Returns recommended value of Alen for use by colamd.  
+ * Returns -1 if any input argument is negative.  
+ * The use of this routine or macro is optional.  
+ * Note that the macro uses its arguments   more than once, 
+ * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED.  
+ * 
+ * \param nnz nonzeros in A
+ * \param n_row number of rows in A
+ * \param n_col number of columns in A
+ * \return recommended value of Alen for use by colamd
+ */
+template <typename Index>
+inline Index colamd_recommended ( Index nnz, Index n_row, Index n_col)
+{
+  if ((nnz) < 0 || (n_row) < 0 || (n_col) < 0)
+    return (-1);
+  else
+    return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); 
+}
+
+/**
+ * \brief set default parameters  The use of this routine is optional.
+ * 
+ * Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col)
+ * entries are removed prior to ordering.  Columns with more than
+ * (knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to
+ * ordering, and placed last in the output column ordering. 
+ *
+ * COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1,
+ * respectively, in colamd.h.  Default values of these two knobs
+ * are both 0.5.  Currently, only knobs [0] and knobs [1] are
+ * used, but future versions may use more knobs.  If so, they will
+ * be properly set to their defaults by the future version of
+ * colamd_set_defaults, so that the code that calls colamd will
+ * not need to change, assuming that you either use
+ * colamd_set_defaults, or pass a (double *) NULL pointer as the
+ * knobs array to colamd or symamd.
+ * 
+ * \param knobs parameter settings for colamd
+ */
+
+static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS])
+{
+  /* === Local variables ================================================== */
+  
+  int i ;
+
+  if (!knobs)
+  {
+    return ;      /* no knobs to initialize */
+  }
+  for (i = 0 ; i < COLAMD_KNOBS ; i++)
+  {
+    knobs [i] = 0 ;
+  }
+  knobs [COLAMD_DENSE_ROW] = 0.5 ;  /* ignore rows over 50% dense */
+  knobs [COLAMD_DENSE_COL] = 0.5 ;  /* ignore columns over 50% dense */
+}
+
+/** 
+ * \brief  Computes a column ordering using the column approximate minimum degree ordering
+ * 
+ * Computes a column ordering (Q) of A such that P(AQ)=LU or
+ * (AQ)'AQ=LL' have less fill-in and require fewer floating point
+ * operations than factorizing the unpermuted matrix A or A'A,
+ * respectively.
+ * 
+ * 
+ * \param n_row number of rows in A
+ * \param n_col number of columns in A
+ * \param Alen, size of the array A
+ * \param A row indices of the matrix, of size ALen
+ * \param p column pointers of A, of size n_col+1
+ * \param knobs parameter settings for colamd
+ * \param stats colamd output statistics and error codes
+ */
+template <typename Index>
+static bool colamd(Index n_row, Index n_col, Index Alen, Index *A, Index *p, double knobs[COLAMD_KNOBS], Index stats[COLAMD_STATS])
+{
+  /* === Local variables ================================================== */
+  
+  Index i ;     /* loop index */
+  Index nnz ;     /* nonzeros in A */
+  Index Row_size ;    /* size of Row [], in integers */
+  Index Col_size ;    /* size of Col [], in integers */
+  Index need ;      /* minimum required length of A */
+  Colamd_Row<Index> *Row ;   /* pointer into A of Row [0..n_row] array */
+  colamd_col<Index> *Col ;   /* pointer into A of Col [0..n_col] array */
+  Index n_col2 ;    /* number of non-dense, non-empty columns */
+  Index n_row2 ;    /* number of non-dense, non-empty rows */
+  Index ngarbage ;    /* number of garbage collections performed */
+  Index max_deg ;   /* maximum row degree */
+  double default_knobs [COLAMD_KNOBS] ; /* default knobs array */
+  
+  
+  /* === Check the input arguments ======================================== */
+  
+  if (!stats)
+  {
+    COLAMD_DEBUG0 (("colamd: stats not present\n")) ;
+    return (false) ;
+  }
+  for (i = 0 ; i < COLAMD_STATS ; i++)
+  {
+    stats [i] = 0 ;
+  }
+  stats [COLAMD_STATUS] = COLAMD_OK ;
+  stats [COLAMD_INFO1] = -1 ;
+  stats [COLAMD_INFO2] = -1 ;
+  
+  if (!A)   /* A is not present */
+  {
+    stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ;
+    COLAMD_DEBUG0 (("colamd: A not present\n")) ;
+    return (false) ;
+  }
+  
+  if (!p)   /* p is not present */
+  {
+    stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ;
+    COLAMD_DEBUG0 (("colamd: p not present\n")) ;
+    return (false) ;
+  }
+  
+  if (n_row < 0)  /* n_row must be >= 0 */
+  {
+    stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ;
+    stats [COLAMD_INFO1] = n_row ;
+    COLAMD_DEBUG0 (("colamd: nrow negative %d\n", n_row)) ;
+    return (false) ;
+  }
+  
+  if (n_col < 0)  /* n_col must be >= 0 */
+  {
+    stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ;
+    stats [COLAMD_INFO1] = n_col ;
+    COLAMD_DEBUG0 (("colamd: ncol negative %d\n", n_col)) ;
+    return (false) ;
+  }
+  
+  nnz = p [n_col] ;
+  if (nnz < 0)  /* nnz must be >= 0 */
+  {
+    stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ;
+    stats [COLAMD_INFO1] = nnz ;
+    COLAMD_DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ;
+    return (false) ;
+  }
+  
+  if (p [0] != 0)
+  {
+    stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ;
+    stats [COLAMD_INFO1] = p [0] ;
+    COLAMD_DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ;
+    return (false) ;
+  }
+  
+  /* === If no knobs, set default knobs =================================== */
+  
+  if (!knobs)
+  {
+    colamd_set_defaults (default_knobs) ;
+    knobs = default_knobs ;
+  }
+  
+  /* === Allocate the Row and Col arrays from array A ===================== */
+  
+  Col_size = colamd_c (n_col) ;
+  Row_size = colamd_r (n_row) ;
+  need = 2*nnz + n_col + Col_size + Row_size ;
+  
+  if (need > Alen)
+  {
+    /* not enough space in array A to perform the ordering */
+    stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ;
+    stats [COLAMD_INFO1] = need ;
+    stats [COLAMD_INFO2] = Alen ;
+    COLAMD_DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen));
+    return (false) ;
+  }
+  
+  Alen -= Col_size + Row_size ;
+  Col = (colamd_col<Index> *) &A [Alen] ;
+  Row = (Colamd_Row<Index> *) &A [Alen + Col_size] ;
+
+  /* === Construct the row and column data structures ===================== */
+  
+  if (!Eigen::internal::init_rows_cols (n_row, n_col, Row, Col, A, p, stats))
+  {
+    /* input matrix is invalid */
+    COLAMD_DEBUG0 (("colamd: Matrix invalid\n")) ;
+    return (false) ;
+  }
+  
+  /* === Initialize scores, kill dense rows/columns ======================= */
+
+  Eigen::internal::init_scoring (n_row, n_col, Row, Col, A, p, knobs,
+		&n_row2, &n_col2, &max_deg) ;
+  
+  /* === Order the supercolumns =========================================== */
+  
+  ngarbage = Eigen::internal::find_ordering (n_row, n_col, Alen, Row, Col, A, p,
+			    n_col2, max_deg, 2*nnz) ;
+  
+  /* === Order the non-principal columns ================================== */
+  
+  Eigen::internal::order_children (n_col, Col, p) ;
+  
+  /* === Return statistics in stats ======================================= */
+  
+  stats [COLAMD_DENSE_ROW] = n_row - n_row2 ;
+  stats [COLAMD_DENSE_COL] = n_col - n_col2 ;
+  stats [COLAMD_DEFRAG_COUNT] = ngarbage ;
+  COLAMD_DEBUG0 (("colamd: done.\n")) ; 
+  return (true) ;
+}
+
+/* ========================================================================== */
+/* === NON-USER-CALLABLE ROUTINES: ========================================== */
+/* ========================================================================== */
+
+/* There are no user-callable routines beyond this point in the file */
+
+
+/* ========================================================================== */
+/* === init_rows_cols ======================================================= */
+/* ========================================================================== */
+
+/*
+  Takes the column form of the matrix in A and creates the row form of the
+  matrix.  Also, row and column attributes are stored in the Col and Row
+  structs.  If the columns are un-sorted or contain duplicate row indices,
+  this routine will also sort and remove duplicate row indices from the
+  column form of the matrix.  Returns false if the matrix is invalid,
+  true otherwise.  Not user-callable.
+*/
+template <typename Index>
+static Index init_rows_cols  /* returns true if OK, or false otherwise */
+  (
+    /* === Parameters ======================================================= */
+
+    Index n_row,      /* number of rows of A */
+    Index n_col,      /* number of columns of A */
+    Colamd_Row<Index> Row [],    /* of size n_row+1 */
+    colamd_col<Index> Col [],    /* of size n_col+1 */
+    Index A [],     /* row indices of A, of size Alen */
+    Index p [],     /* pointers to columns in A, of size n_col+1 */
+    Index stats [COLAMD_STATS]  /* colamd statistics */ 
+    )
+{
+  /* === Local variables ================================================== */
+
+  Index col ;     /* a column index */
+  Index row ;     /* a row index */
+  Index *cp ;     /* a column pointer */
+  Index *cp_end ;   /* a pointer to the end of a column */
+  Index *rp ;     /* a row pointer */
+  Index *rp_end ;   /* a pointer to the end of a row */
+  Index last_row ;    /* previous row */
+
+  /* === Initialize columns, and check column pointers ==================== */
+
+  for (col = 0 ; col < n_col ; col++)
+  {
+    Col [col].start = p [col] ;
+    Col [col].length = p [col+1] - p [col] ;
+
+    if (Col [col].length < 0)
+    {
+      /* column pointers must be non-decreasing */
+      stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;
+      stats [COLAMD_INFO1] = col ;
+      stats [COLAMD_INFO2] = Col [col].length ;
+      COLAMD_DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ;
+      return (false) ;
+    }
+
+    Col [col].shared1.thickness = 1 ;
+    Col [col].shared2.score = 0 ;
+    Col [col].shared3.prev = COLAMD_EMPTY ;
+    Col [col].shared4.degree_next = COLAMD_EMPTY ;
+  }
+
+  /* p [0..n_col] no longer needed, used as "head" in subsequent routines */
+
+  /* === Scan columns, compute row degrees, and check row indices ========= */
+
+  stats [COLAMD_INFO3] = 0 ;  /* number of duplicate or unsorted row indices*/
+
+  for (row = 0 ; row < n_row ; row++)
+  {
+    Row [row].length = 0 ;
+    Row [row].shared2.mark = -1 ;
+  }
+
+  for (col = 0 ; col < n_col ; col++)
+  {
+    last_row = -1 ;
+
+    cp = &A [p [col]] ;
+    cp_end = &A [p [col+1]] ;
+
+    while (cp < cp_end)
+    {
+      row = *cp++ ;
+
+      /* make sure row indices within range */
+      if (row < 0 || row >= n_row)
+      {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ;
+	stats [COLAMD_INFO1] = col ;
+	stats [COLAMD_INFO2] = row ;
+	stats [COLAMD_INFO3] = n_row ;
+	COLAMD_DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ;
+	return (false) ;
+      }
+
+      if (row <= last_row || Row [row].shared2.mark == col)
+      {
+	/* row index are unsorted or repeated (or both), thus col */
+	/* is jumbled.  This is a notice, not an error condition. */
+	stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ;
+	stats [COLAMD_INFO1] = col ;
+	stats [COLAMD_INFO2] = row ;
+	(stats [COLAMD_INFO3]) ++ ;
+	COLAMD_DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col));
+      }
+
+      if (Row [row].shared2.mark != col)
+      {
+	Row [row].length++ ;
+      }
+      else
+      {
+	/* this is a repeated entry in the column, */
+	/* it will be removed */
+	Col [col].length-- ;
+      }
+
+      /* mark the row as having been seen in this column */
+      Row [row].shared2.mark = col ;
+
+      last_row = row ;
+    }
+  }
+
+  /* === Compute row pointers ============================================= */
+
+  /* row form of the matrix starts directly after the column */
+  /* form of matrix in A */
+  Row [0].start = p [n_col] ;
+  Row [0].shared1.p = Row [0].start ;
+  Row [0].shared2.mark = -1 ;
+  for (row = 1 ; row < n_row ; row++)
+  {
+    Row [row].start = Row [row-1].start + Row [row-1].length ;
+    Row [row].shared1.p = Row [row].start ;
+    Row [row].shared2.mark = -1 ;
+  }
+
+  /* === Create row form ================================================== */
+
+  if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
+  {
+    /* if cols jumbled, watch for repeated row indices */
+    for (col = 0 ; col < n_col ; col++)
+    {
+      cp = &A [p [col]] ;
+      cp_end = &A [p [col+1]] ;
+      while (cp < cp_end)
+      {
+	row = *cp++ ;
+	if (Row [row].shared2.mark != col)
+	{
+	  A [(Row [row].shared1.p)++] = col ;
+	  Row [row].shared2.mark = col ;
+	}
+      }
+    }
+  }
+  else
+  {
+    /* if cols not jumbled, we don't need the mark (this is faster) */
+    for (col = 0 ; col < n_col ; col++)
+    {
+      cp = &A [p [col]] ;
+      cp_end = &A [p [col+1]] ;
+      while (cp < cp_end)
+      {
+	A [(Row [*cp++].shared1.p)++] = col ;
+      }
+    }
+  }
+
+  /* === Clear the row marks and set row degrees ========================== */
+
+  for (row = 0 ; row < n_row ; row++)
+  {
+    Row [row].shared2.mark = 0 ;
+    Row [row].shared1.degree = Row [row].length ;
+  }
+
+  /* === See if we need to re-create columns ============================== */
+
+  if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
+  {
+    COLAMD_DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ;
+
+
+    /* === Compute col pointers ========================================= */
+
+    /* col form of the matrix starts at A [0]. */
+    /* Note, we may have a gap between the col form and the row */
+    /* form if there were duplicate entries, if so, it will be */
+    /* removed upon the first garbage collection */
+    Col [0].start = 0 ;
+    p [0] = Col [0].start ;
+    for (col = 1 ; col < n_col ; col++)
+    {
+      /* note that the lengths here are for pruned columns, i.e. */
+      /* no duplicate row indices will exist for these columns */
+      Col [col].start = Col [col-1].start + Col [col-1].length ;
+      p [col] = Col [col].start ;
+    }
+
+    /* === Re-create col form =========================================== */
+
+    for (row = 0 ; row < n_row ; row++)
+    {
+      rp = &A [Row [row].start] ;
+      rp_end = rp + Row [row].length ;
+      while (rp < rp_end)
+      {
+	A [(p [*rp++])++] = row ;
+      }
+    }
+  }
+
+  /* === Done.  Matrix is not (or no longer) jumbled ====================== */
+
+  return (true) ;
+}
+
+
+/* ========================================================================== */
+/* === init_scoring ========================================================= */
+/* ========================================================================== */
+
+/*
+  Kills dense or empty columns and rows, calculates an initial score for
+  each column, and places all columns in the degree lists.  Not user-callable.
+*/
+template <typename Index>
+static void init_scoring
+  (
+    /* === Parameters ======================================================= */
+
+    Index n_row,      /* number of rows of A */
+    Index n_col,      /* number of columns of A */
+    Colamd_Row<Index> Row [],    /* of size n_row+1 */
+    colamd_col<Index> Col [],    /* of size n_col+1 */
+    Index A [],     /* column form and row form of A */
+    Index head [],    /* of size n_col+1 */
+    double knobs [COLAMD_KNOBS],/* parameters */
+    Index *p_n_row2,    /* number of non-dense, non-empty rows */
+    Index *p_n_col2,    /* number of non-dense, non-empty columns */
+    Index *p_max_deg    /* maximum row degree */
+    )
+{
+  /* === Local variables ================================================== */
+
+  Index c ;     /* a column index */
+  Index r, row ;    /* a row index */
+  Index *cp ;     /* a column pointer */
+  Index deg ;     /* degree of a row or column */
+  Index *cp_end ;   /* a pointer to the end of a column */
+  Index *new_cp ;   /* new column pointer */
+  Index col_length ;    /* length of pruned column */
+  Index score ;     /* current column score */
+  Index n_col2 ;    /* number of non-dense, non-empty columns */
+  Index n_row2 ;    /* number of non-dense, non-empty rows */
+  Index dense_row_count ; /* remove rows with more entries than this */
+  Index dense_col_count ; /* remove cols with more entries than this */
+  Index min_score ;   /* smallest column score */
+  Index max_deg ;   /* maximum row degree */
+  Index next_col ;    /* Used to add to degree list.*/
+
+
+  /* === Extract knobs ==================================================== */
+
+  dense_row_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ;
+  dense_col_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ;
+  COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ;
+  max_deg = 0 ;
+  n_col2 = n_col ;
+  n_row2 = n_row ;
+
+  /* === Kill empty columns =============================================== */
+
+  /* Put the empty columns at the end in their natural order, so that LU */
+  /* factorization can proceed as far as possible. */
+  for (c = n_col-1 ; c >= 0 ; c--)
+  {
+    deg = Col [c].length ;
+    if (deg == 0)
+    {
+      /* this is a empty column, kill and order it last */
+      Col [c].shared2.order = --n_col2 ;
+      KILL_PRINCIPAL_COL (c) ;
+    }
+  }
+  COLAMD_DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ;
+
+  /* === Kill dense columns =============================================== */
+
+  /* Put the dense columns at the end, in their natural order */
+  for (c = n_col-1 ; c >= 0 ; c--)
+  {
+    /* skip any dead columns */
+    if (COL_IS_DEAD (c))
+    {
+      continue ;
+    }
+    deg = Col [c].length ;
+    if (deg > dense_col_count)
+    {
+      /* this is a dense column, kill and order it last */
+      Col [c].shared2.order = --n_col2 ;
+      /* decrement the row degrees */
+      cp = &A [Col [c].start] ;
+      cp_end = cp + Col [c].length ;
+      while (cp < cp_end)
+      {
+	Row [*cp++].shared1.degree-- ;
+      }
+      KILL_PRINCIPAL_COL (c) ;
+    }
+  }
+  COLAMD_DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ;
+
+  /* === Kill dense and empty rows ======================================== */
+
+  for (r = 0 ; r < n_row ; r++)
+  {
+    deg = Row [r].shared1.degree ;
+    COLAMD_ASSERT (deg >= 0 && deg <= n_col) ;
+    if (deg > dense_row_count || deg == 0)
+    {
+      /* kill a dense or empty row */
+      KILL_ROW (r) ;
+      --n_row2 ;
+    }
+    else
+    {
+      /* keep track of max degree of remaining rows */
+      max_deg = COLAMD_MAX (max_deg, deg) ;
+    }
+  }
+  COLAMD_DEBUG1 (("colamd: Dense and null rows killed: %d\n", n_row - n_row2)) ;
+
+  /* === Compute initial column scores ==================================== */
+
+  /* At this point the row degrees are accurate.  They reflect the number */
+  /* of "live" (non-dense) columns in each row.  No empty rows exist. */
+  /* Some "live" columns may contain only dead rows, however.  These are */
+  /* pruned in the code below. */
+
+  /* now find the initial matlab score for each column */
+  for (c = n_col-1 ; c >= 0 ; c--)
+  {
+    /* skip dead column */
+    if (COL_IS_DEAD (c))
+    {
+      continue ;
+    }
+    score = 0 ;
+    cp = &A [Col [c].start] ;
+    new_cp = cp ;
+    cp_end = cp + Col [c].length ;
+    while (cp < cp_end)
+    {
+      /* get a row */
+      row = *cp++ ;
+      /* skip if dead */
+      if (ROW_IS_DEAD (row))
+      {
+	continue ;
+      }
+      /* compact the column */
+      *new_cp++ = row ;
+      /* add row's external degree */
+      score += Row [row].shared1.degree - 1 ;
+      /* guard against integer overflow */
+      score = COLAMD_MIN (score, n_col) ;
+    }
+    /* determine pruned column length */
+    col_length = (Index) (new_cp - &A [Col [c].start]) ;
+    if (col_length == 0)
+    {
+      /* a newly-made null column (all rows in this col are "dense" */
+      /* and have already been killed) */
+      COLAMD_DEBUG2 (("Newly null killed: %d\n", c)) ;
+      Col [c].shared2.order = --n_col2 ;
+      KILL_PRINCIPAL_COL (c) ;
+    }
+    else
+    {
+      /* set column length and set score */
+      COLAMD_ASSERT (score >= 0) ;
+      COLAMD_ASSERT (score <= n_col) ;
+      Col [c].length = col_length ;
+      Col [c].shared2.score = score ;
+    }
+  }
+  COLAMD_DEBUG1 (("colamd: Dense, null, and newly-null columns killed: %d\n",
+		  n_col-n_col2)) ;
+
+  /* At this point, all empty rows and columns are dead.  All live columns */
+  /* are "clean" (containing no dead rows) and simplicial (no supercolumns */
+  /* yet).  Rows may contain dead columns, but all live rows contain at */
+  /* least one live column. */
+
+  /* === Initialize degree lists ========================================== */
+
+
+  /* clear the hash buckets */
+  for (c = 0 ; c <= n_col ; c++)
+  {
+    head [c] = COLAMD_EMPTY ;
+  }
+  min_score = n_col ;
+  /* place in reverse order, so low column indices are at the front */
+  /* of the lists.  This is to encourage natural tie-breaking */
+  for (c = n_col-1 ; c >= 0 ; c--)
+  {
+    /* only add principal columns to degree lists */
+    if (COL_IS_ALIVE (c))
+    {
+      COLAMD_DEBUG4 (("place %d score %d minscore %d ncol %d\n",
+		      c, Col [c].shared2.score, min_score, n_col)) ;
+
+      /* === Add columns score to DList =============================== */
+
+      score = Col [c].shared2.score ;
+
+      COLAMD_ASSERT (min_score >= 0) ;
+      COLAMD_ASSERT (min_score <= n_col) ;
+      COLAMD_ASSERT (score >= 0) ;
+      COLAMD_ASSERT (score <= n_col) ;
+      COLAMD_ASSERT (head [score] >= COLAMD_EMPTY) ;
+
+      /* now add this column to dList at proper score location */
+      next_col = head [score] ;
+      Col [c].shared3.prev = COLAMD_EMPTY ;
+      Col [c].shared4.degree_next = next_col ;
+
+      /* if there already was a column with the same score, set its */
+      /* previous pointer to this new column */
+      if (next_col != COLAMD_EMPTY)
+      {
+	Col [next_col].shared3.prev = c ;
+      }
+      head [score] = c ;
+
+      /* see if this score is less than current min */
+      min_score = COLAMD_MIN (min_score, score) ;
+
+
+    }
+  }
+
+
+  /* === Return number of remaining columns, and max row degree =========== */
+
+  *p_n_col2 = n_col2 ;
+  *p_n_row2 = n_row2 ;
+  *p_max_deg = max_deg ;
+}
+
+
+/* ========================================================================== */
+/* === find_ordering ======================================================== */
+/* ========================================================================== */
+
+/*
+  Order the principal columns of the supercolumn form of the matrix
+  (no supercolumns on input).  Uses a minimum approximate column minimum
+  degree ordering method.  Not user-callable.
+*/
+template <typename Index>
+static Index find_ordering /* return the number of garbage collections */
+  (
+    /* === Parameters ======================================================= */
+
+    Index n_row,      /* number of rows of A */
+    Index n_col,      /* number of columns of A */
+    Index Alen,     /* size of A, 2*nnz + n_col or larger */
+    Colamd_Row<Index> Row [],    /* of size n_row+1 */
+    colamd_col<Index> Col [],    /* of size n_col+1 */
+    Index A [],     /* column form and row form of A */
+    Index head [],    /* of size n_col+1 */
+    Index n_col2,     /* Remaining columns to order */
+    Index max_deg,    /* Maximum row degree */
+    Index pfree     /* index of first free slot (2*nnz on entry) */
+    )
+{
+  /* === Local variables ================================================== */
+
+  Index k ;     /* current pivot ordering step */
+  Index pivot_col ;   /* current pivot column */
+  Index *cp ;     /* a column pointer */
+  Index *rp ;     /* a row pointer */
+  Index pivot_row ;   /* current pivot row */
+  Index *new_cp ;   /* modified column pointer */
+  Index *new_rp ;   /* modified row pointer */
+  Index pivot_row_start ; /* pointer to start of pivot row */
+  Index pivot_row_degree ;  /* number of columns in pivot row */
+  Index pivot_row_length ;  /* number of supercolumns in pivot row */
+  Index pivot_col_score ; /* score of pivot column */
+  Index needed_memory ;   /* free space needed for pivot row */
+  Index *cp_end ;   /* pointer to the end of a column */
+  Index *rp_end ;   /* pointer to the end of a row */
+  Index row ;     /* a row index */
+  Index col ;     /* a column index */
+  Index max_score ;   /* maximum possible score */
+  Index cur_score ;   /* score of current column */
+  unsigned int hash ;   /* hash value for supernode detection */
+  Index head_column ;   /* head of hash bucket */
+  Index first_col ;   /* first column in hash bucket */
+  Index tag_mark ;    /* marker value for mark array */
+  Index row_mark ;    /* Row [row].shared2.mark */
+  Index set_difference ;  /* set difference size of row with pivot row */
+  Index min_score ;   /* smallest column score */
+  Index col_thickness ;   /* "thickness" (no. of columns in a supercol) */
+  Index max_mark ;    /* maximum value of tag_mark */
+  Index pivot_col_thickness ; /* number of columns represented by pivot col */
+  Index prev_col ;    /* Used by Dlist operations. */
+  Index next_col ;    /* Used by Dlist operations. */
+  Index ngarbage ;    /* number of garbage collections performed */
+
+
+  /* === Initialization and clear mark ==================================== */
+
+  max_mark = INT_MAX - n_col ;  /* INT_MAX defined in <limits.h> */
+  tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
+  min_score = 0 ;
+  ngarbage = 0 ;
+  COLAMD_DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ;
+
+  /* === Order the columns ================================================ */
+
+  for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */)
+  {
+
+    /* === Select pivot column, and order it ============================ */
+
+    /* make sure degree list isn't empty */
+    COLAMD_ASSERT (min_score >= 0) ;
+    COLAMD_ASSERT (min_score <= n_col) ;
+    COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ;
+
+    /* get pivot column from head of minimum degree list */
+    while (head [min_score] == COLAMD_EMPTY && min_score < n_col)
+    {
+      min_score++ ;
+    }
+    pivot_col = head [min_score] ;
+    COLAMD_ASSERT (pivot_col >= 0 && pivot_col <= n_col) ;
+    next_col = Col [pivot_col].shared4.degree_next ;
+    head [min_score] = next_col ;
+    if (next_col != COLAMD_EMPTY)
+    {
+      Col [next_col].shared3.prev = COLAMD_EMPTY ;
+    }
+
+    COLAMD_ASSERT (COL_IS_ALIVE (pivot_col)) ;
+    COLAMD_DEBUG3 (("Pivot col: %d\n", pivot_col)) ;
+
+    /* remember score for defrag check */
+    pivot_col_score = Col [pivot_col].shared2.score ;
+
+    /* the pivot column is the kth column in the pivot order */
+    Col [pivot_col].shared2.order = k ;
+
+    /* increment order count by column thickness */
+    pivot_col_thickness = Col [pivot_col].shared1.thickness ;
+    k += pivot_col_thickness ;
+    COLAMD_ASSERT (pivot_col_thickness > 0) ;
+
+    /* === Garbage_collection, if necessary ============================= */
+
+    needed_memory = COLAMD_MIN (pivot_col_score, n_col - k) ;
+    if (pfree + needed_memory >= Alen)
+    {
+      pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
+      ngarbage++ ;
+      /* after garbage collection we will have enough */
+      COLAMD_ASSERT (pfree + needed_memory < Alen) ;
+      /* garbage collection has wiped out the Row[].shared2.mark array */
+      tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
+
+    }
+
+    /* === Compute pivot row pattern ==================================== */
+
+    /* get starting location for this new merged row */
+    pivot_row_start = pfree ;
+
+    /* initialize new row counts to zero */
+    pivot_row_degree = 0 ;
+
+    /* tag pivot column as having been visited so it isn't included */
+    /* in merged pivot row */
+    Col [pivot_col].shared1.thickness = -pivot_col_thickness ;
+
+    /* pivot row is the union of all rows in the pivot column pattern */
+    cp = &A [Col [pivot_col].start] ;
+    cp_end = cp + Col [pivot_col].length ;
+    while (cp < cp_end)
+    {
+      /* get a row */
+      row = *cp++ ;
+      COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ;
+      /* skip if row is dead */
+      if (ROW_IS_DEAD (row))
+      {
+	continue ;
+      }
+      rp = &A [Row [row].start] ;
+      rp_end = rp + Row [row].length ;
+      while (rp < rp_end)
+      {
+	/* get a column */
+	col = *rp++ ;
+	/* add the column, if alive and untagged */
+	col_thickness = Col [col].shared1.thickness ;
+	if (col_thickness > 0 && COL_IS_ALIVE (col))
+	{
+	  /* tag column in pivot row */
+	  Col [col].shared1.thickness = -col_thickness ;
+	  COLAMD_ASSERT (pfree < Alen) ;
+	  /* place column in pivot row */
+	  A [pfree++] = col ;
+	  pivot_row_degree += col_thickness ;
+	}
+      }
+    }
+
+    /* clear tag on pivot column */
+    Col [pivot_col].shared1.thickness = pivot_col_thickness ;
+    max_deg = COLAMD_MAX (max_deg, pivot_row_degree) ;
+
+
+    /* === Kill all rows used to construct pivot row ==================== */
+
+    /* also kill pivot row, temporarily */
+    cp = &A [Col [pivot_col].start] ;
+    cp_end = cp + Col [pivot_col].length ;
+    while (cp < cp_end)
+    {
+      /* may be killing an already dead row */
+      row = *cp++ ;
+      COLAMD_DEBUG3 (("Kill row in pivot col: %d\n", row)) ;
+      KILL_ROW (row) ;
+    }
+
+    /* === Select a row index to use as the new pivot row =============== */
+
+    pivot_row_length = pfree - pivot_row_start ;
+    if (pivot_row_length > 0)
+    {
+      /* pick the "pivot" row arbitrarily (first row in col) */
+      pivot_row = A [Col [pivot_col].start] ;
+      COLAMD_DEBUG3 (("Pivotal row is %d\n", pivot_row)) ;
+    }
+    else
+    {
+      /* there is no pivot row, since it is of zero length */
+      pivot_row = COLAMD_EMPTY ;
+      COLAMD_ASSERT (pivot_row_length == 0) ;
+    }
+    COLAMD_ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ;
+
+    /* === Approximate degree computation =============================== */
+
+    /* Here begins the computation of the approximate degree.  The column */
+    /* score is the sum of the pivot row "length", plus the size of the */
+    /* set differences of each row in the column minus the pattern of the */
+    /* pivot row itself.  The column ("thickness") itself is also */
+    /* excluded from the column score (we thus use an approximate */
+    /* external degree). */
+
+    /* The time taken by the following code (compute set differences, and */
+    /* add them up) is proportional to the size of the data structure */
+    /* being scanned - that is, the sum of the sizes of each column in */
+    /* the pivot row.  Thus, the amortized time to compute a column score */
+    /* is proportional to the size of that column (where size, in this */
+    /* context, is the column "length", or the number of row indices */
+    /* in that column).  The number of row indices in a column is */
+    /* monotonically non-decreasing, from the length of the original */
+    /* column on input to colamd. */
+
+    /* === Compute set differences ====================================== */
+
+    COLAMD_DEBUG3 (("** Computing set differences phase. **\n")) ;
+
+    /* pivot row is currently dead - it will be revived later. */
+
+    COLAMD_DEBUG3 (("Pivot row: ")) ;
+    /* for each column in pivot row */
+    rp = &A [pivot_row_start] ;
+    rp_end = rp + pivot_row_length ;
+    while (rp < rp_end)
+    {
+      col = *rp++ ;
+      COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
+      COLAMD_DEBUG3 (("Col: %d\n", col)) ;
+
+      /* clear tags used to construct pivot row pattern */
+      col_thickness = -Col [col].shared1.thickness ;
+      COLAMD_ASSERT (col_thickness > 0) ;
+      Col [col].shared1.thickness = col_thickness ;
+
+      /* === Remove column from degree list =========================== */
+
+      cur_score = Col [col].shared2.score ;
+      prev_col = Col [col].shared3.prev ;
+      next_col = Col [col].shared4.degree_next ;
+      COLAMD_ASSERT (cur_score >= 0) ;
+      COLAMD_ASSERT (cur_score <= n_col) ;
+      COLAMD_ASSERT (cur_score >= COLAMD_EMPTY) ;
+      if (prev_col == COLAMD_EMPTY)
+      {
+	head [cur_score] = next_col ;
+      }
+      else
+      {
+	Col [prev_col].shared4.degree_next = next_col ;
+      }
+      if (next_col != COLAMD_EMPTY)
+      {
+	Col [next_col].shared3.prev = prev_col ;
+      }
+
+      /* === Scan the column ========================================== */
+
+      cp = &A [Col [col].start] ;
+      cp_end = cp + Col [col].length ;
+      while (cp < cp_end)
+      {
+	/* get a row */
+	row = *cp++ ;
+	row_mark = Row [row].shared2.mark ;
+	/* skip if dead */
+	if (ROW_IS_MARKED_DEAD (row_mark))
+	{
+	  continue ;
+	}
+	COLAMD_ASSERT (row != pivot_row) ;
+	set_difference = row_mark - tag_mark ;
+	/* check if the row has been seen yet */
+	if (set_difference < 0)
+	{
+	  COLAMD_ASSERT (Row [row].shared1.degree <= max_deg) ;
+	  set_difference = Row [row].shared1.degree ;
+	}
+	/* subtract column thickness from this row's set difference */
+	set_difference -= col_thickness ;
+	COLAMD_ASSERT (set_difference >= 0) ;
+	/* absorb this row if the set difference becomes zero */
+	if (set_difference == 0)
+	{
+	  COLAMD_DEBUG3 (("aggressive absorption. Row: %d\n", row)) ;
+	  KILL_ROW (row) ;
+	}
+	else
+	{
+	  /* save the new mark */
+	  Row [row].shared2.mark = set_difference + tag_mark ;
+	}
+      }
+    }
+
+
+    /* === Add up set differences for each column ======================= */
+
+    COLAMD_DEBUG3 (("** Adding set differences phase. **\n")) ;
+
+    /* for each column in pivot row */
+    rp = &A [pivot_row_start] ;
+    rp_end = rp + pivot_row_length ;
+    while (rp < rp_end)
+    {
+      /* get a column */
+      col = *rp++ ;
+      COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
+      hash = 0 ;
+      cur_score = 0 ;
+      cp = &A [Col [col].start] ;
+      /* compact the column */
+      new_cp = cp ;
+      cp_end = cp + Col [col].length ;
+
+      COLAMD_DEBUG4 (("Adding set diffs for Col: %d.\n", col)) ;
+
+      while (cp < cp_end)
+      {
+	/* get a row */
+	row = *cp++ ;
+	COLAMD_ASSERT(row >= 0 && row < n_row) ;
+	row_mark = Row [row].shared2.mark ;
+	/* skip if dead */
+	if (ROW_IS_MARKED_DEAD (row_mark))
+	{
+	  continue ;
+	}
+	COLAMD_ASSERT (row_mark > tag_mark) ;
+	/* compact the column */
+	*new_cp++ = row ;
+	/* compute hash function */
+	hash += row ;
+	/* add set difference */
+	cur_score += row_mark - tag_mark ;
+	/* integer overflow... */
+	cur_score = COLAMD_MIN (cur_score, n_col) ;
+      }
+
+      /* recompute the column's length */
+      Col [col].length = (Index) (new_cp - &A [Col [col].start]) ;
+
+      /* === Further mass elimination ================================= */
+
+      if (Col [col].length == 0)
+      {
+	COLAMD_DEBUG4 (("further mass elimination. Col: %d\n", col)) ;
+	/* nothing left but the pivot row in this column */
+	KILL_PRINCIPAL_COL (col) ;
+	pivot_row_degree -= Col [col].shared1.thickness ;
+	COLAMD_ASSERT (pivot_row_degree >= 0) ;
+	/* order it */
+	Col [col].shared2.order = k ;
+	/* increment order count by column thickness */
+	k += Col [col].shared1.thickness ;
+      }
+      else
+      {
+	/* === Prepare for supercolumn detection ==================== */
+
+	COLAMD_DEBUG4 (("Preparing supercol detection for Col: %d.\n", col)) ;
+
+	/* save score so far */
+	Col [col].shared2.score = cur_score ;
+
+	/* add column to hash table, for supercolumn detection */
+	hash %= n_col + 1 ;
+
+	COLAMD_DEBUG4 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ;
+	COLAMD_ASSERT (hash <= n_col) ;
+
+	head_column = head [hash] ;
+	if (head_column > COLAMD_EMPTY)
+	{
+	  /* degree list "hash" is non-empty, use prev (shared3) of */
+	  /* first column in degree list as head of hash bucket */
+	  first_col = Col [head_column].shared3.headhash ;
+	  Col [head_column].shared3.headhash = col ;
+	}
+	else
+	{
+	  /* degree list "hash" is empty, use head as hash bucket */
+	  first_col = - (head_column + 2) ;
+	  head [hash] = - (col + 2) ;
+	}
+	Col [col].shared4.hash_next = first_col ;
+
+	/* save hash function in Col [col].shared3.hash */
+	Col [col].shared3.hash = (Index) hash ;
+	COLAMD_ASSERT (COL_IS_ALIVE (col)) ;
+      }
+    }
+
+    /* The approximate external column degree is now computed.  */
+
+    /* === Supercolumn detection ======================================== */
+
+    COLAMD_DEBUG3 (("** Supercolumn detection phase. **\n")) ;
+
+    Eigen::internal::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ;
+
+    /* === Kill the pivotal column ====================================== */
+
+    KILL_PRINCIPAL_COL (pivot_col) ;
+
+    /* === Clear mark =================================================== */
+
+    tag_mark += (max_deg + 1) ;
+    if (tag_mark >= max_mark)
+    {
+      COLAMD_DEBUG2 (("clearing tag_mark\n")) ;
+      tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
+    }
+
+    /* === Finalize the new pivot row, and column scores ================ */
+
+    COLAMD_DEBUG3 (("** Finalize scores phase. **\n")) ;
+
+    /* for each column in pivot row */
+    rp = &A [pivot_row_start] ;
+    /* compact the pivot row */
+    new_rp = rp ;
+    rp_end = rp + pivot_row_length ;
+    while (rp < rp_end)
+    {
+      col = *rp++ ;
+      /* skip dead columns */
+      if (COL_IS_DEAD (col))
+      {
+	continue ;
+      }
+      *new_rp++ = col ;
+      /* add new pivot row to column */
+      A [Col [col].start + (Col [col].length++)] = pivot_row ;
+
+      /* retrieve score so far and add on pivot row's degree. */
+      /* (we wait until here for this in case the pivot */
+      /* row's degree was reduced due to mass elimination). */
+      cur_score = Col [col].shared2.score + pivot_row_degree ;
+
+      /* calculate the max possible score as the number of */
+      /* external columns minus the 'k' value minus the */
+      /* columns thickness */
+      max_score = n_col - k - Col [col].shared1.thickness ;
+
+      /* make the score the external degree of the union-of-rows */
+      cur_score -= Col [col].shared1.thickness ;
+
+      /* make sure score is less or equal than the max score */
+      cur_score = COLAMD_MIN (cur_score, max_score) ;
+      COLAMD_ASSERT (cur_score >= 0) ;
+
+      /* store updated score */
+      Col [col].shared2.score = cur_score ;
+
+      /* === Place column back in degree list ========================= */
+
+      COLAMD_ASSERT (min_score >= 0) ;
+      COLAMD_ASSERT (min_score <= n_col) ;
+      COLAMD_ASSERT (cur_score >= 0) ;
+      COLAMD_ASSERT (cur_score <= n_col) ;
+      COLAMD_ASSERT (head [cur_score] >= COLAMD_EMPTY) ;
+      next_col = head [cur_score] ;
+      Col [col].shared4.degree_next = next_col ;
+      Col [col].shared3.prev = COLAMD_EMPTY ;
+      if (next_col != COLAMD_EMPTY)
+      {
+	Col [next_col].shared3.prev = col ;
+      }
+      head [cur_score] = col ;
+
+      /* see if this score is less than current min */
+      min_score = COLAMD_MIN (min_score, cur_score) ;
+
+    }
+
+    /* === Resurrect the new pivot row ================================== */
+
+    if (pivot_row_degree > 0)
+    {
+      /* update pivot row length to reflect any cols that were killed */
+      /* during super-col detection and mass elimination */
+      Row [pivot_row].start  = pivot_row_start ;
+      Row [pivot_row].length = (Index) (new_rp - &A[pivot_row_start]) ;
+      Row [pivot_row].shared1.degree = pivot_row_degree ;
+      Row [pivot_row].shared2.mark = 0 ;
+      /* pivot row is no longer dead */
+    }
+  }
+
+  /* === All principal columns have now been ordered ====================== */
+
+  return (ngarbage) ;
+}
+
+
+/* ========================================================================== */
+/* === order_children ======================================================= */
+/* ========================================================================== */
+
+/*
+  The find_ordering routine has ordered all of the principal columns (the
+  representatives of the supercolumns).  The non-principal columns have not
+  yet been ordered.  This routine orders those columns by walking up the
+  parent tree (a column is a child of the column which absorbed it).  The
+  final permutation vector is then placed in p [0 ... n_col-1], with p [0]
+  being the first column, and p [n_col-1] being the last.  It doesn't look
+  like it at first glance, but be assured that this routine takes time linear
+  in the number of columns.  Although not immediately obvious, the time
+  taken by this routine is O (n_col), that is, linear in the number of
+  columns.  Not user-callable.
+*/
+template <typename Index>
+static inline  void order_children
+(
+  /* === Parameters ======================================================= */
+
+  Index n_col,      /* number of columns of A */
+  colamd_col<Index> Col [],    /* of size n_col+1 */
+  Index p []      /* p [0 ... n_col-1] is the column permutation*/
+  )
+{
+  /* === Local variables ================================================== */
+
+  Index i ;     /* loop counter for all columns */
+  Index c ;     /* column index */
+  Index parent ;    /* index of column's parent */
+  Index order ;     /* column's order */
+
+  /* === Order each non-principal column ================================== */
+
+  for (i = 0 ; i < n_col ; i++)
+  {
+    /* find an un-ordered non-principal column */
+    COLAMD_ASSERT (COL_IS_DEAD (i)) ;
+    if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == COLAMD_EMPTY)
+    {
+      parent = i ;
+      /* once found, find its principal parent */
+      do
+      {
+	parent = Col [parent].shared1.parent ;
+      } while (!COL_IS_DEAD_PRINCIPAL (parent)) ;
+
+      /* now, order all un-ordered non-principal columns along path */
+      /* to this parent.  collapse tree at the same time */
+      c = i ;
+      /* get order of parent */
+      order = Col [parent].shared2.order ;
+
+      do
+      {
+	COLAMD_ASSERT (Col [c].shared2.order == COLAMD_EMPTY) ;
+
+	/* order this column */
+	Col [c].shared2.order = order++ ;
+	/* collaps tree */
+	Col [c].shared1.parent = parent ;
+
+	/* get immediate parent of this column */
+	c = Col [c].shared1.parent ;
+
+	/* continue until we hit an ordered column.  There are */
+	/* guarranteed not to be anymore unordered columns */
+	/* above an ordered column */
+      } while (Col [c].shared2.order == COLAMD_EMPTY) ;
+
+      /* re-order the super_col parent to largest order for this group */
+      Col [parent].shared2.order = order ;
+    }
+  }
+
+  /* === Generate the permutation ========================================= */
+
+  for (c = 0 ; c < n_col ; c++)
+  {
+    p [Col [c].shared2.order] = c ;
+  }
+}
+
+
+/* ========================================================================== */
+/* === detect_super_cols ==================================================== */
+/* ========================================================================== */
+
+/*
+  Detects supercolumns by finding matches between columns in the hash buckets.
+  Check amongst columns in the set A [row_start ... row_start + row_length-1].
+  The columns under consideration are currently *not* in the degree lists,
+  and have already been placed in the hash buckets.
+
+  The hash bucket for columns whose hash function is equal to h is stored
+  as follows:
+
+  if head [h] is >= 0, then head [h] contains a degree list, so:
+
+  head [h] is the first column in degree bucket h.
+  Col [head [h]].headhash gives the first column in hash bucket h.
+
+  otherwise, the degree list is empty, and:
+
+  -(head [h] + 2) is the first column in hash bucket h.
+
+  For a column c in a hash bucket, Col [c].shared3.prev is NOT a "previous
+  column" pointer.  Col [c].shared3.hash is used instead as the hash number
+  for that column.  The value of Col [c].shared4.hash_next is the next column
+  in the same hash bucket.
+
+  Assuming no, or "few" hash collisions, the time taken by this routine is
+  linear in the sum of the sizes (lengths) of each column whose score has
+  just been computed in the approximate degree computation.
+  Not user-callable.
+*/
+template <typename Index>
+static void detect_super_cols
+(
+  /* === Parameters ======================================================= */
+  
+  colamd_col<Index> Col [],    /* of size n_col+1 */
+  Index A [],     /* row indices of A */
+  Index head [],    /* head of degree lists and hash buckets */
+  Index row_start,    /* pointer to set of columns to check */
+  Index row_length    /* number of columns to check */
+)
+{
+  /* === Local variables ================================================== */
+
+  Index hash ;      /* hash value for a column */
+  Index *rp ;     /* pointer to a row */
+  Index c ;     /* a column index */
+  Index super_c ;   /* column index of the column to absorb into */
+  Index *cp1 ;      /* column pointer for column super_c */
+  Index *cp2 ;      /* column pointer for column c */
+  Index length ;    /* length of column super_c */
+  Index prev_c ;    /* column preceding c in hash bucket */
+  Index i ;     /* loop counter */
+  Index *rp_end ;   /* pointer to the end of the row */
+  Index col ;     /* a column index in the row to check */
+  Index head_column ;   /* first column in hash bucket or degree list */
+  Index first_col ;   /* first column in hash bucket */
+
+  /* === Consider each column in the row ================================== */
+
+  rp = &A [row_start] ;
+  rp_end = rp + row_length ;
+  while (rp < rp_end)
+  {
+    col = *rp++ ;
+    if (COL_IS_DEAD (col))
+    {
+      continue ;
+    }
+
+    /* get hash number for this column */
+    hash = Col [col].shared3.hash ;
+    COLAMD_ASSERT (hash <= n_col) ;
+
+    /* === Get the first column in this hash bucket ===================== */
+
+    head_column = head [hash] ;
+    if (head_column > COLAMD_EMPTY)
+    {
+      first_col = Col [head_column].shared3.headhash ;
+    }
+    else
+    {
+      first_col = - (head_column + 2) ;
+    }
+
+    /* === Consider each column in the hash bucket ====================== */
+
+    for (super_c = first_col ; super_c != COLAMD_EMPTY ;
+	 super_c = Col [super_c].shared4.hash_next)
+    {
+      COLAMD_ASSERT (COL_IS_ALIVE (super_c)) ;
+      COLAMD_ASSERT (Col [super_c].shared3.hash == hash) ;
+      length = Col [super_c].length ;
+
+      /* prev_c is the column preceding column c in the hash bucket */
+      prev_c = super_c ;
+
+      /* === Compare super_c with all columns after it ================ */
+
+      for (c = Col [super_c].shared4.hash_next ;
+	   c != COLAMD_EMPTY ; c = Col [c].shared4.hash_next)
+      {
+	COLAMD_ASSERT (c != super_c) ;
+	COLAMD_ASSERT (COL_IS_ALIVE (c)) ;
+	COLAMD_ASSERT (Col [c].shared3.hash == hash) ;
+
+	/* not identical if lengths or scores are different */
+	if (Col [c].length != length ||
+	    Col [c].shared2.score != Col [super_c].shared2.score)
+	{
+	  prev_c = c ;
+	  continue ;
+	}
+
+	/* compare the two columns */
+	cp1 = &A [Col [super_c].start] ;
+	cp2 = &A [Col [c].start] ;
+
+	for (i = 0 ; i < length ; i++)
+	{
+	  /* the columns are "clean" (no dead rows) */
+	  COLAMD_ASSERT (ROW_IS_ALIVE (*cp1))  ;
+	  COLAMD_ASSERT (ROW_IS_ALIVE (*cp2))  ;
+	  /* row indices will same order for both supercols, */
+	  /* no gather scatter nessasary */
+	  if (*cp1++ != *cp2++)
+	  {
+	    break ;
+	  }
+	}
+
+	/* the two columns are different if the for-loop "broke" */
+	if (i != length)
+	{
+	  prev_c = c ;
+	  continue ;
+	}
+
+	/* === Got it!  two columns are identical =================== */
+
+	COLAMD_ASSERT (Col [c].shared2.score == Col [super_c].shared2.score) ;
+
+	Col [super_c].shared1.thickness += Col [c].shared1.thickness ;
+	Col [c].shared1.parent = super_c ;
+	KILL_NON_PRINCIPAL_COL (c) ;
+	/* order c later, in order_children() */
+	Col [c].shared2.order = COLAMD_EMPTY ;
+	/* remove c from hash bucket */
+	Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ;
+      }
+    }
+
+    /* === Empty this hash bucket ======================================= */
+
+    if (head_column > COLAMD_EMPTY)
+    {
+      /* corresponding degree list "hash" is not empty */
+      Col [head_column].shared3.headhash = COLAMD_EMPTY ;
+    }
+    else
+    {
+      /* corresponding degree list "hash" is empty */
+      head [hash] = COLAMD_EMPTY ;
+    }
+  }
+}
+
+
+/* ========================================================================== */
+/* === garbage_collection =================================================== */
+/* ========================================================================== */
+
+/*
+  Defragments and compacts columns and rows in the workspace A.  Used when
+  all avaliable memory has been used while performing row merging.  Returns
+  the index of the first free position in A, after garbage collection.  The
+  time taken by this routine is linear is the size of the array A, which is
+  itself linear in the number of nonzeros in the input matrix.
+  Not user-callable.
+*/
+template <typename Index>
+static Index garbage_collection  /* returns the new value of pfree */
+  (
+    /* === Parameters ======================================================= */
+    
+    Index n_row,      /* number of rows */
+    Index n_col,      /* number of columns */
+    Colamd_Row<Index> Row [],    /* row info */
+    colamd_col<Index> Col [],    /* column info */
+    Index A [],     /* A [0 ... Alen-1] holds the matrix */
+    Index *pfree      /* &A [0] ... pfree is in use */
+    )
+{
+  /* === Local variables ================================================== */
+
+  Index *psrc ;     /* source pointer */
+  Index *pdest ;    /* destination pointer */
+  Index j ;     /* counter */
+  Index r ;     /* a row index */
+  Index c ;     /* a column index */
+  Index length ;    /* length of a row or column */
+
+  /* === Defragment the columns =========================================== */
+
+  pdest = &A[0] ;
+  for (c = 0 ; c < n_col ; c++)
+  {
+    if (COL_IS_ALIVE (c))
+    {
+      psrc = &A [Col [c].start] ;
+
+      /* move and compact the column */
+      COLAMD_ASSERT (pdest <= psrc) ;
+      Col [c].start = (Index) (pdest - &A [0]) ;
+      length = Col [c].length ;
+      for (j = 0 ; j < length ; j++)
+      {
+	r = *psrc++ ;
+	if (ROW_IS_ALIVE (r))
+	{
+	  *pdest++ = r ;
+	}
+      }
+      Col [c].length = (Index) (pdest - &A [Col [c].start]) ;
+    }
+  }
+
+  /* === Prepare to defragment the rows =================================== */
+
+  for (r = 0 ; r < n_row ; r++)
+  {
+    if (ROW_IS_ALIVE (r))
+    {
+      if (Row [r].length == 0)
+      {
+	/* this row is of zero length.  cannot compact it, so kill it */
+	COLAMD_DEBUG3 (("Defrag row kill\n")) ;
+	KILL_ROW (r) ;
+      }
+      else
+      {
+	/* save first column index in Row [r].shared2.first_column */
+	psrc = &A [Row [r].start] ;
+	Row [r].shared2.first_column = *psrc ;
+	COLAMD_ASSERT (ROW_IS_ALIVE (r)) ;
+	/* flag the start of the row with the one's complement of row */
+	*psrc = ONES_COMPLEMENT (r) ;
+
+      }
+    }
+  }
+
+  /* === Defragment the rows ============================================== */
+
+  psrc = pdest ;
+  while (psrc < pfree)
+  {
+    /* find a negative number ... the start of a row */
+    if (*psrc++ < 0)
+    {
+      psrc-- ;
+      /* get the row index */
+      r = ONES_COMPLEMENT (*psrc) ;
+      COLAMD_ASSERT (r >= 0 && r < n_row) ;
+      /* restore first column index */
+      *psrc = Row [r].shared2.first_column ;
+      COLAMD_ASSERT (ROW_IS_ALIVE (r)) ;
+
+      /* move and compact the row */
+      COLAMD_ASSERT (pdest <= psrc) ;
+      Row [r].start = (Index) (pdest - &A [0]) ;
+      length = Row [r].length ;
+      for (j = 0 ; j < length ; j++)
+      {
+	c = *psrc++ ;
+	if (COL_IS_ALIVE (c))
+	{
+	  *pdest++ = c ;
+	}
+      }
+      Row [r].length = (Index) (pdest - &A [Row [r].start]) ;
+
+    }
+  }
+  /* ensure we found all the rows */
+  COLAMD_ASSERT (debug_rows == 0) ;
+
+  /* === Return the new value of pfree ==================================== */
+
+  return ((Index) (pdest - &A [0])) ;
+}
+
+
+/* ========================================================================== */
+/* === clear_mark =========================================================== */
+/* ========================================================================== */
+
+/*
+  Clears the Row [].shared2.mark array, and returns the new tag_mark.
+  Return value is the new tag_mark.  Not user-callable.
+*/
+template <typename Index>
+static inline  Index clear_mark  /* return the new value for tag_mark */
+  (
+      /* === Parameters ======================================================= */
+
+    Index n_row,    /* number of rows in A */
+    Colamd_Row<Index> Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */
+    )
+{
+  /* === Local variables ================================================== */
+
+  Index r ;
+
+  for (r = 0 ; r < n_row ; r++)
+  {
+    if (ROW_IS_ALIVE (r))
+    {
+      Row [r].shared2.mark = 0 ;
+    }
+  }
+  return (1) ;
+}
+
+
+} // namespace internal 
+#endif
diff --git a/include/eigen3/Eigen/src/OrderingMethods/Ordering.h b/include/eigen3/Eigen/src/OrderingMethods/Ordering.h
new file mode 100644
index 0000000..f3c31f9
--- /dev/null
+++ b/include/eigen3/Eigen/src/OrderingMethods/Ordering.h
@@ -0,0 +1,154 @@
+ 
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012  Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ORDERING_H
+#define EIGEN_ORDERING_H
+
+namespace Eigen {
+  
+#include "Eigen_Colamd.h"
+
+namespace internal {
+    
+/** \internal
+  * \ingroup OrderingMethods_Module
+  * \returns the symmetric pattern A^T+A from the input matrix A. 
+  * FIXME: The values should not be considered here
+  */
+template<typename MatrixType> 
+void ordering_helper_at_plus_a(const MatrixType& mat, MatrixType& symmat)
+{
+  MatrixType C;
+  C = mat.transpose(); // NOTE: Could be  costly
+  for (int i = 0; i < C.rows(); i++) 
+  {
+      for (typename MatrixType::InnerIterator it(C, i); it; ++it)
+        it.valueRef() = 0.0;
+  }
+  symmat = C + mat; 
+}
+    
+}
+
+#ifndef EIGEN_MPL2_ONLY
+
+/** \ingroup OrderingMethods_Module
+  * \class AMDOrdering
+  *
+  * Functor computing the \em approximate \em minimum \em degree ordering
+  * If the matrix is not structurally symmetric, an ordering of A^T+A is computed
+  * \tparam  Index The type of indices of the matrix 
+  * \sa COLAMDOrdering
+  */
+template <typename Index>
+class AMDOrdering
+{
+  public:
+    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+    
+    /** Compute the permutation vector from a sparse matrix
+     * This routine is much faster if the input matrix is column-major     
+     */
+    template <typename MatrixType>
+    void operator()(const MatrixType& mat, PermutationType& perm)
+    {
+      // Compute the symmetric pattern
+      SparseMatrix<typename MatrixType::Scalar, ColMajor, Index> symm;
+      internal::ordering_helper_at_plus_a(mat,symm); 
+    
+      // Call the AMD routine 
+      //m_mat.prune(keep_diag());
+      internal::minimum_degree_ordering(symm, perm);
+    }
+    
+    /** Compute the permutation with a selfadjoint matrix */
+    template <typename SrcType, unsigned int SrcUpLo> 
+    void operator()(const SparseSelfAdjointView<SrcType, SrcUpLo>& mat, PermutationType& perm)
+    { 
+      SparseMatrix<typename SrcType::Scalar, ColMajor, Index> C; C = mat;
+      
+      // Call the AMD routine 
+      // m_mat.prune(keep_diag()); //Remove the diagonal elements 
+      internal::minimum_degree_ordering(C, perm);
+    }
+};
+
+#endif // EIGEN_MPL2_ONLY
+
+/** \ingroup OrderingMethods_Module
+  * \class NaturalOrdering
+  *
+  * Functor computing the natural ordering (identity)
+  * 
+  * \note Returns an empty permutation matrix
+  * \tparam  Index The type of indices of the matrix 
+  */
+template <typename Index>
+class NaturalOrdering
+{
+  public:
+    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+    
+    /** Compute the permutation vector from a column-major sparse matrix */
+    template <typename MatrixType>
+    void operator()(const MatrixType& /*mat*/, PermutationType& perm)
+    {
+      perm.resize(0); 
+    }
+    
+};
+
+/** \ingroup OrderingMethods_Module
+  * \class COLAMDOrdering
+  *
+  * Functor computing the \em column \em approximate \em minimum \em degree ordering 
+  * The matrix should be in column-major and \b compressed format (see SparseMatrix::makeCompressed()).
+  */
+template<typename Index>
+class COLAMDOrdering
+{
+  public:
+    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType; 
+    typedef Matrix<Index, Dynamic, 1> IndexVector;
+    
+    /** Compute the permutation vector \a perm form the sparse matrix \a mat
+      * \warning The input sparse matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+      */
+    template <typename MatrixType>
+    void operator() (const MatrixType& mat, PermutationType& perm)
+    {
+      eigen_assert(mat.isCompressed() && "COLAMDOrdering requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to COLAMDOrdering");
+      
+      Index m = mat.rows();
+      Index n = mat.cols();
+      Index nnz = mat.nonZeros();
+      // Get the recommended value of Alen to be used by colamd
+      Index Alen = internal::colamd_recommended(nnz, m, n); 
+      // Set the default parameters
+      double knobs [COLAMD_KNOBS]; 
+      Index stats [COLAMD_STATS];
+      internal::colamd_set_defaults(knobs);
+      
+      IndexVector p(n+1), A(Alen); 
+      for(Index i=0; i <= n; i++)   p(i) = mat.outerIndexPtr()[i];
+      for(Index i=0; i < nnz; i++)  A(i) = mat.innerIndexPtr()[i];
+      // Call Colamd routine to compute the ordering 
+      Index info = internal::colamd(m, n, Alen, A.data(), p.data(), knobs, stats); 
+      EIGEN_UNUSED_VARIABLE(info);
+      eigen_assert( info && "COLAMD failed " );
+      
+      perm.resize(n);
+      for (Index i = 0; i < n; i++) perm.indices()(p(i)) = i;
+    }
+};
+
+} // end namespace Eigen
+
+#endif
diff --git a/include/eigen3/Eigen/src/PaStiXSupport/PaStiXSupport.h b/include/eigen3/Eigen/src/PaStiXSupport/PaStiXSupport.h
new file mode 100644
index 0000000..a955287
--- /dev/null
+++ b/include/eigen3/Eigen/src/PaStiXSupport/PaStiXSupport.h
@@ -0,0 +1,721 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PASTIXSUPPORT_H
+#define EIGEN_PASTIXSUPPORT_H
+
+namespace Eigen { 
+
+/** \ingroup PaStiXSupport_Module
+  * \brief Interface to the PaStix solver
+  * 
+  * This class is used to solve the linear systems A.X = B via the PaStix library. 
+  * The matrix can be either real or complex, symmetric or not.
+  *
+  * \sa TutorialSparseDirectSolvers
+  */
+template<typename _MatrixType, bool IsStrSym = false> class PastixLU;
+template<typename _MatrixType, int Options> class PastixLLT;
+template<typename _MatrixType, int Options> class PastixLDLT;
+
+namespace internal
+{
+    
+  template<class Pastix> struct pastix_traits;
+
+  template<typename _MatrixType>
+  struct pastix_traits< PastixLU<_MatrixType> >
+  {
+    typedef _MatrixType MatrixType;
+    typedef typename _MatrixType::Scalar Scalar;
+    typedef typename _MatrixType::RealScalar RealScalar;
+    typedef typename _MatrixType::Index Index;
+  };
+
+  template<typename _MatrixType, int Options>
+  struct pastix_traits< PastixLLT<_MatrixType,Options> >
+  {
+    typedef _MatrixType MatrixType;
+    typedef typename _MatrixType::Scalar Scalar;
+    typedef typename _MatrixType::RealScalar RealScalar;
+    typedef typename _MatrixType::Index Index;
+  };
+
+  template<typename _MatrixType, int Options>
+  struct pastix_traits< PastixLDLT<_MatrixType,Options> >
+  {
+    typedef _MatrixType MatrixType;
+    typedef typename _MatrixType::Scalar Scalar;
+    typedef typename _MatrixType::RealScalar RealScalar;
+    typedef typename _MatrixType::Index Index;
+  };
+  
+  void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm)
+  {
+    if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
+    if (nbrhs == 0) {x = NULL; nbrhs=1;}
+    s_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); 
+  }
+  
+  void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm)
+  {
+    if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
+    if (nbrhs == 0) {x = NULL; nbrhs=1;}
+    d_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); 
+  }
+  
+  void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<float> *vals, int *perm, int * invp, std::complex<float> *x, int nbrhs, int *iparm, double *dparm)
+  {
+    if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
+    if (nbrhs == 0) {x = NULL; nbrhs=1;}
+    c_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<COMPLEX*>(vals), perm, invp, reinterpret_cast<COMPLEX*>(x), nbrhs, iparm, dparm); 
+  }
+  
+  void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<double> *vals, int *perm, int * invp, std::complex<double> *x, int nbrhs, int *iparm, double *dparm)
+  {
+    if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
+    if (nbrhs == 0) {x = NULL; nbrhs=1;}
+    z_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<DCOMPLEX*>(vals), perm, invp, reinterpret_cast<DCOMPLEX*>(x), nbrhs, iparm, dparm); 
+  }
+
+  // Convert the matrix  to Fortran-style Numbering
+  template <typename MatrixType>
+  void c_to_fortran_numbering (MatrixType& mat)
+  {
+    if ( !(mat.outerIndexPtr()[0]) ) 
+    { 
+      int i;
+      for(i = 0; i <= mat.rows(); ++i)
+        ++mat.outerIndexPtr()[i];
+      for(i = 0; i < mat.nonZeros(); ++i)
+        ++mat.innerIndexPtr()[i];
+    }
+  }
+  
+  // Convert to C-style Numbering
+  template <typename MatrixType>
+  void fortran_to_c_numbering (MatrixType& mat)
+  {
+    // Check the Numbering
+    if ( mat.outerIndexPtr()[0] == 1 ) 
+    { // Convert to C-style numbering
+      int i;
+      for(i = 0; i <= mat.rows(); ++i)
+        --mat.outerIndexPtr()[i];
+      for(i = 0; i < mat.nonZeros(); ++i)
+        --mat.innerIndexPtr()[i];
+    }
+  }
+}
+
+// This is the base class to interface with PaStiX functions. 
+// Users should not used this class directly. 
+template <class Derived>
+class PastixBase : internal::noncopyable
+{
+  public:
+    typedef typename internal::pastix_traits<Derived>::MatrixType _MatrixType;
+    typedef _MatrixType MatrixType;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+    typedef Matrix<Scalar,Dynamic,1> Vector;
+    typedef SparseMatrix<Scalar, ColMajor> ColSpMatrix;
+    
+  public:
+    
+    PastixBase() : m_initisOk(false), m_analysisIsOk(false), m_factorizationIsOk(false), m_isInitialized(false), m_pastixdata(0), m_size(0)
+    {
+      init();
+    }
+    
+    ~PastixBase() 
+    {
+      clean();
+    }
+
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<PastixBase, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "Pastix solver is not initialized.");
+      eigen_assert(rows()==b.rows()
+                && "PastixBase::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::solve_retval<PastixBase, Rhs>(*this, b.derived());
+    }
+    
+    template<typename Rhs,typename Dest>
+    bool _solve (const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const;
+    
+    Derived& derived()
+    {
+      return *static_cast<Derived*>(this);
+    }
+    const Derived& derived() const
+    {
+      return *static_cast<const Derived*>(this);
+    }
+
+    /** Returns a reference to the integer vector IPARM of PaStiX parameters
+      * to modify the default parameters. 
+      * The statistics related to the different phases of factorization and solve are saved here as well
+      * \sa analyzePattern() factorize()
+      */
+    Array<Index,IPARM_SIZE,1>& iparm()
+    {
+      return m_iparm; 
+    }
+    
+    /** Return a reference to a particular index parameter of the IPARM vector 
+     * \sa iparm()
+     */
+    
+    int& iparm(int idxparam)
+    {
+      return m_iparm(idxparam);
+    }
+    
+     /** Returns a reference to the double vector DPARM of PaStiX parameters 
+      * The statistics related to the different phases of factorization and solve are saved here as well
+      * \sa analyzePattern() factorize()
+      */
+    Array<RealScalar,IPARM_SIZE,1>& dparm()
+    {
+      return m_dparm; 
+    }
+    
+    
+    /** Return a reference to a particular index parameter of the DPARM vector 
+     * \sa dparm()
+     */
+    double& dparm(int idxparam)
+    {
+      return m_dparm(idxparam);
+    }
+    
+    inline Index cols() const { return m_size; }
+    inline Index rows() const { return m_size; }
+    
+     /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful,
+      *          \c NumericalIssue if the PaStiX reports a problem
+      *          \c InvalidInput if the input matrix is invalid
+      *
+      * \sa iparm()          
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return m_info;
+    }
+    
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::sparse_solve_retval<PastixBase, Rhs>
+    solve(const SparseMatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "Pastix LU, LLT or LDLT is not initialized.");
+      eigen_assert(rows()==b.rows()
+                && "PastixBase::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::sparse_solve_retval<PastixBase, Rhs>(*this, b.derived());
+    }
+    
+  protected:
+
+    // Initialize the Pastix data structure, check the matrix
+    void init(); 
+    
+    // Compute the ordering and the symbolic factorization
+    void analyzePattern(ColSpMatrix& mat);
+    
+    // Compute the numerical factorization
+    void factorize(ColSpMatrix& mat);
+    
+    // Free all the data allocated by Pastix
+    void clean()
+    {
+      eigen_assert(m_initisOk && "The Pastix structure should be allocated first"); 
+      m_iparm(IPARM_START_TASK) = API_TASK_CLEAN;
+      m_iparm(IPARM_END_TASK) = API_TASK_CLEAN;
+      internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, 0, 0, 0, (Scalar*)0,
+                             m_perm.data(), m_invp.data(), 0, 0, m_iparm.data(), m_dparm.data());
+    }
+    
+    void compute(ColSpMatrix& mat);
+    
+    int m_initisOk; 
+    int m_analysisIsOk;
+    int m_factorizationIsOk;
+    bool m_isInitialized;
+    mutable ComputationInfo m_info; 
+    mutable pastix_data_t *m_pastixdata; // Data structure for pastix
+    mutable int m_comm; // The MPI communicator identifier
+    mutable Matrix<int,IPARM_SIZE,1> m_iparm; // integer vector for the input parameters
+    mutable Matrix<double,DPARM_SIZE,1> m_dparm; // Scalar vector for the input parameters
+    mutable Matrix<Index,Dynamic,1> m_perm;  // Permutation vector
+    mutable Matrix<Index,Dynamic,1> m_invp;  // Inverse permutation vector
+    mutable int m_size; // Size of the matrix 
+}; 
+
+ /** Initialize the PaStiX data structure. 
+   *A first call to this function fills iparm and dparm with the default PaStiX parameters
+   * \sa iparm() dparm()
+   */
+template <class Derived>
+void PastixBase<Derived>::init()
+{
+  m_size = 0; 
+  m_iparm.setZero(IPARM_SIZE);
+  m_dparm.setZero(DPARM_SIZE);
+  
+  m_iparm(IPARM_MODIFY_PARAMETER) = API_NO;
+  pastix(&m_pastixdata, MPI_COMM_WORLD,
+         0, 0, 0, 0,
+         0, 0, 0, 1, m_iparm.data(), m_dparm.data());
+  
+  m_iparm[IPARM_MATRIX_VERIFICATION] = API_NO;
+  m_iparm[IPARM_VERBOSE]             = 2;
+  m_iparm[IPARM_ORDERING]            = API_ORDER_SCOTCH;
+  m_iparm[IPARM_INCOMPLETE]          = API_NO;
+  m_iparm[IPARM_OOC_LIMIT]           = 2000;
+  m_iparm[IPARM_RHS_MAKING]          = API_RHS_B;
+  m_iparm(IPARM_MATRIX_VERIFICATION) = API_NO;
+  
+  m_iparm(IPARM_START_TASK) = API_TASK_INIT;
+  m_iparm(IPARM_END_TASK) = API_TASK_INIT;
+  internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, 0, 0, 0, (Scalar*)0,
+                         0, 0, 0, 0, m_iparm.data(), m_dparm.data());
+  
+  // Check the returned error
+  if(m_iparm(IPARM_ERROR_NUMBER)) {
+    m_info = InvalidInput;
+    m_initisOk = false;
+  }
+  else { 
+    m_info = Success;
+    m_initisOk = true;
+  }
+}
+
+template <class Derived>
+void PastixBase<Derived>::compute(ColSpMatrix& mat)
+{
+  eigen_assert(mat.rows() == mat.cols() && "The input matrix should be squared");
+  
+  analyzePattern(mat);  
+  factorize(mat);
+  
+  m_iparm(IPARM_MATRIX_VERIFICATION) = API_NO;
+  m_isInitialized = m_factorizationIsOk;
+}
+
+
+template <class Derived>
+void PastixBase<Derived>::analyzePattern(ColSpMatrix& mat)
+{                         
+  eigen_assert(m_initisOk && "The initialization of PaSTiX failed");
+  
+  // clean previous calls
+  if(m_size>0)
+    clean();
+  
+  m_size = mat.rows();
+  m_perm.resize(m_size);
+  m_invp.resize(m_size);
+  
+  m_iparm(IPARM_START_TASK) = API_TASK_ORDERING;
+  m_iparm(IPARM_END_TASK) = API_TASK_ANALYSE;
+  internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, m_size, mat.outerIndexPtr(), mat.innerIndexPtr(),
+               mat.valuePtr(), m_perm.data(), m_invp.data(), 0, 0, m_iparm.data(), m_dparm.data());
+  
+  // Check the returned error
+  if(m_iparm(IPARM_ERROR_NUMBER))
+  {
+    m_info = NumericalIssue;
+    m_analysisIsOk = false;
+  }
+  else
+  { 
+    m_info = Success;
+    m_analysisIsOk = true;
+  }
+}
+
+template <class Derived>
+void PastixBase<Derived>::factorize(ColSpMatrix& mat)
+{
+//   if(&m_cpyMat != &mat) m_cpyMat = mat;
+  eigen_assert(m_analysisIsOk && "The analysis phase should be called before the factorization phase");
+  m_iparm(IPARM_START_TASK) = API_TASK_NUMFACT;
+  m_iparm(IPARM_END_TASK) = API_TASK_NUMFACT;
+  m_size = mat.rows();
+  
+  internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, m_size, mat.outerIndexPtr(), mat.innerIndexPtr(),
+               mat.valuePtr(), m_perm.data(), m_invp.data(), 0, 0, m_iparm.data(), m_dparm.data());
+  
+  // Check the returned error
+  if(m_iparm(IPARM_ERROR_NUMBER))
+  {
+    m_info = NumericalIssue;
+    m_factorizationIsOk = false;
+    m_isInitialized = false;
+  }
+  else
+  {
+    m_info = Success;
+    m_factorizationIsOk = true;
+    m_isInitialized = true;
+  }
+}
+
+/* Solve the system */
+template<typename Base>
+template<typename Rhs,typename Dest>
+bool PastixBase<Base>::_solve (const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const
+{
+  eigen_assert(m_isInitialized && "The matrix should be factorized first");
+  EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,
+                     THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  int rhs = 1;
+  
+  x = b; /* on return, x is overwritten by the computed solution */
+  
+  for (int i = 0; i < b.cols(); i++){
+    m_iparm[IPARM_START_TASK]          = API_TASK_SOLVE;
+    m_iparm[IPARM_END_TASK]            = API_TASK_REFINE;
+  
+    internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, x.rows(), 0, 0, 0,
+                           m_perm.data(), m_invp.data(), &x(0, i), rhs, m_iparm.data(), m_dparm.data());
+  }
+  
+  // Check the returned error
+  m_info = m_iparm(IPARM_ERROR_NUMBER)==0 ? Success : NumericalIssue;
+  
+  return m_iparm(IPARM_ERROR_NUMBER)==0;
+}
+
+/** \ingroup PaStiXSupport_Module
+  * \class PastixLU
+  * \brief Sparse direct LU solver based on PaStiX library
+  * 
+  * This class is used to solve the linear systems A.X = B with a supernodal LU 
+  * factorization in the PaStiX library. The matrix A should be squared and nonsingular
+  * PaStiX requires that the matrix A has a symmetric structural pattern. 
+  * This interface can symmetrize the input matrix otherwise. 
+  * The vectors or matrices X and B can be either dense or sparse.
+  * 
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  * \tparam IsStrSym Indicates if the input matrix has a symmetric pattern, default is false
+  * NOTE : Note that if the analysis and factorization phase are called separately, 
+  * the input matrix will be symmetrized at each call, hence it is advised to 
+  * symmetrize the matrix in a end-user program and set \p IsStrSym to true
+  * 
+  * \sa \ref TutorialSparseDirectSolvers
+  * 
+  */
+template<typename _MatrixType, bool IsStrSym>
+class PastixLU : public PastixBase< PastixLU<_MatrixType> >
+{
+  public:
+    typedef _MatrixType MatrixType;
+    typedef PastixBase<PastixLU<MatrixType> > Base;
+    typedef typename Base::ColSpMatrix ColSpMatrix;
+    typedef typename MatrixType::Index Index;
+    
+  public:
+    PastixLU() : Base()
+    {
+      init();
+    }
+    
+    PastixLU(const MatrixType& matrix):Base()
+    {
+      init();
+      compute(matrix);
+    }
+    /** Compute the LU supernodal factorization of \p matrix. 
+      * iparm and dparm can be used to tune the PaStiX parameters. 
+      * see the PaStiX user's manual
+      * \sa analyzePattern() factorize()
+      */
+    void compute (const MatrixType& matrix)
+    {
+      m_structureIsUptodate = false;
+      ColSpMatrix temp;
+      grabMatrix(matrix, temp);
+      Base::compute(temp);
+    }
+    /** Compute the LU symbolic factorization of \p matrix using its sparsity pattern. 
+      * Several ordering methods can be used at this step. See the PaStiX user's manual. 
+      * The result of this operation can be used with successive matrices having the same pattern as \p matrix
+      * \sa factorize()
+      */
+    void analyzePattern(const MatrixType& matrix)
+    {
+      m_structureIsUptodate = false;
+      ColSpMatrix temp;
+      grabMatrix(matrix, temp);
+      Base::analyzePattern(temp);
+    }
+
+    /** Compute the LU supernodal factorization of \p matrix
+      * WARNING The matrix \p matrix should have the same structural pattern 
+      * as the same used in the analysis phase.
+      * \sa analyzePattern()
+      */ 
+    void factorize(const MatrixType& matrix)
+    {
+      ColSpMatrix temp;
+      grabMatrix(matrix, temp);
+      Base::factorize(temp);
+    }
+  protected:
+    
+    void init()
+    {
+      m_structureIsUptodate = false;
+      m_iparm(IPARM_SYM) = API_SYM_NO;
+      m_iparm(IPARM_FACTORIZATION) = API_FACT_LU;
+    }
+    
+    void grabMatrix(const MatrixType& matrix, ColSpMatrix& out)
+    {
+      if(IsStrSym)
+        out = matrix;
+      else
+      {
+        if(!m_structureIsUptodate)
+        {
+          // update the transposed structure
+          m_transposedStructure = matrix.transpose();
+          
+          // Set the elements of the matrix to zero 
+          for (Index j=0; j<m_transposedStructure.outerSize(); ++j) 
+            for(typename ColSpMatrix::InnerIterator it(m_transposedStructure, j); it; ++it)
+              it.valueRef() = 0.0;
+
+          m_structureIsUptodate = true;
+        }
+        
+        out = m_transposedStructure + matrix;
+      }
+      internal::c_to_fortran_numbering(out);
+    }
+    
+    using Base::m_iparm;
+    using Base::m_dparm;
+    
+    ColSpMatrix m_transposedStructure;
+    bool m_structureIsUptodate;
+};
+
+/** \ingroup PaStiXSupport_Module
+  * \class PastixLLT
+  * \brief A sparse direct supernodal Cholesky (LLT) factorization and solver based on the PaStiX library
+  * 
+  * This class is used to solve the linear systems A.X = B via a LL^T supernodal Cholesky factorization
+  * available in the PaStiX library. The matrix A should be symmetric and positive definite
+  * WARNING Selfadjoint complex matrices are not supported in the current version of PaStiX
+  * The vectors or matrices X and B can be either dense or sparse
+  * 
+  * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  * \tparam UpLo The part of the matrix to use : Lower or Upper. The default is Lower as required by PaStiX
+  * 
+  * \sa \ref TutorialSparseDirectSolvers
+  */
+template<typename _MatrixType, int _UpLo>
+class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> >
+{
+  public:
+    typedef _MatrixType MatrixType;
+    typedef PastixBase<PastixLLT<MatrixType, _UpLo> > Base;
+    typedef typename Base::ColSpMatrix ColSpMatrix;
+    
+  public:
+    enum { UpLo = _UpLo };
+    PastixLLT() : Base()
+    {
+      init();
+    }
+    
+    PastixLLT(const MatrixType& matrix):Base()
+    {
+      init();
+      compute(matrix);
+    }
+
+    /** Compute the L factor of the LL^T supernodal factorization of \p matrix 
+      * \sa analyzePattern() factorize()
+      */
+    void compute (const MatrixType& matrix)
+    {
+      ColSpMatrix temp;
+      grabMatrix(matrix, temp);
+      Base::compute(temp);
+    }
+
+     /** Compute the LL^T symbolic factorization of \p matrix using its sparsity pattern
+      * The result of this operation can be used with successive matrices having the same pattern as \p matrix
+      * \sa factorize()
+      */
+    void analyzePattern(const MatrixType& matrix)
+    {
+      ColSpMatrix temp;
+      grabMatrix(matrix, temp);
+      Base::analyzePattern(temp);
+    }
+      /** Compute the LL^T supernodal numerical factorization of \p matrix 
+        * \sa analyzePattern()
+        */
+    void factorize(const MatrixType& matrix)
+    {
+      ColSpMatrix temp;
+      grabMatrix(matrix, temp);
+      Base::factorize(temp);
+    }
+  protected:
+    using Base::m_iparm;
+    
+    void init()
+    {
+      m_iparm(IPARM_SYM) = API_SYM_YES;
+      m_iparm(IPARM_FACTORIZATION) = API_FACT_LLT;
+    }
+    
+    void grabMatrix(const MatrixType& matrix, ColSpMatrix& out)
+    {
+      // Pastix supports only lower, column-major matrices 
+      out.template selfadjointView<Lower>() = matrix.template selfadjointView<UpLo>();
+      internal::c_to_fortran_numbering(out);
+    }
+};
+
+/** \ingroup PaStiXSupport_Module
+  * \class PastixLDLT
+  * \brief A sparse direct supernodal Cholesky (LLT) factorization and solver based on the PaStiX library
+  * 
+  * This class is used to solve the linear systems A.X = B via a LDL^T supernodal Cholesky factorization
+  * available in the PaStiX library. The matrix A should be symmetric and positive definite
+  * WARNING Selfadjoint complex matrices are not supported in the current version of PaStiX
+  * The vectors or matrices X and B can be either dense or sparse
+  * 
+  * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  * \tparam UpLo The part of the matrix to use : Lower or Upper. The default is Lower as required by PaStiX
+  * 
+  * \sa \ref TutorialSparseDirectSolvers
+  */
+template<typename _MatrixType, int _UpLo>
+class PastixLDLT : public PastixBase< PastixLDLT<_MatrixType, _UpLo> >
+{
+  public:
+    typedef _MatrixType MatrixType;
+    typedef PastixBase<PastixLDLT<MatrixType, _UpLo> > Base; 
+    typedef typename Base::ColSpMatrix ColSpMatrix;
+    
+  public:
+    enum { UpLo = _UpLo };
+    PastixLDLT():Base()
+    {
+      init();
+    }
+    
+    PastixLDLT(const MatrixType& matrix):Base()
+    {
+      init();
+      compute(matrix);
+    }
+
+    /** Compute the L and D factors of the LDL^T factorization of \p matrix 
+      * \sa analyzePattern() factorize()
+      */
+    void compute (const MatrixType& matrix)
+    {
+      ColSpMatrix temp;
+      grabMatrix(matrix, temp);
+      Base::compute(temp);
+    }
+
+    /** Compute the LDL^T symbolic factorization of \p matrix using its sparsity pattern
+      * The result of this operation can be used with successive matrices having the same pattern as \p matrix
+      * \sa factorize()
+      */
+    void analyzePattern(const MatrixType& matrix)
+    { 
+      ColSpMatrix temp;
+      grabMatrix(matrix, temp);
+      Base::analyzePattern(temp);
+    }
+    /** Compute the LDL^T supernodal numerical factorization of \p matrix 
+      * 
+      */
+    void factorize(const MatrixType& matrix)
+    {
+      ColSpMatrix temp;
+      grabMatrix(matrix, temp);
+      Base::factorize(temp);
+    }
+
+  protected:
+    using Base::m_iparm;
+    
+    void init()
+    {
+      m_iparm(IPARM_SYM) = API_SYM_YES;
+      m_iparm(IPARM_FACTORIZATION) = API_FACT_LDLT;
+    }
+    
+    void grabMatrix(const MatrixType& matrix, ColSpMatrix& out)
+    {
+      // Pastix supports only lower, column-major matrices 
+      out.template selfadjointView<Lower>() = matrix.template selfadjointView<UpLo>();
+      internal::c_to_fortran_numbering(out);
+    }
+};
+
+namespace internal {
+
+template<typename _MatrixType, typename Rhs>
+struct solve_retval<PastixBase<_MatrixType>, Rhs>
+  : solve_retval_base<PastixBase<_MatrixType>, Rhs>
+{
+  typedef PastixBase<_MatrixType> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+template<typename _MatrixType, typename Rhs>
+struct sparse_solve_retval<PastixBase<_MatrixType>, Rhs>
+  : sparse_solve_retval_base<PastixBase<_MatrixType>, Rhs>
+{
+  typedef PastixBase<_MatrixType> Dec;
+  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    this->defaultEvalTo(dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif
diff --git a/include/eigen3/Eigen/src/PardisoSupport/PardisoSupport.h b/include/eigen3/Eigen/src/PardisoSupport/PardisoSupport.h
new file mode 100644
index 0000000..18cd7d8
--- /dev/null
+++ b/include/eigen3/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -0,0 +1,592 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL PARDISO
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_PARDISOSUPPORT_H
+#define EIGEN_PARDISOSUPPORT_H
+
+namespace Eigen { 
+
+template<typename _MatrixType> class PardisoLU;
+template<typename _MatrixType, int Options=Upper> class PardisoLLT;
+template<typename _MatrixType, int Options=Upper> class PardisoLDLT;
+
+namespace internal
+{
+  template<typename Index>
+  struct pardiso_run_selector
+  {
+    static Index run( _MKL_DSS_HANDLE_t pt, Index maxfct, Index mnum, Index type, Index phase, Index n, void *a,
+                      Index *ia, Index *ja, Index *perm, Index nrhs, Index *iparm, Index msglvl, void *b, void *x)
+    {
+      Index error = 0;
+      ::pardiso(pt, &maxfct, &mnum, &type, &phase, &n, a, ia, ja, perm, &nrhs, iparm, &msglvl, b, x, &error);
+      return error;
+    }
+  };
+  template<>
+  struct pardiso_run_selector<long long int>
+  {
+    typedef long long int Index;
+    static Index run( _MKL_DSS_HANDLE_t pt, Index maxfct, Index mnum, Index type, Index phase, Index n, void *a,
+                      Index *ia, Index *ja, Index *perm, Index nrhs, Index *iparm, Index msglvl, void *b, void *x)
+    {
+      Index error = 0;
+      ::pardiso_64(pt, &maxfct, &mnum, &type, &phase, &n, a, ia, ja, perm, &nrhs, iparm, &msglvl, b, x, &error);
+      return error;
+    }
+  };
+
+  template<class Pardiso> struct pardiso_traits;
+
+  template<typename _MatrixType>
+  struct pardiso_traits< PardisoLU<_MatrixType> >
+  {
+    typedef _MatrixType MatrixType;
+    typedef typename _MatrixType::Scalar Scalar;
+    typedef typename _MatrixType::RealScalar RealScalar;
+    typedef typename _MatrixType::Index Index;
+  };
+
+  template<typename _MatrixType, int Options>
+  struct pardiso_traits< PardisoLLT<_MatrixType, Options> >
+  {
+    typedef _MatrixType MatrixType;
+    typedef typename _MatrixType::Scalar Scalar;
+    typedef typename _MatrixType::RealScalar RealScalar;
+    typedef typename _MatrixType::Index Index;
+  };
+
+  template<typename _MatrixType, int Options>
+  struct pardiso_traits< PardisoLDLT<_MatrixType, Options> >
+  {
+    typedef _MatrixType MatrixType;
+    typedef typename _MatrixType::Scalar Scalar;
+    typedef typename _MatrixType::RealScalar RealScalar;
+    typedef typename _MatrixType::Index Index;    
+  };
+
+}
+
+template<class Derived>
+class PardisoImpl
+{
+    typedef internal::pardiso_traits<Derived> Traits;
+  public:
+    typedef typename Traits::MatrixType MatrixType;
+    typedef typename Traits::Scalar Scalar;
+    typedef typename Traits::RealScalar RealScalar;
+    typedef typename Traits::Index Index;
+    typedef SparseMatrix<Scalar,RowMajor,Index> SparseMatrixType;
+    typedef Matrix<Scalar,Dynamic,1> VectorType;
+    typedef Matrix<Index, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+    typedef Matrix<Index, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+    typedef Array<Index,64,1,DontAlign> ParameterType;
+    enum {
+      ScalarIsComplex = NumTraits<Scalar>::IsComplex
+    };
+
+    PardisoImpl()
+    {
+      eigen_assert((sizeof(Index) >= sizeof(_INTEGER_t) && sizeof(Index) <= 8) && "Non-supported index type");
+      m_iparm.setZero();
+      m_msglvl = 0; // No output
+      m_initialized = false;
+    }
+
+    ~PardisoImpl()
+    {
+      pardisoRelease();
+    }
+
+    inline Index cols() const { return m_size; }
+    inline Index rows() const { return m_size; }
+  
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful,
+      *          \c NumericalIssue if the matrix appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_initialized && "Decomposition is not initialized.");
+      return m_info;
+    }
+
+    /** \warning for advanced usage only.
+      * \returns a reference to the parameter array controlling PARDISO.
+      * See the PARDISO manual to know how to use it. */
+    ParameterType& pardisoParameterArray()
+    {
+      return m_iparm;
+    }
+    
+    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
+      * 
+      * \sa factorize()
+      */
+    Derived& analyzePattern(const MatrixType& matrix);
+    
+    /** Performs a numeric decomposition of \a matrix
+      *
+      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+      *
+      * \sa analyzePattern()
+      */
+    Derived& factorize(const MatrixType& matrix);
+
+    Derived& compute(const MatrixType& matrix);
+    
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<PardisoImpl, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_initialized && "Pardiso solver is not initialized.");
+      eigen_assert(rows()==b.rows()
+                && "PardisoImpl::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::solve_retval<PardisoImpl, Rhs>(*this, b.derived());
+    }
+
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::sparse_solve_retval<PardisoImpl, Rhs>
+    solve(const SparseMatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_initialized && "Pardiso solver is not initialized.");
+      eigen_assert(rows()==b.rows()
+                && "PardisoImpl::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::sparse_solve_retval<PardisoImpl, Rhs>(*this, b.derived());
+    }
+
+    Derived& derived()
+    {
+      return *static_cast<Derived*>(this);
+    }
+    const Derived& derived() const
+    {
+      return *static_cast<const Derived*>(this);
+    }
+
+    template<typename BDerived, typename XDerived>
+    bool _solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const;
+
+  protected:
+    void pardisoRelease()
+    {
+      if(m_initialized) // Factorization ran at least once
+      {
+        internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, -1, m_size, 0, 0, 0, m_perm.data(), 0,
+                                                   m_iparm.data(), m_msglvl, 0, 0);
+      }
+    }
+
+    void pardisoInit(int type)
+    {
+      m_type = type;
+      bool symmetric = std::abs(m_type) < 10;
+      m_iparm[0] = 1;   // No solver default
+      m_iparm[1] = 3;   // use Metis for the ordering
+      m_iparm[2] = 1;   // Numbers of processors, value of OMP_NUM_THREADS
+      m_iparm[3] = 0;   // No iterative-direct algorithm
+      m_iparm[4] = 0;   // No user fill-in reducing permutation
+      m_iparm[5] = 0;   // Write solution into x
+      m_iparm[6] = 0;   // Not in use
+      m_iparm[7] = 2;   // Max numbers of iterative refinement steps
+      m_iparm[8] = 0;   // Not in use
+      m_iparm[9] = 13;  // Perturb the pivot elements with 1E-13
+      m_iparm[10] = symmetric ? 0 : 1; // Use nonsymmetric permutation and scaling MPS
+      m_iparm[11] = 0;  // Not in use
+      m_iparm[12] = symmetric ? 0 : 1;  // Maximum weighted matching algorithm is switched-off (default for symmetric).
+                                        // Try m_iparm[12] = 1 in case of inappropriate accuracy
+      m_iparm[13] = 0;  // Output: Number of perturbed pivots
+      m_iparm[14] = 0;  // Not in use
+      m_iparm[15] = 0;  // Not in use
+      m_iparm[16] = 0;  // Not in use
+      m_iparm[17] = -1; // Output: Number of nonzeros in the factor LU
+      m_iparm[18] = -1; // Output: Mflops for LU factorization
+      m_iparm[19] = 0;  // Output: Numbers of CG Iterations
+      
+      m_iparm[20] = 0;  // 1x1 pivoting
+      m_iparm[26] = 0;  // No matrix checker
+      m_iparm[27] = (sizeof(RealScalar) == 4) ? 1 : 0;
+      m_iparm[34] = 1;  // C indexing
+      m_iparm[59] = 1;  // Automatic switch between In-Core and Out-of-Core modes
+    }
+
+  protected:
+    // cached data to reduce reallocation, etc.
+    
+    void manageErrorCode(Index error)
+    {
+      switch(error)
+      {
+        case 0:
+          m_info = Success;
+          break;
+        case -4:
+        case -7:
+          m_info = NumericalIssue;
+          break;
+        default:
+          m_info = InvalidInput;
+      }
+    }
+
+    mutable SparseMatrixType m_matrix;
+    ComputationInfo m_info;
+    bool m_initialized, m_analysisIsOk, m_factorizationIsOk;
+    Index m_type, m_msglvl;
+    mutable void *m_pt[64];
+    mutable ParameterType m_iparm;
+    mutable IntColVectorType m_perm;
+    Index m_size;
+    
+  private:
+    PardisoImpl(PardisoImpl &) {}
+};
+
+template<class Derived>
+Derived& PardisoImpl<Derived>::compute(const MatrixType& a)
+{
+  m_size = a.rows();
+  eigen_assert(a.rows() == a.cols());
+
+  pardisoRelease();
+  memset(m_pt, 0, sizeof(m_pt));
+  m_perm.setZero(m_size);
+  derived().getMatrix(a);
+  
+  Index error;
+  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 12, m_size,
+                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
+
+  manageErrorCode(error);
+  m_analysisIsOk = true;
+  m_factorizationIsOk = true;
+  m_initialized = true;
+  return derived();
+}
+
+template<class Derived>
+Derived& PardisoImpl<Derived>::analyzePattern(const MatrixType& a)
+{
+  m_size = a.rows();
+  eigen_assert(m_size == a.cols());
+
+  pardisoRelease();
+  memset(m_pt, 0, sizeof(m_pt));
+  m_perm.setZero(m_size);
+  derived().getMatrix(a);
+  
+  Index error;
+  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 11, m_size,
+                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
+  
+  manageErrorCode(error);
+  m_analysisIsOk = true;
+  m_factorizationIsOk = false;
+  m_initialized = true;
+  return derived();
+}
+
+template<class Derived>
+Derived& PardisoImpl<Derived>::factorize(const MatrixType& a)
+{
+  eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
+  eigen_assert(m_size == a.rows() && m_size == a.cols());
+  
+  derived().getMatrix(a);
+
+  Index error;  
+  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 22, m_size,
+                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
+  
+  manageErrorCode(error);
+  m_factorizationIsOk = true;
+  return derived();
+}
+
+template<class Base>
+template<typename BDerived,typename XDerived>
+bool PardisoImpl<Base>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const
+{
+  if(m_iparm[0] == 0) // Factorization was not computed
+    return false;
+
+  //Index n = m_matrix.rows();
+  Index nrhs = Index(b.cols());
+  eigen_assert(m_size==b.rows());
+  eigen_assert(((MatrixBase<BDerived>::Flags & RowMajorBit) == 0 || nrhs == 1) && "Row-major right hand sides are not supported");
+  eigen_assert(((MatrixBase<XDerived>::Flags & RowMajorBit) == 0 || nrhs == 1) && "Row-major matrices of unknowns are not supported");
+  eigen_assert(((nrhs == 1) || b.outerStride() == b.rows()));
+
+
+//  switch (transposed) {
+//    case SvNoTrans    : m_iparm[11] = 0 ; break;
+//    case SvTranspose  : m_iparm[11] = 2 ; break;
+//    case SvAdjoint    : m_iparm[11] = 1 ; break;
+//    default:
+//      //std::cerr << "Eigen: transposition  option \"" << transposed << "\" not supported by the PARDISO backend\n";
+//      m_iparm[11] = 0;
+//  }
+
+  Scalar* rhs_ptr = const_cast<Scalar*>(b.derived().data());
+  Matrix<Scalar,Dynamic,Dynamic,ColMajor> tmp;
+  
+  // Pardiso cannot solve in-place
+  if(rhs_ptr == x.derived().data())
+  {
+    tmp = b;
+    rhs_ptr = tmp.data();
+  }
+  
+  Index error;
+  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 33, m_size,
+                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                     m_perm.data(), nrhs, m_iparm.data(), m_msglvl,
+                                                     rhs_ptr, x.derived().data());
+
+  return error==0;
+}
+
+
+/** \ingroup PardisoSupport_Module
+  * \class PardisoLU
+  * \brief A sparse direct LU factorization and solver based on the PARDISO library
+  *
+  * This class allows to solve for A.X = B sparse linear problems via a direct LU factorization
+  * using the Intel MKL PARDISO library. The sparse matrix A must be squared and invertible.
+  * The vectors or matrices X and B can be either dense or sparse.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  *
+  * \sa \ref TutorialSparseDirectSolvers
+  */
+template<typename MatrixType>
+class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
+{
+  protected:
+    typedef PardisoImpl< PardisoLU<MatrixType> > Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::RealScalar RealScalar;
+    using Base::pardisoInit;
+    using Base::m_matrix;
+    friend class PardisoImpl< PardisoLU<MatrixType> >;
+
+  public:
+
+    using Base::compute;
+    using Base::solve;
+
+    PardisoLU()
+      : Base()
+    {
+      pardisoInit(Base::ScalarIsComplex ? 13 : 11);
+    }
+
+    PardisoLU(const MatrixType& matrix)
+      : Base()
+    {
+      pardisoInit(Base::ScalarIsComplex ? 13 : 11);
+      compute(matrix);
+    }
+  protected:
+    void getMatrix(const MatrixType& matrix)
+    {
+      m_matrix = matrix;
+    }
+    
+  private:
+    PardisoLU(PardisoLU& ) {}
+};
+
+/** \ingroup PardisoSupport_Module
+  * \class PardisoLLT
+  * \brief A sparse direct Cholesky (LLT) factorization and solver based on the PARDISO library
+  *
+  * This class allows to solve for A.X = B sparse linear problems via a LL^T Cholesky factorization
+  * using the Intel MKL PARDISO library. The sparse matrix A must be selfajoint and positive definite.
+  * The vectors or matrices X and B can be either dense or sparse.
+  *
+  * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  * \tparam UpLo can be any bitwise combination of Upper, Lower. The default is Upper, meaning only the upper triangular part has to be used.
+  *         Upper|Lower can be used to tell both triangular parts can be used as input.
+  *
+  * \sa \ref TutorialSparseDirectSolvers
+  */
+template<typename MatrixType, int _UpLo>
+class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
+{
+  protected:
+    typedef PardisoImpl< PardisoLLT<MatrixType,_UpLo> > Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::Index Index;
+    typedef typename Base::RealScalar RealScalar;
+    using Base::pardisoInit;
+    using Base::m_matrix;
+    friend class PardisoImpl< PardisoLLT<MatrixType,_UpLo> >;
+
+  public:
+
+    enum { UpLo = _UpLo };
+    using Base::compute;
+    using Base::solve;
+
+    PardisoLLT()
+      : Base()
+    {
+      pardisoInit(Base::ScalarIsComplex ? 4 : 2);
+    }
+
+    PardisoLLT(const MatrixType& matrix)
+      : Base()
+    {
+      pardisoInit(Base::ScalarIsComplex ? 4 : 2);
+      compute(matrix);
+    }
+    
+  protected:
+    
+    void getMatrix(const MatrixType& matrix)
+    {
+      // PARDISO supports only upper, row-major matrices
+      PermutationMatrix<Dynamic,Dynamic,Index> p_null;
+      m_matrix.resize(matrix.rows(), matrix.cols());
+      m_matrix.template selfadjointView<Upper>() = matrix.template selfadjointView<UpLo>().twistedBy(p_null);
+    }
+    
+  private:
+    PardisoLLT(PardisoLLT& ) {}
+};
+
+/** \ingroup PardisoSupport_Module
+  * \class PardisoLDLT
+  * \brief A sparse direct Cholesky (LDLT) factorization and solver based on the PARDISO library
+  *
+  * This class allows to solve for A.X = B sparse linear problems via a LDL^T Cholesky factorization
+  * using the Intel MKL PARDISO library. The sparse matrix A is assumed to be selfajoint and positive definite.
+  * For complex matrices, A can also be symmetric only, see the \a Options template parameter.
+  * The vectors or matrices X and B can be either dense or sparse.
+  *
+  * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  * \tparam Options can be any bitwise combination of Upper, Lower, and Symmetric. The default is Upper, meaning only the upper triangular part has to be used.
+  *         Symmetric can be used for symmetric, non-selfadjoint complex matrices, the default being to assume a selfadjoint matrix.
+  *         Upper|Lower can be used to tell both triangular parts can be used as input.
+  *
+  * \sa \ref TutorialSparseDirectSolvers
+  */
+template<typename MatrixType, int Options>
+class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
+{
+  protected:
+    typedef PardisoImpl< PardisoLDLT<MatrixType,Options> > Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::Index Index;
+    typedef typename Base::RealScalar RealScalar;
+    using Base::pardisoInit;
+    using Base::m_matrix;
+    friend class PardisoImpl< PardisoLDLT<MatrixType,Options> >;
+
+  public:
+
+    using Base::compute;
+    using Base::solve;
+    enum { UpLo = Options&(Upper|Lower) };
+
+    PardisoLDLT()
+      : Base()
+    {
+      pardisoInit(Base::ScalarIsComplex ? ( bool(Options&Symmetric) ? 6 : -4 ) : -2);
+    }
+
+    PardisoLDLT(const MatrixType& matrix)
+      : Base()
+    {
+      pardisoInit(Base::ScalarIsComplex ? ( bool(Options&Symmetric) ? 6 : -4 ) : -2);
+      compute(matrix);
+    }
+    
+    void getMatrix(const MatrixType& matrix)
+    {
+      // PARDISO supports only upper, row-major matrices
+      PermutationMatrix<Dynamic,Dynamic,Index> p_null;
+      m_matrix.resize(matrix.rows(), matrix.cols());
+      m_matrix.template selfadjointView<Upper>() = matrix.template selfadjointView<UpLo>().twistedBy(p_null);
+    }
+    
+  private:
+    PardisoLDLT(PardisoLDLT& ) {}
+};
+
+namespace internal {
+  
+template<typename _Derived, typename Rhs>
+struct solve_retval<PardisoImpl<_Derived>, Rhs>
+  : solve_retval_base<PardisoImpl<_Derived>, Rhs>
+{
+  typedef PardisoImpl<_Derived> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+template<typename Derived, typename Rhs>
+struct sparse_solve_retval<PardisoImpl<Derived>, Rhs>
+  : sparse_solve_retval_base<PardisoImpl<Derived>, Rhs>
+{
+  typedef PardisoImpl<Derived> Dec;
+  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    this->defaultEvalTo(dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARDISOSUPPORT_H
diff --git a/include/eigen3/Eigen/src/QR/ColPivHouseholderQR.h b/include/eigen3/Eigen/src/QR/ColPivHouseholderQR.h
new file mode 100644
index 0000000..773d1df
--- /dev/null
+++ b/include/eigen3/Eigen/src/QR/ColPivHouseholderQR.h
@@ -0,0 +1,580 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_H
+#define EIGEN_COLPIVOTINGHOUSEHOLDERQR_H
+
+namespace Eigen { 
+
+/** \ingroup QR_Module
+  *
+  * \class ColPivHouseholderQR
+  *
+  * \brief Householder rank-revealing QR decomposition of a matrix with column-pivoting
+  *
+  * \param MatrixType the type of the matrix of which we are computing the QR decomposition
+  *
+  * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R
+  * such that 
+  * \f[
+  *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \, \mathbf{R}
+  * \f]
+  * by using Householder transformations. Here, \b P is a permutation matrix, \b Q a unitary matrix and \b R an 
+  * upper triangular matrix.
+  *
+  * This decomposition performs column pivoting in order to be rank-revealing and improve
+  * numerical stability. It is slower than HouseholderQR, and faster than FullPivHouseholderQR.
+  *
+  * \sa MatrixBase::colPivHouseholderQr()
+  */
+template<typename _MatrixType> class ColPivHouseholderQR
+{
+  public:
+
+    typedef _MatrixType MatrixType;
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      Options = MatrixType::Options,
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+    typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, Options, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
+    typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
+    typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
+    typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
+    typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
+    typedef typename internal::plain_row_type<MatrixType, RealScalar>::type RealRowVectorType;
+    typedef HouseholderSequence<MatrixType,typename internal::remove_all<typename HCoeffsType::ConjugateReturnType>::type> HouseholderSequenceType;
+    
+  private:
+    
+    typedef typename PermutationType::Index PermIndexType;
+    
+  public:
+
+    /**
+    * \brief Default Constructor.
+    *
+    * The default constructor is useful in cases in which the user intends to
+    * perform decompositions via ColPivHouseholderQR::compute(const MatrixType&).
+    */
+    ColPivHouseholderQR()
+      : m_qr(),
+        m_hCoeffs(),
+        m_colsPermutation(),
+        m_colsTranspositions(),
+        m_temp(),
+        m_colSqNorms(),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false) {}
+
+    /** \brief Default Constructor with memory preallocation
+      *
+      * Like the default constructor but with preallocation of the internal data
+      * according to the specified problem \a size.
+      * \sa ColPivHouseholderQR()
+      */
+    ColPivHouseholderQR(Index rows, Index cols)
+      : m_qr(rows, cols),
+        m_hCoeffs((std::min)(rows,cols)),
+        m_colsPermutation(PermIndexType(cols)),
+        m_colsTranspositions(cols),
+        m_temp(cols),
+        m_colSqNorms(cols),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false) {}
+
+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This constructor computes the QR factorization of the matrix \a matrix by calling
+      * the method compute(). It is a short cut for:
+      * 
+      * \code
+      * ColPivHouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
+      * qr.compute(matrix);
+      * \endcode
+      * 
+      * \sa compute()
+      */
+    ColPivHouseholderQR(const MatrixType& matrix)
+      : m_qr(matrix.rows(), matrix.cols()),
+        m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
+        m_colsPermutation(PermIndexType(matrix.cols())),
+        m_colsTranspositions(matrix.cols()),
+        m_temp(matrix.cols()),
+        m_colSqNorms(matrix.cols()),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false)
+    {
+      compute(matrix);
+    }
+
+    /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
+      * *this is the QR decomposition, if any exists.
+      *
+      * \param b the right-hand-side of the equation to solve.
+      *
+      * \returns a solution.
+      *
+      * \note The case where b is a matrix is not yet implemented. Also, this
+      *       code is space inefficient.
+      *
+      * \note_about_checking_solutions
+      *
+      * \note_about_arbitrary_choice_of_solution
+      *
+      * Example: \include ColPivHouseholderQR_solve.cpp
+      * Output: \verbinclude ColPivHouseholderQR_solve.out
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<ColPivHouseholderQR, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+      return internal::solve_retval<ColPivHouseholderQR, Rhs>(*this, b.derived());
+    }
+
+    HouseholderSequenceType householderQ(void) const;
+    HouseholderSequenceType matrixQ(void) const
+    {
+      return householderQ(); 
+    }
+
+    /** \returns a reference to the matrix where the Householder QR decomposition is stored
+      */
+    const MatrixType& matrixQR() const
+    {
+      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+      return m_qr;
+    }
+    
+    /** \returns a reference to the matrix where the result Householder QR is stored 
+     * \warning The strict lower part of this matrix contains internal values. 
+     * Only the upper triangular part should be referenced. To get it, use
+     * \code matrixR().template triangularView<Upper>() \endcode
+     * For rank-deficient matrices, use 
+     * \code 
+     * matrixR().topLeftCorner(rank(), rank()).template triangularView<Upper>() 
+     * \endcode
+     */
+    const MatrixType& matrixR() const
+    {
+      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+      return m_qr;
+    }
+    
+    ColPivHouseholderQR& compute(const MatrixType& matrix);
+
+    /** \returns a const reference to the column permutation matrix */
+    const PermutationType& colsPermutation() const
+    {
+      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+      return m_colsPermutation;
+    }
+
+    /** \returns the absolute value of the determinant of the matrix of which
+      * *this is the QR decomposition. It has only linear complexity
+      * (that is, O(n) where n is the dimension of the square matrix)
+      * as the QR decomposition has already been computed.
+      *
+      * \note This is only for square matrices.
+      *
+      * \warning a determinant can be very big or small, so for matrices
+      * of large enough dimension, there is a risk of overflow/underflow.
+      * One way to work around that is to use logAbsDeterminant() instead.
+      *
+      * \sa logAbsDeterminant(), MatrixBase::determinant()
+      */
+    typename MatrixType::RealScalar absDeterminant() const;
+
+    /** \returns the natural log of the absolute value of the determinant of the matrix of which
+      * *this is the QR decomposition. It has only linear complexity
+      * (that is, O(n) where n is the dimension of the square matrix)
+      * as the QR decomposition has already been computed.
+      *
+      * \note This is only for square matrices.
+      *
+      * \note This method is useful to work around the risk of overflow/underflow that's inherent
+      * to determinant computation.
+      *
+      * \sa absDeterminant(), MatrixBase::determinant()
+      */
+    typename MatrixType::RealScalar logAbsDeterminant() const;
+
+    /** \returns the rank of the matrix of which *this is the QR decomposition.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline Index rank() const
+    {
+      using std::abs;
+      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+      RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold();
+      Index result = 0;
+      for(Index i = 0; i < m_nonzero_pivots; ++i)
+        result += (abs(m_qr.coeff(i,i)) > premultiplied_threshold);
+      return result;
+    }
+
+    /** \returns the dimension of the kernel of the matrix of which *this is the QR decomposition.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline Index dimensionOfKernel() const
+    {
+      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+      return cols() - rank();
+    }
+
+    /** \returns true if the matrix of which *this is the QR decomposition represents an injective
+      *          linear map, i.e. has trivial kernel; false otherwise.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline bool isInjective() const
+    {
+      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+      return rank() == cols();
+    }
+
+    /** \returns true if the matrix of which *this is the QR decomposition represents a surjective
+      *          linear map; false otherwise.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline bool isSurjective() const
+    {
+      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+      return rank() == rows();
+    }
+
+    /** \returns true if the matrix of which *this is the QR decomposition is invertible.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline bool isInvertible() const
+    {
+      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+      return isInjective() && isSurjective();
+    }
+
+    /** \returns the inverse of the matrix of which *this is the QR decomposition.
+      *
+      * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
+      *       Use isInvertible() to first determine whether this matrix is invertible.
+      */
+    inline const
+    internal::solve_retval<ColPivHouseholderQR, typename MatrixType::IdentityReturnType>
+    inverse() const
+    {
+      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+      return internal::solve_retval<ColPivHouseholderQR,typename MatrixType::IdentityReturnType>
+               (*this, MatrixType::Identity(m_qr.rows(), m_qr.cols()));
+    }
+
+    inline Index rows() const { return m_qr.rows(); }
+    inline Index cols() const { return m_qr.cols(); }
+    
+    /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
+      * 
+      * For advanced uses only.
+      */
+    const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
+
+    /** Allows to prescribe a threshold to be used by certain methods, such as rank(),
+      * who need to determine when pivots are to be considered nonzero. This is not used for the
+      * QR decomposition itself.
+      *
+      * When it needs to get the threshold value, Eigen calls threshold(). By default, this
+      * uses a formula to automatically determine a reasonable threshold.
+      * Once you have called the present method setThreshold(const RealScalar&),
+      * your value is used instead.
+      *
+      * \param threshold The new value to use as the threshold.
+      *
+      * A pivot will be considered nonzero if its absolute value is strictly greater than
+      *  \f$ \vert pivot \vert \leqslant threshold \times \vert maxpivot \vert \f$
+      * where maxpivot is the biggest pivot.
+      *
+      * If you want to come back to the default behavior, call setThreshold(Default_t)
+      */
+    ColPivHouseholderQR& setThreshold(const RealScalar& threshold)
+    {
+      m_usePrescribedThreshold = true;
+      m_prescribedThreshold = threshold;
+      return *this;
+    }
+
+    /** Allows to come back to the default behavior, letting Eigen use its default formula for
+      * determining the threshold.
+      *
+      * You should pass the special object Eigen::Default as parameter here.
+      * \code qr.setThreshold(Eigen::Default); \endcode
+      *
+      * See the documentation of setThreshold(const RealScalar&).
+      */
+    ColPivHouseholderQR& setThreshold(Default_t)
+    {
+      m_usePrescribedThreshold = false;
+      return *this;
+    }
+
+    /** Returns the threshold that will be used by certain methods such as rank().
+      *
+      * See the documentation of setThreshold(const RealScalar&).
+      */
+    RealScalar threshold() const
+    {
+      eigen_assert(m_isInitialized || m_usePrescribedThreshold);
+      return m_usePrescribedThreshold ? m_prescribedThreshold
+      // this formula comes from experimenting (see "LU precision tuning" thread on the list)
+      // and turns out to be identical to Higham's formula used already in LDLt.
+                                      : NumTraits<Scalar>::epsilon() * RealScalar(m_qr.diagonalSize());
+    }
+
+    /** \returns the number of nonzero pivots in the QR decomposition.
+      * Here nonzero is meant in the exact sense, not in a fuzzy sense.
+      * So that notion isn't really intrinsically interesting, but it is
+      * still useful when implementing algorithms.
+      *
+      * \sa rank()
+      */
+    inline Index nonzeroPivots() const
+    {
+      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+      return m_nonzero_pivots;
+    }
+
+    /** \returns the absolute value of the biggest pivot, i.e. the biggest
+      *          diagonal coefficient of R.
+      */
+    RealScalar maxPivot() const { return m_maxpivot; }
+    
+    /** \brief Reports whether the QR factorization was succesful.
+      *
+      * \note This function always returns \c Success. It is provided for compatibility 
+      * with other factorization routines.
+      * \returns \c Success 
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return Success;
+    }
+
+  protected:
+    MatrixType m_qr;
+    HCoeffsType m_hCoeffs;
+    PermutationType m_colsPermutation;
+    IntRowVectorType m_colsTranspositions;
+    RowVectorType m_temp;
+    RealRowVectorType m_colSqNorms;
+    bool m_isInitialized, m_usePrescribedThreshold;
+    RealScalar m_prescribedThreshold, m_maxpivot;
+    Index m_nonzero_pivots;
+    Index m_det_pq;
+};
+
+template<typename MatrixType>
+typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType>::absDeterminant() const
+{
+  using std::abs;
+  eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+  eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
+  return abs(m_qr.diagonal().prod());
+}
+
+template<typename MatrixType>
+typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType>::logAbsDeterminant() const
+{
+  eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+  eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
+  return m_qr.diagonal().cwiseAbs().array().log().sum();
+}
+
+/** Performs the QR factorization of the given matrix \a matrix. The result of
+  * the factorization is stored into \c *this, and a reference to \c *this
+  * is returned.
+  *
+  * \sa class ColPivHouseholderQR, ColPivHouseholderQR(const MatrixType&)
+  */
+template<typename MatrixType>
+ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const MatrixType& matrix)
+{
+  using std::abs;
+  Index rows = matrix.rows();
+  Index cols = matrix.cols();
+  Index size = matrix.diagonalSize();
+  
+  // the column permutation is stored as int indices, so just to be sure:
+  eigen_assert(cols<=NumTraits<int>::highest());
+
+  m_qr = matrix;
+  m_hCoeffs.resize(size);
+
+  m_temp.resize(cols);
+
+  m_colsTranspositions.resize(matrix.cols());
+  Index number_of_transpositions = 0;
+
+  m_colSqNorms.resize(cols);
+  for(Index k = 0; k < cols; ++k)
+    m_colSqNorms.coeffRef(k) = m_qr.col(k).squaredNorm();
+
+  RealScalar threshold_helper = m_colSqNorms.maxCoeff() * numext::abs2(NumTraits<Scalar>::epsilon()) / RealScalar(rows);
+
+  m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
+  m_maxpivot = RealScalar(0);
+
+  for(Index k = 0; k < size; ++k)
+  {
+    // first, we look up in our table m_colSqNorms which column has the biggest squared norm
+    Index biggest_col_index;
+    RealScalar biggest_col_sq_norm = m_colSqNorms.tail(cols-k).maxCoeff(&biggest_col_index);
+    biggest_col_index += k;
+
+    // since our table m_colSqNorms accumulates imprecision at every step, we must now recompute
+    // the actual squared norm of the selected column.
+    // Note that not doing so does result in solve() sometimes returning inf/nan values
+    // when running the unit test with 1000 repetitions.
+    biggest_col_sq_norm = m_qr.col(biggest_col_index).tail(rows-k).squaredNorm();
+
+    // we store that back into our table: it can't hurt to correct our table.
+    m_colSqNorms.coeffRef(biggest_col_index) = biggest_col_sq_norm;
+
+    // if the current biggest column is smaller than epsilon times the initial biggest column,
+    // terminate to avoid generating nan/inf values.
+    // Note that here, if we test instead for "biggest == 0", we get a failure every 1000 (or so)
+    // repetitions of the unit test, with the result of solve() filled with large values of the order
+    // of 1/(size*epsilon).
+    if(biggest_col_sq_norm < threshold_helper * RealScalar(rows-k))
+    {
+      m_nonzero_pivots = k;
+      m_hCoeffs.tail(size-k).setZero();
+      m_qr.bottomRightCorner(rows-k,cols-k)
+          .template triangularView<StrictlyLower>()
+          .setZero();
+      break;
+    }
+
+    // apply the transposition to the columns
+    m_colsTranspositions.coeffRef(k) = biggest_col_index;
+    if(k != biggest_col_index) {
+      m_qr.col(k).swap(m_qr.col(biggest_col_index));
+      std::swap(m_colSqNorms.coeffRef(k), m_colSqNorms.coeffRef(biggest_col_index));
+      ++number_of_transpositions;
+    }
+
+    // generate the householder vector, store it below the diagonal
+    RealScalar beta;
+    m_qr.col(k).tail(rows-k).makeHouseholderInPlace(m_hCoeffs.coeffRef(k), beta);
+
+    // apply the householder transformation to the diagonal coefficient
+    m_qr.coeffRef(k,k) = beta;
+
+    // remember the maximum absolute value of diagonal coefficients
+    if(abs(beta) > m_maxpivot) m_maxpivot = abs(beta);
+
+    // apply the householder transformation
+    m_qr.bottomRightCorner(rows-k, cols-k-1)
+        .applyHouseholderOnTheLeft(m_qr.col(k).tail(rows-k-1), m_hCoeffs.coeffRef(k), &m_temp.coeffRef(k+1));
+
+    // update our table of squared norms of the columns
+    m_colSqNorms.tail(cols-k-1) -= m_qr.row(k).tail(cols-k-1).cwiseAbs2();
+  }
+
+  m_colsPermutation.setIdentity(PermIndexType(cols));
+  for(PermIndexType k = 0; k < m_nonzero_pivots; ++k)
+    m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k)));
+
+  m_det_pq = (number_of_transpositions%2) ? -1 : 1;
+  m_isInitialized = true;
+
+  return *this;
+}
+
+namespace internal {
+
+template<typename _MatrixType, typename Rhs>
+struct solve_retval<ColPivHouseholderQR<_MatrixType>, Rhs>
+  : solve_retval_base<ColPivHouseholderQR<_MatrixType>, Rhs>
+{
+  EIGEN_MAKE_SOLVE_HELPERS(ColPivHouseholderQR<_MatrixType>,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    eigen_assert(rhs().rows() == dec().rows());
+
+    const Index cols = dec().cols(),
+				nonzero_pivots = dec().nonzeroPivots();
+
+    if(nonzero_pivots == 0)
+    {
+      dst.setZero();
+      return;
+    }
+
+    typename Rhs::PlainObject c(rhs());
+
+    // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
+    c.applyOnTheLeft(householderSequence(dec().matrixQR(), dec().hCoeffs())
+                     .setLength(dec().nonzeroPivots())
+		     .transpose()
+      );
+
+    dec().matrixR()
+       .topLeftCorner(nonzero_pivots, nonzero_pivots)
+       .template triangularView<Upper>()
+       .solveInPlace(c.topRows(nonzero_pivots));
+
+    for(Index i = 0; i < nonzero_pivots; ++i) dst.row(dec().colsPermutation().indices().coeff(i)) = c.row(i);
+    for(Index i = nonzero_pivots; i < cols; ++i) dst.row(dec().colsPermutation().indices().coeff(i)).setZero();
+  }
+};
+
+} // end namespace internal
+
+/** \returns the matrix Q as a sequence of householder transformations */
+template<typename MatrixType>
+typename ColPivHouseholderQR<MatrixType>::HouseholderSequenceType ColPivHouseholderQR<MatrixType>
+  ::householderQ() const
+{
+  eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+  return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()).setLength(m_nonzero_pivots);
+}
+
+/** \return the column-pivoting Householder QR decomposition of \c *this.
+  *
+  * \sa class ColPivHouseholderQR
+  */
+template<typename Derived>
+const ColPivHouseholderQR<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::colPivHouseholderQr() const
+{
+  return ColPivHouseholderQR<PlainObject>(eval());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_COLPIVOTINGHOUSEHOLDERQR_H
diff --git a/include/eigen3/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/include/eigen3/Eigen/src/QR/ColPivHouseholderQR_MKL.h
new file mode 100644
index 0000000..b5b1983
--- /dev/null
+++ b/include/eigen3/Eigen/src/QR/ColPivHouseholderQR_MKL.h
@@ -0,0 +1,99 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *    Householder QR decomposition of a matrix with column pivoting based on
+ *    LAPACKE_?geqp3 function.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
+#define EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
+
+#include "Eigen/src/Core/util/MKL_support.h"
+
+namespace Eigen { 
+
+/** \internal Specialization for the data types supported by MKL */
+
+#define EIGEN_MKL_QR_COLPIV(EIGTYPE, MKLTYPE, MKLPREFIX, EIGCOLROW, MKLCOLROW) \
+template<> inline \
+ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >& \
+ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >::compute( \
+              const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix) \
+\
+{ \
+  using std::abs; \
+  typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> MatrixType; \
+  typedef MatrixType::Scalar Scalar; \
+  typedef MatrixType::RealScalar RealScalar; \
+  Index rows = matrix.rows();\
+  Index cols = matrix.cols();\
+  Index size = matrix.diagonalSize();\
+\
+  m_qr = matrix;\
+  m_hCoeffs.resize(size);\
+\
+  m_colsTranspositions.resize(cols);\
+  /*Index number_of_transpositions = 0;*/ \
+\
+  m_nonzero_pivots = 0; \
+  m_maxpivot = RealScalar(0);\
+  m_colsPermutation.resize(cols); \
+  m_colsPermutation.indices().setZero(); \
+\
+  lapack_int lda = m_qr.outerStride(), i; \
+  lapack_int matrix_order = MKLCOLROW; \
+  LAPACKE_##MKLPREFIX##geqp3( matrix_order, rows, cols, (MKLTYPE*)m_qr.data(), lda, (lapack_int*)m_colsPermutation.indices().data(), (MKLTYPE*)m_hCoeffs.data()); \
+  m_isInitialized = true; \
+  m_maxpivot=m_qr.diagonal().cwiseAbs().maxCoeff(); \
+  m_hCoeffs.adjointInPlace(); \
+  RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold(); \
+  lapack_int *perm = m_colsPermutation.indices().data(); \
+  for(i=0;i<size;i++) { \
+    m_nonzero_pivots += (abs(m_qr.coeff(i,i)) > premultiplied_threshold);\
+  } \
+  for(i=0;i<cols;i++) perm[i]--;\
+\
+  /*m_det_pq = (number_of_transpositions%2) ? -1 : 1;  // TODO: It's not needed now; fix upon availability in Eigen */ \
+\
+  return *this; \
+}
+
+EIGEN_MKL_QR_COLPIV(double,   double,        d, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_MKL_QR_COLPIV(float,    float,         s, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_MKL_QR_COLPIV(dcomplex, MKL_Complex16, z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_MKL_QR_COLPIV(scomplex, MKL_Complex8,  c, ColMajor, LAPACK_COL_MAJOR)
+
+EIGEN_MKL_QR_COLPIV(double,   double,        d, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_MKL_QR_COLPIV(float,    float,         s, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_MKL_QR_COLPIV(dcomplex, MKL_Complex16, z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_MKL_QR_COLPIV(scomplex, MKL_Complex8,  c, RowMajor, LAPACK_ROW_MAJOR)
+
+} // end namespace Eigen
+
+#endif // EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
diff --git a/include/eigen3/Eigen/src/QR/FullPivHouseholderQR.h b/include/eigen3/Eigen/src/QR/FullPivHouseholderQR.h
new file mode 100644
index 0000000..6168e7a
--- /dev/null
+++ b/include/eigen3/Eigen/src/QR/FullPivHouseholderQR.h
@@ -0,0 +1,614 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FULLPIVOTINGHOUSEHOLDERQR_H
+#define EIGEN_FULLPIVOTINGHOUSEHOLDERQR_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType;
+
+template<typename MatrixType>
+struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
+{
+  typedef typename MatrixType::PlainObject ReturnType;
+};
+
+}
+
+/** \ingroup QR_Module
+  *
+  * \class FullPivHouseholderQR
+  *
+  * \brief Householder rank-revealing QR decomposition of a matrix with full pivoting
+  *
+  * \param MatrixType the type of the matrix of which we are computing the QR decomposition
+  *
+  * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R
+  * such that 
+  * \f[
+  *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \, \mathbf{R}
+  * \f]
+  * by using Householder transformations. Here, \b P is a permutation matrix, \b Q a unitary matrix and \b R an 
+  * upper triangular matrix.
+  *
+  * This decomposition performs a very prudent full pivoting in order to be rank-revealing and achieve optimal
+  * numerical stability. The trade-off is that it is slower than HouseholderQR and ColPivHouseholderQR.
+  *
+  * \sa MatrixBase::fullPivHouseholderQr()
+  */
+template<typename _MatrixType> class FullPivHouseholderQR
+{
+  public:
+
+    typedef _MatrixType MatrixType;
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      Options = MatrixType::Options,
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+    typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType> MatrixQReturnType;
+    typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
+    typedef Matrix<Index, 1,
+                   EIGEN_SIZE_MIN_PREFER_DYNAMIC(ColsAtCompileTime,RowsAtCompileTime), RowMajor, 1,
+                   EIGEN_SIZE_MIN_PREFER_FIXED(MaxColsAtCompileTime,MaxRowsAtCompileTime)> IntDiagSizeVectorType;
+    typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
+    typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
+    typedef typename internal::plain_col_type<MatrixType>::type ColVectorType;
+
+    /** \brief Default Constructor.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via FullPivHouseholderQR::compute(const MatrixType&).
+      */
+    FullPivHouseholderQR()
+      : m_qr(),
+        m_hCoeffs(),
+        m_rows_transpositions(),
+        m_cols_transpositions(),
+        m_cols_permutation(),
+        m_temp(),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false) {}
+
+    /** \brief Default Constructor with memory preallocation
+      *
+      * Like the default constructor but with preallocation of the internal data
+      * according to the specified problem \a size.
+      * \sa FullPivHouseholderQR()
+      */
+    FullPivHouseholderQR(Index rows, Index cols)
+      : m_qr(rows, cols),
+        m_hCoeffs((std::min)(rows,cols)),
+        m_rows_transpositions((std::min)(rows,cols)),
+        m_cols_transpositions((std::min)(rows,cols)),
+        m_cols_permutation(cols),
+        m_temp(cols),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false) {}
+
+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This constructor computes the QR factorization of the matrix \a matrix by calling
+      * the method compute(). It is a short cut for:
+      * 
+      * \code
+      * FullPivHouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
+      * qr.compute(matrix);
+      * \endcode
+      * 
+      * \sa compute()
+      */
+    FullPivHouseholderQR(const MatrixType& matrix)
+      : m_qr(matrix.rows(), matrix.cols()),
+        m_hCoeffs((std::min)(matrix.rows(), matrix.cols())),
+        m_rows_transpositions((std::min)(matrix.rows(), matrix.cols())),
+        m_cols_transpositions((std::min)(matrix.rows(), matrix.cols())),
+        m_cols_permutation(matrix.cols()),
+        m_temp(matrix.cols()),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false)
+    {
+      compute(matrix);
+    }
+
+    /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
+      * \c *this is the QR decomposition.
+      *
+      * \param b the right-hand-side of the equation to solve.
+      *
+      * \returns the exact or least-square solution if the rank is greater or equal to the number of columns of A,
+      * and an arbitrary solution otherwise.
+      *
+      * \note The case where b is a matrix is not yet implemented. Also, this
+      *       code is space inefficient.
+      *
+      * \note_about_checking_solutions
+      *
+      * \note_about_arbitrary_choice_of_solution
+      *
+      * Example: \include FullPivHouseholderQR_solve.cpp
+      * Output: \verbinclude FullPivHouseholderQR_solve.out
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<FullPivHouseholderQR, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+      return internal::solve_retval<FullPivHouseholderQR, Rhs>(*this, b.derived());
+    }
+
+    /** \returns Expression object representing the matrix Q
+      */
+    MatrixQReturnType matrixQ(void) const;
+
+    /** \returns a reference to the matrix where the Householder QR decomposition is stored
+      */
+    const MatrixType& matrixQR() const
+    {
+      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+      return m_qr;
+    }
+
+    FullPivHouseholderQR& compute(const MatrixType& matrix);
+
+    /** \returns a const reference to the column permutation matrix */
+    const PermutationType& colsPermutation() const
+    {
+      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+      return m_cols_permutation;
+    }
+
+    /** \returns a const reference to the vector of indices representing the rows transpositions */
+    const IntDiagSizeVectorType& rowsTranspositions() const
+    {
+      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+      return m_rows_transpositions;
+    }
+
+    /** \returns the absolute value of the determinant of the matrix of which
+      * *this is the QR decomposition. It has only linear complexity
+      * (that is, O(n) where n is the dimension of the square matrix)
+      * as the QR decomposition has already been computed.
+      *
+      * \note This is only for square matrices.
+      *
+      * \warning a determinant can be very big or small, so for matrices
+      * of large enough dimension, there is a risk of overflow/underflow.
+      * One way to work around that is to use logAbsDeterminant() instead.
+      *
+      * \sa logAbsDeterminant(), MatrixBase::determinant()
+      */
+    typename MatrixType::RealScalar absDeterminant() const;
+
+    /** \returns the natural log of the absolute value of the determinant of the matrix of which
+      * *this is the QR decomposition. It has only linear complexity
+      * (that is, O(n) where n is the dimension of the square matrix)
+      * as the QR decomposition has already been computed.
+      *
+      * \note This is only for square matrices.
+      *
+      * \note This method is useful to work around the risk of overflow/underflow that's inherent
+      * to determinant computation.
+      *
+      * \sa absDeterminant(), MatrixBase::determinant()
+      */
+    typename MatrixType::RealScalar logAbsDeterminant() const;
+
+    /** \returns the rank of the matrix of which *this is the QR decomposition.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline Index rank() const
+    {
+      using std::abs;
+      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+      RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold();
+      Index result = 0;
+      for(Index i = 0; i < m_nonzero_pivots; ++i)
+        result += (abs(m_qr.coeff(i,i)) > premultiplied_threshold);
+      return result;
+    }
+
+    /** \returns the dimension of the kernel of the matrix of which *this is the QR decomposition.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline Index dimensionOfKernel() const
+    {
+      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+      return cols() - rank();
+    }
+
+    /** \returns true if the matrix of which *this is the QR decomposition represents an injective
+      *          linear map, i.e. has trivial kernel; false otherwise.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline bool isInjective() const
+    {
+      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+      return rank() == cols();
+    }
+
+    /** \returns true if the matrix of which *this is the QR decomposition represents a surjective
+      *          linear map; false otherwise.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline bool isSurjective() const
+    {
+      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+      return rank() == rows();
+    }
+
+    /** \returns true if the matrix of which *this is the QR decomposition is invertible.
+      *
+      * \note This method has to determine which pivots should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline bool isInvertible() const
+    {
+      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+      return isInjective() && isSurjective();
+    }
+
+    /** \returns the inverse of the matrix of which *this is the QR decomposition.
+      *
+      * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
+      *       Use isInvertible() to first determine whether this matrix is invertible.
+      */    inline const
+    internal::solve_retval<FullPivHouseholderQR, typename MatrixType::IdentityReturnType>
+    inverse() const
+    {
+      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+      return internal::solve_retval<FullPivHouseholderQR,typename MatrixType::IdentityReturnType>
+               (*this, MatrixType::Identity(m_qr.rows(), m_qr.cols()));
+    }
+
+    inline Index rows() const { return m_qr.rows(); }
+    inline Index cols() const { return m_qr.cols(); }
+    
+    /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
+      * 
+      * For advanced uses only.
+      */
+    const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
+
+    /** Allows to prescribe a threshold to be used by certain methods, such as rank(),
+      * who need to determine when pivots are to be considered nonzero. This is not used for the
+      * QR decomposition itself.
+      *
+      * When it needs to get the threshold value, Eigen calls threshold(). By default, this
+      * uses a formula to automatically determine a reasonable threshold.
+      * Once you have called the present method setThreshold(const RealScalar&),
+      * your value is used instead.
+      *
+      * \param threshold The new value to use as the threshold.
+      *
+      * A pivot will be considered nonzero if its absolute value is strictly greater than
+      *  \f$ \vert pivot \vert \leqslant threshold \times \vert maxpivot \vert \f$
+      * where maxpivot is the biggest pivot.
+      *
+      * If you want to come back to the default behavior, call setThreshold(Default_t)
+      */
+    FullPivHouseholderQR& setThreshold(const RealScalar& threshold)
+    {
+      m_usePrescribedThreshold = true;
+      m_prescribedThreshold = threshold;
+      return *this;
+    }
+
+    /** Allows to come back to the default behavior, letting Eigen use its default formula for
+      * determining the threshold.
+      *
+      * You should pass the special object Eigen::Default as parameter here.
+      * \code qr.setThreshold(Eigen::Default); \endcode
+      *
+      * See the documentation of setThreshold(const RealScalar&).
+      */
+    FullPivHouseholderQR& setThreshold(Default_t)
+    {
+      m_usePrescribedThreshold = false;
+      return *this;
+    }
+
+    /** Returns the threshold that will be used by certain methods such as rank().
+      *
+      * See the documentation of setThreshold(const RealScalar&).
+      */
+    RealScalar threshold() const
+    {
+      eigen_assert(m_isInitialized || m_usePrescribedThreshold);
+      return m_usePrescribedThreshold ? m_prescribedThreshold
+      // this formula comes from experimenting (see "LU precision tuning" thread on the list)
+      // and turns out to be identical to Higham's formula used already in LDLt.
+                                      : NumTraits<Scalar>::epsilon() * RealScalar(m_qr.diagonalSize());
+    }
+
+    /** \returns the number of nonzero pivots in the QR decomposition.
+      * Here nonzero is meant in the exact sense, not in a fuzzy sense.
+      * So that notion isn't really intrinsically interesting, but it is
+      * still useful when implementing algorithms.
+      *
+      * \sa rank()
+      */
+    inline Index nonzeroPivots() const
+    {
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      return m_nonzero_pivots;
+    }
+
+    /** \returns the absolute value of the biggest pivot, i.e. the biggest
+      *          diagonal coefficient of U.
+      */
+    RealScalar maxPivot() const { return m_maxpivot; }
+
+  protected:
+    MatrixType m_qr;
+    HCoeffsType m_hCoeffs;
+    IntDiagSizeVectorType m_rows_transpositions;
+    IntDiagSizeVectorType m_cols_transpositions;
+    PermutationType m_cols_permutation;
+    RowVectorType m_temp;
+    bool m_isInitialized, m_usePrescribedThreshold;
+    RealScalar m_prescribedThreshold, m_maxpivot;
+    Index m_nonzero_pivots;
+    RealScalar m_precision;
+    Index m_det_pq;
+};
+
+template<typename MatrixType>
+typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType>::absDeterminant() const
+{
+  using std::abs;
+  eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+  eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
+  return abs(m_qr.diagonal().prod());
+}
+
+template<typename MatrixType>
+typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType>::logAbsDeterminant() const
+{
+  eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+  eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
+  return m_qr.diagonal().cwiseAbs().array().log().sum();
+}
+
+/** Performs the QR factorization of the given matrix \a matrix. The result of
+  * the factorization is stored into \c *this, and a reference to \c *this
+  * is returned.
+  *
+  * \sa class FullPivHouseholderQR, FullPivHouseholderQR(const MatrixType&)
+  */
+template<typename MatrixType>
+FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(const MatrixType& matrix)
+{
+  using std::abs;
+  Index rows = matrix.rows();
+  Index cols = matrix.cols();
+  Index size = (std::min)(rows,cols);
+
+  m_qr = matrix;
+  m_hCoeffs.resize(size);
+
+  m_temp.resize(cols);
+
+  m_precision = NumTraits<Scalar>::epsilon() * RealScalar(size);
+
+  m_rows_transpositions.resize(size);
+  m_cols_transpositions.resize(size);
+  Index number_of_transpositions = 0;
+
+  RealScalar biggest(0);
+
+  m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
+  m_maxpivot = RealScalar(0);
+
+  for (Index k = 0; k < size; ++k)
+  {
+    Index row_of_biggest_in_corner, col_of_biggest_in_corner;
+    RealScalar biggest_in_corner;
+
+    biggest_in_corner = m_qr.bottomRightCorner(rows-k, cols-k)
+                            .cwiseAbs()
+                            .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
+    row_of_biggest_in_corner += k;
+    col_of_biggest_in_corner += k;
+    if(k==0) biggest = biggest_in_corner;
+
+    // if the corner is negligible, then we have less than full rank, and we can finish early
+    if(internal::isMuchSmallerThan(biggest_in_corner, biggest, m_precision))
+    {
+      m_nonzero_pivots = k;
+      for(Index i = k; i < size; i++)
+      {
+        m_rows_transpositions.coeffRef(i) = i;
+        m_cols_transpositions.coeffRef(i) = i;
+        m_hCoeffs.coeffRef(i) = Scalar(0);
+      }
+      break;
+    }
+
+    m_rows_transpositions.coeffRef(k) = row_of_biggest_in_corner;
+    m_cols_transpositions.coeffRef(k) = col_of_biggest_in_corner;
+    if(k != row_of_biggest_in_corner) {
+      m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k));
+      ++number_of_transpositions;
+    }
+    if(k != col_of_biggest_in_corner) {
+      m_qr.col(k).swap(m_qr.col(col_of_biggest_in_corner));
+      ++number_of_transpositions;
+    }
+
+    RealScalar beta;
+    m_qr.col(k).tail(rows-k).makeHouseholderInPlace(m_hCoeffs.coeffRef(k), beta);
+    m_qr.coeffRef(k,k) = beta;
+
+    // remember the maximum absolute value of diagonal coefficients
+    if(abs(beta) > m_maxpivot) m_maxpivot = abs(beta);
+
+    m_qr.bottomRightCorner(rows-k, cols-k-1)
+        .applyHouseholderOnTheLeft(m_qr.col(k).tail(rows-k-1), m_hCoeffs.coeffRef(k), &m_temp.coeffRef(k+1));
+  }
+
+  m_cols_permutation.setIdentity(cols);
+  for(Index k = 0; k < size; ++k)
+    m_cols_permutation.applyTranspositionOnTheRight(k, m_cols_transpositions.coeff(k));
+
+  m_det_pq = (number_of_transpositions%2) ? -1 : 1;
+  m_isInitialized = true;
+
+  return *this;
+}
+
+namespace internal {
+
+template<typename _MatrixType, typename Rhs>
+struct solve_retval<FullPivHouseholderQR<_MatrixType>, Rhs>
+  : solve_retval_base<FullPivHouseholderQR<_MatrixType>, Rhs>
+{
+  EIGEN_MAKE_SOLVE_HELPERS(FullPivHouseholderQR<_MatrixType>,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    const Index rows = dec().rows(), cols = dec().cols();
+    eigen_assert(rhs().rows() == rows);
+
+    // FIXME introduce nonzeroPivots() and use it here. and more generally,
+    // make the same improvements in this dec as in FullPivLU.
+    if(dec().rank()==0)
+    {
+      dst.setZero();
+      return;
+    }
+
+    typename Rhs::PlainObject c(rhs());
+
+    Matrix<Scalar,1,Rhs::ColsAtCompileTime> temp(rhs().cols());
+    for (Index k = 0; k < dec().rank(); ++k)
+    {
+      Index remainingSize = rows-k;
+      c.row(k).swap(c.row(dec().rowsTranspositions().coeff(k)));
+      c.bottomRightCorner(remainingSize, rhs().cols())
+       .applyHouseholderOnTheLeft(dec().matrixQR().col(k).tail(remainingSize-1),
+                                  dec().hCoeffs().coeff(k), &temp.coeffRef(0));
+    }
+
+    dec().matrixQR()
+       .topLeftCorner(dec().rank(), dec().rank())
+       .template triangularView<Upper>()
+       .solveInPlace(c.topRows(dec().rank()));
+
+    for(Index i = 0; i < dec().rank(); ++i) dst.row(dec().colsPermutation().indices().coeff(i)) = c.row(i);
+    for(Index i = dec().rank(); i < cols; ++i) dst.row(dec().colsPermutation().indices().coeff(i)).setZero();
+  }
+};
+
+/** \ingroup QR_Module
+  *
+  * \brief Expression type for return value of FullPivHouseholderQR::matrixQ()
+  *
+  * \tparam MatrixType type of underlying dense matrix
+  */
+template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType
+  : public ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
+{
+public:
+  typedef typename MatrixType::Index Index;
+  typedef typename FullPivHouseholderQR<MatrixType>::IntDiagSizeVectorType IntDiagSizeVectorType;
+  typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
+  typedef Matrix<typename MatrixType::Scalar, 1, MatrixType::RowsAtCompileTime, RowMajor, 1,
+                 MatrixType::MaxRowsAtCompileTime> WorkVectorType;
+
+  FullPivHouseholderQRMatrixQReturnType(const MatrixType&       qr,
+                                        const HCoeffsType&      hCoeffs,
+                                        const IntDiagSizeVectorType& rowsTranspositions)
+    : m_qr(qr),
+      m_hCoeffs(hCoeffs),
+      m_rowsTranspositions(rowsTranspositions)
+      {}
+
+  template <typename ResultType>
+  void evalTo(ResultType& result) const
+  {
+    const Index rows = m_qr.rows();
+    WorkVectorType workspace(rows);
+    evalTo(result, workspace);
+  }
+
+  template <typename ResultType>
+  void evalTo(ResultType& result, WorkVectorType& workspace) const
+  {
+    using numext::conj;
+    // compute the product H'_0 H'_1 ... H'_n-1,
+    // where H_k is the k-th Householder transformation I - h_k v_k v_k'
+    // and v_k is the k-th Householder vector [1,m_qr(k+1,k), m_qr(k+2,k), ...]
+    const Index rows = m_qr.rows();
+    const Index cols = m_qr.cols();
+    const Index size = (std::min)(rows, cols);
+    workspace.resize(rows);
+    result.setIdentity(rows, rows);
+    for (Index k = size-1; k >= 0; k--)
+    {
+      result.block(k, k, rows-k, rows-k)
+            .applyHouseholderOnTheLeft(m_qr.col(k).tail(rows-k-1), conj(m_hCoeffs.coeff(k)), &workspace.coeffRef(k));
+      result.row(k).swap(result.row(m_rowsTranspositions.coeff(k)));
+    }
+  }
+
+    Index rows() const { return m_qr.rows(); }
+    Index cols() const { return m_qr.rows(); }
+
+protected:
+  typename MatrixType::Nested m_qr;
+  typename HCoeffsType::Nested m_hCoeffs;
+  typename IntDiagSizeVectorType::Nested m_rowsTranspositions;
+};
+
+} // end namespace internal
+
+template<typename MatrixType>
+inline typename FullPivHouseholderQR<MatrixType>::MatrixQReturnType FullPivHouseholderQR<MatrixType>::matrixQ() const
+{
+  eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+  return MatrixQReturnType(m_qr, m_hCoeffs, m_rows_transpositions);
+}
+
+/** \return the full-pivoting Householder QR decomposition of \c *this.
+  *
+  * \sa class FullPivHouseholderQR
+  */
+template<typename Derived>
+const FullPivHouseholderQR<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::fullPivHouseholderQr() const
+{
+  return FullPivHouseholderQR<PlainObject>(eval());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_FULLPIVOTINGHOUSEHOLDERQR_H
diff --git a/include/eigen3/Eigen/src/QR/HouseholderQR.h b/include/eigen3/Eigen/src/QR/HouseholderQR.h
new file mode 100644
index 0000000..abc61bc
--- /dev/null
+++ b/include/eigen3/Eigen/src/QR/HouseholderQR.h
@@ -0,0 +1,374 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+// Copyright (C) 2010 Vincent Lejeune
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_QR_H
+#define EIGEN_QR_H
+
+namespace Eigen { 
+
+/** \ingroup QR_Module
+  *
+  *
+  * \class HouseholderQR
+  *
+  * \brief Householder QR decomposition of a matrix
+  *
+  * \param MatrixType the type of the matrix of which we are computing the QR decomposition
+  *
+  * This class performs a QR decomposition of a matrix \b A into matrices \b Q and \b R
+  * such that 
+  * \f[
+  *  \mathbf{A} = \mathbf{Q} \, \mathbf{R}
+  * \f]
+  * by using Householder transformations. Here, \b Q a unitary matrix and \b R an upper triangular matrix.
+  * The result is stored in a compact way compatible with LAPACK.
+  *
+  * Note that no pivoting is performed. This is \b not a rank-revealing decomposition.
+  * If you want that feature, use FullPivHouseholderQR or ColPivHouseholderQR instead.
+  *
+  * This Householder QR decomposition is faster, but less numerically stable and less feature-full than
+  * FullPivHouseholderQR or ColPivHouseholderQR.
+  *
+  * \sa MatrixBase::householderQr()
+  */
+template<typename _MatrixType> class HouseholderQR
+{
+  public:
+
+    typedef _MatrixType MatrixType;
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      Options = MatrixType::Options,
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+    typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, (MatrixType::Flags&RowMajorBit) ? RowMajor : ColMajor, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
+    typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
+    typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
+    typedef HouseholderSequence<MatrixType,typename internal::remove_all<typename HCoeffsType::ConjugateReturnType>::type> HouseholderSequenceType;
+
+    /**
+      * \brief Default Constructor.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via HouseholderQR::compute(const MatrixType&).
+      */
+    HouseholderQR() : m_qr(), m_hCoeffs(), m_temp(), m_isInitialized(false) {}
+
+    /** \brief Default Constructor with memory preallocation
+      *
+      * Like the default constructor but with preallocation of the internal data
+      * according to the specified problem \a size.
+      * \sa HouseholderQR()
+      */
+    HouseholderQR(Index rows, Index cols)
+      : m_qr(rows, cols),
+        m_hCoeffs((std::min)(rows,cols)),
+        m_temp(cols),
+        m_isInitialized(false) {}
+
+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This constructor computes the QR factorization of the matrix \a matrix by calling
+      * the method compute(). It is a short cut for:
+      * 
+      * \code
+      * HouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
+      * qr.compute(matrix);
+      * \endcode
+      * 
+      * \sa compute()
+      */
+    HouseholderQR(const MatrixType& matrix)
+      : m_qr(matrix.rows(), matrix.cols()),
+        m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
+        m_temp(matrix.cols()),
+        m_isInitialized(false)
+    {
+      compute(matrix);
+    }
+
+    /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
+      * *this is the QR decomposition, if any exists.
+      *
+      * \param b the right-hand-side of the equation to solve.
+      *
+      * \returns a solution.
+      *
+      * \note The case where b is a matrix is not yet implemented. Also, this
+      *       code is space inefficient.
+      *
+      * \note_about_checking_solutions
+      *
+      * \note_about_arbitrary_choice_of_solution
+      *
+      * Example: \include HouseholderQR_solve.cpp
+      * Output: \verbinclude HouseholderQR_solve.out
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<HouseholderQR, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
+      return internal::solve_retval<HouseholderQR, Rhs>(*this, b.derived());
+    }
+
+    /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations.
+      *
+      * The returned expression can directly be used to perform matrix products. It can also be assigned to a dense Matrix object.
+      * Here is an example showing how to recover the full or thin matrix Q, as well as how to perform matrix products using operator*:
+      *
+      * Example: \include HouseholderQR_householderQ.cpp
+      * Output: \verbinclude HouseholderQR_householderQ.out
+      */
+    HouseholderSequenceType householderQ() const
+    {
+      eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
+      return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate());
+    }
+
+    /** \returns a reference to the matrix where the Householder QR decomposition is stored
+      * in a LAPACK-compatible way.
+      */
+    const MatrixType& matrixQR() const
+    {
+        eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
+        return m_qr;
+    }
+
+    HouseholderQR& compute(const MatrixType& matrix);
+
+    /** \returns the absolute value of the determinant of the matrix of which
+      * *this is the QR decomposition. It has only linear complexity
+      * (that is, O(n) where n is the dimension of the square matrix)
+      * as the QR decomposition has already been computed.
+      *
+      * \note This is only for square matrices.
+      *
+      * \warning a determinant can be very big or small, so for matrices
+      * of large enough dimension, there is a risk of overflow/underflow.
+      * One way to work around that is to use logAbsDeterminant() instead.
+      *
+      * \sa logAbsDeterminant(), MatrixBase::determinant()
+      */
+    typename MatrixType::RealScalar absDeterminant() const;
+
+    /** \returns the natural log of the absolute value of the determinant of the matrix of which
+      * *this is the QR decomposition. It has only linear complexity
+      * (that is, O(n) where n is the dimension of the square matrix)
+      * as the QR decomposition has already been computed.
+      *
+      * \note This is only for square matrices.
+      *
+      * \note This method is useful to work around the risk of overflow/underflow that's inherent
+      * to determinant computation.
+      *
+      * \sa absDeterminant(), MatrixBase::determinant()
+      */
+    typename MatrixType::RealScalar logAbsDeterminant() const;
+
+    inline Index rows() const { return m_qr.rows(); }
+    inline Index cols() const { return m_qr.cols(); }
+    
+    /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
+      * 
+      * For advanced uses only.
+      */
+    const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
+
+  protected:
+    MatrixType m_qr;
+    HCoeffsType m_hCoeffs;
+    RowVectorType m_temp;
+    bool m_isInitialized;
+};
+
+template<typename MatrixType>
+typename MatrixType::RealScalar HouseholderQR<MatrixType>::absDeterminant() const
+{
+  using std::abs;
+  eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
+  eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
+  return abs(m_qr.diagonal().prod());
+}
+
+template<typename MatrixType>
+typename MatrixType::RealScalar HouseholderQR<MatrixType>::logAbsDeterminant() const
+{
+  eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
+  eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
+  return m_qr.diagonal().cwiseAbs().array().log().sum();
+}
+
+namespace internal {
+
+/** \internal */
+template<typename MatrixQR, typename HCoeffs>
+void householder_qr_inplace_unblocked(MatrixQR& mat, HCoeffs& hCoeffs, typename MatrixQR::Scalar* tempData = 0)
+{
+  typedef typename MatrixQR::Index Index;
+  typedef typename MatrixQR::Scalar Scalar;
+  typedef typename MatrixQR::RealScalar RealScalar;
+  Index rows = mat.rows();
+  Index cols = mat.cols();
+  Index size = (std::min)(rows,cols);
+
+  eigen_assert(hCoeffs.size() == size);
+
+  typedef Matrix<Scalar,MatrixQR::ColsAtCompileTime,1> TempType;
+  TempType tempVector;
+  if(tempData==0)
+  {
+    tempVector.resize(cols);
+    tempData = tempVector.data();
+  }
+
+  for(Index k = 0; k < size; ++k)
+  {
+    Index remainingRows = rows - k;
+    Index remainingCols = cols - k - 1;
+
+    RealScalar beta;
+    mat.col(k).tail(remainingRows).makeHouseholderInPlace(hCoeffs.coeffRef(k), beta);
+    mat.coeffRef(k,k) = beta;
+
+    // apply H to remaining part of m_qr from the left
+    mat.bottomRightCorner(remainingRows, remainingCols)
+        .applyHouseholderOnTheLeft(mat.col(k).tail(remainingRows-1), hCoeffs.coeffRef(k), tempData+k+1);
+  }
+}
+
+/** \internal */
+template<typename MatrixQR, typename HCoeffs>
+void householder_qr_inplace_blocked(MatrixQR& mat, HCoeffs& hCoeffs,
+                                       typename MatrixQR::Index maxBlockSize=32,
+                                       typename MatrixQR::Scalar* tempData = 0)
+{
+  typedef typename MatrixQR::Index Index;
+  typedef typename MatrixQR::Scalar Scalar;
+  typedef Block<MatrixQR,Dynamic,Dynamic> BlockType;
+
+  Index rows = mat.rows();
+  Index cols = mat.cols();
+  Index size = (std::min)(rows, cols);
+
+  typedef Matrix<Scalar,Dynamic,1,ColMajor,MatrixQR::MaxColsAtCompileTime,1> TempType;
+  TempType tempVector;
+  if(tempData==0)
+  {
+    tempVector.resize(cols);
+    tempData = tempVector.data();
+  }
+
+  Index blockSize = (std::min)(maxBlockSize,size);
+
+  Index k = 0;
+  for (k = 0; k < size; k += blockSize)
+  {
+    Index bs = (std::min)(size-k,blockSize);  // actual size of the block
+    Index tcols = cols - k - bs;            // trailing columns
+    Index brows = rows-k;                   // rows of the block
+
+    // partition the matrix:
+    //        A00 | A01 | A02
+    // mat  = A10 | A11 | A12
+    //        A20 | A21 | A22
+    // and performs the qr dec of [A11^T A12^T]^T
+    // and update [A21^T A22^T]^T using level 3 operations.
+    // Finally, the algorithm continue on A22
+
+    BlockType A11_21 = mat.block(k,k,brows,bs);
+    Block<HCoeffs,Dynamic,1> hCoeffsSegment = hCoeffs.segment(k,bs);
+
+    householder_qr_inplace_unblocked(A11_21, hCoeffsSegment, tempData);
+
+    if(tcols)
+    {
+      BlockType A21_22 = mat.block(k,k+bs,brows,tcols);
+      apply_block_householder_on_the_left(A21_22,A11_21,hCoeffsSegment.adjoint());
+    }
+  }
+}
+
+template<typename _MatrixType, typename Rhs>
+struct solve_retval<HouseholderQR<_MatrixType>, Rhs>
+  : solve_retval_base<HouseholderQR<_MatrixType>, Rhs>
+{
+  EIGEN_MAKE_SOLVE_HELPERS(HouseholderQR<_MatrixType>,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    const Index rows = dec().rows(), cols = dec().cols();
+    const Index rank = (std::min)(rows, cols);
+    eigen_assert(rhs().rows() == rows);
+
+    typename Rhs::PlainObject c(rhs());
+
+    // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
+    c.applyOnTheLeft(householderSequence(
+      dec().matrixQR().leftCols(rank),
+      dec().hCoeffs().head(rank)).transpose()
+    );
+
+    dec().matrixQR()
+       .topLeftCorner(rank, rank)
+       .template triangularView<Upper>()
+       .solveInPlace(c.topRows(rank));
+
+    dst.topRows(rank) = c.topRows(rank);
+    dst.bottomRows(cols-rank).setZero();
+  }
+};
+
+} // end namespace internal
+
+/** Performs the QR factorization of the given matrix \a matrix. The result of
+  * the factorization is stored into \c *this, and a reference to \c *this
+  * is returned.
+  *
+  * \sa class HouseholderQR, HouseholderQR(const MatrixType&)
+  */
+template<typename MatrixType>
+HouseholderQR<MatrixType>& HouseholderQR<MatrixType>::compute(const MatrixType& matrix)
+{
+  Index rows = matrix.rows();
+  Index cols = matrix.cols();
+  Index size = (std::min)(rows,cols);
+
+  m_qr = matrix;
+  m_hCoeffs.resize(size);
+
+  m_temp.resize(cols);
+
+  internal::householder_qr_inplace_blocked(m_qr, m_hCoeffs, 48, m_temp.data());
+
+  m_isInitialized = true;
+  return *this;
+}
+
+/** \return the Householder QR decomposition of \c *this.
+  *
+  * \sa class HouseholderQR
+  */
+template<typename Derived>
+const HouseholderQR<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::householderQr() const
+{
+  return HouseholderQR<PlainObject>(eval());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_QR_H
diff --git a/include/eigen3/Eigen/src/QR/HouseholderQR_MKL.h b/include/eigen3/Eigen/src/QR/HouseholderQR_MKL.h
new file mode 100644
index 0000000..5313de6
--- /dev/null
+++ b/include/eigen3/Eigen/src/QR/HouseholderQR_MKL.h
@@ -0,0 +1,69 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *    Householder QR decomposition of a matrix w/o pivoting based on
+ *    LAPACKE_?geqrf function.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_QR_MKL_H
+#define EIGEN_QR_MKL_H
+
+#include "Eigen/src/Core/util/MKL_support.h"
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal Specialization for the data types supported by MKL */
+
+#define EIGEN_MKL_QR_NOPIV(EIGTYPE, MKLTYPE, MKLPREFIX) \
+template<typename MatrixQR, typename HCoeffs> \
+void householder_qr_inplace_blocked(MatrixQR& mat, HCoeffs& hCoeffs, \
+                                       typename MatrixQR::Index maxBlockSize=32, \
+                                       EIGTYPE* tempData = 0) \
+{ \
+  lapack_int m = mat.rows(); \
+  lapack_int n = mat.cols(); \
+  lapack_int lda = mat.outerStride(); \
+  lapack_int matrix_order = (MatrixQR::IsRowMajor) ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
+  LAPACKE_##MKLPREFIX##geqrf( matrix_order, m, n, (MKLTYPE*)mat.data(), lda, (MKLTYPE*)hCoeffs.data()); \
+  hCoeffs.adjointInPlace(); \
+\
+}
+
+EIGEN_MKL_QR_NOPIV(double, double, d)
+EIGEN_MKL_QR_NOPIV(float, float, s)
+EIGEN_MKL_QR_NOPIV(dcomplex, MKL_Complex16, z)
+EIGEN_MKL_QR_NOPIV(scomplex, MKL_Complex8, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_QR_MKL_H
diff --git a/include/eigen3/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/include/eigen3/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
new file mode 100644
index 0000000..a2cc2a9
--- /dev/null
+++ b/include/eigen3/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
@@ -0,0 +1,314 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Desire Nuentsa <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SUITESPARSEQRSUPPORT_H
+#define EIGEN_SUITESPARSEQRSUPPORT_H
+
+namespace Eigen {
+  
+  template<typename MatrixType> class SPQR; 
+  template<typename SPQRType> struct SPQRMatrixQReturnType; 
+  template<typename SPQRType> struct SPQRMatrixQTransposeReturnType; 
+  template <typename SPQRType, typename Derived> struct SPQR_QProduct;
+  namespace internal {
+    template <typename SPQRType> struct traits<SPQRMatrixQReturnType<SPQRType> >
+    {
+      typedef typename SPQRType::MatrixType ReturnType;
+    };
+    template <typename SPQRType> struct traits<SPQRMatrixQTransposeReturnType<SPQRType> >
+    {
+      typedef typename SPQRType::MatrixType ReturnType;
+    };
+    template <typename SPQRType, typename Derived> struct traits<SPQR_QProduct<SPQRType, Derived> >
+    {
+      typedef typename Derived::PlainObject ReturnType;
+    };
+  } // End namespace internal
+  
+/**
+ * \ingroup SPQRSupport_Module
+ * \class SPQR
+ * \brief Sparse QR factorization based on SuiteSparseQR library
+ * 
+ * This class is used to perform a multithreaded and multifrontal rank-revealing QR decomposition 
+ * of sparse matrices. The result is then used to solve linear leasts_square systems.
+ * Clearly, a QR factorization is returned such that A*P = Q*R where :
+ * 
+ * P is the column permutation. Use colsPermutation() to get it.
+ * 
+ * Q is the orthogonal matrix represented as Householder reflectors. 
+ * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose.
+ * You can then apply it to a vector.
+ * 
+ * R is the sparse triangular factor. Use matrixQR() to get it as SparseMatrix.
+ * NOTE : The Index type of R is always UF_long. You can get it with SPQR::Index
+ * 
+ * \tparam _MatrixType The type of the sparse matrix A, must be a column-major SparseMatrix<>
+ * NOTE 
+ * 
+ */
+template<typename _MatrixType>
+class SPQR
+{
+  public:
+    typedef typename _MatrixType::Scalar Scalar;
+    typedef typename _MatrixType::RealScalar RealScalar;
+    typedef UF_long Index ; 
+    typedef SparseMatrix<Scalar, ColMajor, Index> MatrixType;
+    typedef PermutationMatrix<Dynamic, Dynamic> PermutationType;
+  public:
+    SPQR() 
+      : m_isInitialized(false),
+      m_ordering(SPQR_ORDERING_DEFAULT),
+      m_allow_tol(SPQR_DEFAULT_TOL),
+      m_tolerance (NumTraits<Scalar>::epsilon())
+    { 
+      cholmod_l_start(&m_cc);
+    }
+    
+    SPQR(const _MatrixType& matrix) 
+    : m_isInitialized(false),
+      m_ordering(SPQR_ORDERING_DEFAULT),
+      m_allow_tol(SPQR_DEFAULT_TOL),
+      m_tolerance (NumTraits<Scalar>::epsilon())
+    {
+      cholmod_l_start(&m_cc);
+      compute(matrix);
+    }
+    
+    ~SPQR()
+    {
+      SPQR_free();
+      cholmod_l_finish(&m_cc);
+    }
+    void SPQR_free()
+    {
+      cholmod_l_free_sparse(&m_H, &m_cc);
+      cholmod_l_free_sparse(&m_cR, &m_cc);
+      cholmod_l_free_dense(&m_HTau, &m_cc);
+      std::free(m_E);
+      std::free(m_HPinv);
+    }
+
+    void compute(const _MatrixType& matrix)
+    {
+      if(m_isInitialized) SPQR_free();
+
+      MatrixType mat(matrix);
+      cholmod_sparse A; 
+      A = viewAsCholmod(mat);
+      Index col = matrix.cols();
+      m_rank = SuiteSparseQR<Scalar>(m_ordering, m_tolerance, col, &A, 
+                             &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc);
+
+      if (!m_cR)
+      {
+        m_info = NumericalIssue; 
+        m_isInitialized = false;
+        return;
+      }
+      m_info = Success;
+      m_isInitialized = true;
+      m_isRUpToDate = false;
+    }
+    /** 
+     * Get the number of rows of the input matrix and the Q matrix
+     */
+    inline Index rows() const {return m_H->nrow; }
+    
+    /** 
+     * Get the number of columns of the input matrix. 
+     */
+    inline Index cols() const { return m_cR->ncol; }
+   
+      /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<SPQR, Rhs> solve(const MatrixBase<Rhs>& B) const 
+    {
+      eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
+      eigen_assert(this->rows()==B.rows()
+                    && "SPQR::solve(): invalid number of rows of the right hand side matrix B");
+          return internal::solve_retval<SPQR, Rhs>(*this, B.derived());
+    }
+    
+    template<typename Rhs, typename Dest>
+    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    {
+      eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
+      eigen_assert(b.cols()==1 && "This method is for vectors only");
+      
+      //Compute Q^T * b
+      typename Dest::PlainObject y;
+      y = matrixQ().transpose() * b;
+        // Solves with the triangular matrix R
+      Index rk = this->rank();
+      y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y.topRows(rk));
+      y.bottomRows(cols()-rk).setZero();
+      // Apply the column permutation 
+      dest.topRows(cols()) = colsPermutation() * y.topRows(cols());
+      
+      m_info = Success;
+    }
+    
+    /** \returns the sparse triangular factor R. It is a sparse matrix
+     */
+    const MatrixType matrixR() const
+    {
+      eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
+      if(!m_isRUpToDate) {
+        m_R = viewAsEigen<Scalar,ColMajor, typename MatrixType::Index>(*m_cR);
+        m_isRUpToDate = true;
+      }
+      return m_R;
+    }
+    /// Get an expression of the matrix Q
+    SPQRMatrixQReturnType<SPQR> matrixQ() const
+    {
+      return SPQRMatrixQReturnType<SPQR>(*this);
+    }
+    /// Get the permutation that was applied to columns of A
+    PermutationType colsPermutation() const
+    { 
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      Index n = m_cR->ncol;
+      PermutationType colsPerm(n);
+      for(Index j = 0; j <n; j++) colsPerm.indices()(j) = m_E[j];
+      return colsPerm; 
+      
+    }
+    /**
+     * Gets the rank of the matrix. 
+     * It should be equal to matrixQR().cols if the matrix is full-rank
+     */
+    Index rank() const
+    {
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return m_cc.SPQR_istat[4];
+    }
+    /// Set the fill-reducing ordering method to be used
+    void setSPQROrdering(int ord) { m_ordering = ord;}
+    /// Set the tolerance tol to treat columns with 2-norm < =tol as zero
+    void setPivotThreshold(const RealScalar& tol) { m_tolerance = tol; }
+    
+    /** \returns a pointer to the SPQR workspace */
+    cholmod_common *cholmodCommon() const { return &m_cc; }
+    
+    
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful,
+      *          \c NumericalIssue if the sparse QR can not be computed
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return m_info;
+    }
+  protected:
+    bool m_isInitialized;
+    bool m_analysisIsOk;
+    bool m_factorizationIsOk;
+    mutable bool m_isRUpToDate;
+    mutable ComputationInfo m_info;
+    int m_ordering; // Ordering method to use, see SPQR's manual
+    int m_allow_tol; // Allow to use some tolerance during numerical factorization.
+    RealScalar m_tolerance; // treat columns with 2-norm below this tolerance as zero
+    mutable cholmod_sparse *m_cR; // The sparse R factor in cholmod format
+    mutable MatrixType m_R; // The sparse matrix R in Eigen format
+    mutable Index *m_E; // The permutation applied to columns
+    mutable cholmod_sparse *m_H;  //The householder vectors
+    mutable Index *m_HPinv; // The row permutation of H
+    mutable cholmod_dense *m_HTau; // The Householder coefficients
+    mutable Index m_rank; // The rank of the matrix
+    mutable cholmod_common m_cc; // Workspace and parameters
+    template<typename ,typename > friend struct SPQR_QProduct;
+};
+
+template <typename SPQRType, typename Derived>
+struct SPQR_QProduct : ReturnByValue<SPQR_QProduct<SPQRType,Derived> >
+{
+  typedef typename SPQRType::Scalar Scalar;
+  typedef typename SPQRType::Index Index;
+  //Define the constructor to get reference to argument types
+  SPQR_QProduct(const SPQRType& spqr, const Derived& other, bool transpose) : m_spqr(spqr),m_other(other),m_transpose(transpose) {}
+  
+  inline Index rows() const { return m_transpose ? m_spqr.rows() : m_spqr.cols(); }
+  inline Index cols() const { return m_other.cols(); }
+  // Assign to a vector
+  template<typename ResType>
+  void evalTo(ResType& res) const
+  {
+    cholmod_dense y_cd;
+    cholmod_dense *x_cd; 
+    int method = m_transpose ? SPQR_QTX : SPQR_QX; 
+    cholmod_common *cc = m_spqr.cholmodCommon();
+    y_cd = viewAsCholmod(m_other.const_cast_derived());
+    x_cd = SuiteSparseQR_qmult<Scalar>(method, m_spqr.m_H, m_spqr.m_HTau, m_spqr.m_HPinv, &y_cd, cc);
+    res = Matrix<Scalar,ResType::RowsAtCompileTime,ResType::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x), x_cd->nrow, x_cd->ncol);
+    cholmod_l_free_dense(&x_cd, cc);
+  }
+  const SPQRType& m_spqr; 
+  const Derived& m_other; 
+  bool m_transpose; 
+  
+};
+template<typename SPQRType>
+struct SPQRMatrixQReturnType{
+  
+  SPQRMatrixQReturnType(const SPQRType& spqr) : m_spqr(spqr) {}
+  template<typename Derived>
+  SPQR_QProduct<SPQRType, Derived> operator*(const MatrixBase<Derived>& other)
+  {
+    return SPQR_QProduct<SPQRType,Derived>(m_spqr,other.derived(),false);
+  }
+  SPQRMatrixQTransposeReturnType<SPQRType> adjoint() const
+  {
+    return SPQRMatrixQTransposeReturnType<SPQRType>(m_spqr);
+  }
+  // To use for operations with the transpose of Q
+  SPQRMatrixQTransposeReturnType<SPQRType> transpose() const
+  {
+    return SPQRMatrixQTransposeReturnType<SPQRType>(m_spqr);
+  }
+  const SPQRType& m_spqr;
+};
+
+template<typename SPQRType>
+struct SPQRMatrixQTransposeReturnType{
+  SPQRMatrixQTransposeReturnType(const SPQRType& spqr) : m_spqr(spqr) {}
+  template<typename Derived>
+  SPQR_QProduct<SPQRType,Derived> operator*(const MatrixBase<Derived>& other)
+  {
+    return SPQR_QProduct<SPQRType,Derived>(m_spqr,other.derived(), true);
+  }
+  const SPQRType& m_spqr;
+};
+
+namespace internal {
+  
+template<typename _MatrixType, typename Rhs>
+struct solve_retval<SPQR<_MatrixType>, Rhs>
+  : solve_retval_base<SPQR<_MatrixType>, Rhs>
+{
+  typedef SPQR<_MatrixType> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+} // end namespace internal
+
+}// End namespace Eigen
+#endif
diff --git a/include/eigen3/Eigen/src/SVD/JacobiSVD.h b/include/eigen3/Eigen/src/SVD/JacobiSVD.h
new file mode 100644
index 0000000..c57f297
--- /dev/null
+++ b/include/eigen3/Eigen/src/SVD/JacobiSVD.h
@@ -0,0 +1,969 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_JACOBISVD_H
+#define EIGEN_JACOBISVD_H
+
+namespace Eigen { 
+
+namespace internal {
+// forward declaration (needed by ICC)
+// the empty body is required by MSVC
+template<typename MatrixType, int QRPreconditioner,
+         bool IsComplex = NumTraits<typename MatrixType::Scalar>::IsComplex>
+struct svd_precondition_2x2_block_to_be_real {};
+
+/*** QR preconditioners (R-SVD)
+ ***
+ *** Their role is to reduce the problem of computing the SVD to the case of a square matrix.
+ *** This approach, known as R-SVD, is an optimization for rectangular-enough matrices, and is a requirement for
+ *** JacobiSVD which by itself is only able to work on square matrices.
+ ***/
+
+enum { PreconditionIfMoreColsThanRows, PreconditionIfMoreRowsThanCols };
+
+template<typename MatrixType, int QRPreconditioner, int Case>
+struct qr_preconditioner_should_do_anything
+{
+  enum { a = MatrixType::RowsAtCompileTime != Dynamic &&
+             MatrixType::ColsAtCompileTime != Dynamic &&
+             MatrixType::ColsAtCompileTime <= MatrixType::RowsAtCompileTime,
+         b = MatrixType::RowsAtCompileTime != Dynamic &&
+             MatrixType::ColsAtCompileTime != Dynamic &&
+             MatrixType::RowsAtCompileTime <= MatrixType::ColsAtCompileTime,
+         ret = !( (QRPreconditioner == NoQRPreconditioner) ||
+                  (Case == PreconditionIfMoreColsThanRows && bool(a)) ||
+                  (Case == PreconditionIfMoreRowsThanCols && bool(b)) )
+  };
+};
+
+template<typename MatrixType, int QRPreconditioner, int Case,
+         bool DoAnything = qr_preconditioner_should_do_anything<MatrixType, QRPreconditioner, Case>::ret
+> struct qr_preconditioner_impl {};
+
+template<typename MatrixType, int QRPreconditioner, int Case>
+class qr_preconditioner_impl<MatrixType, QRPreconditioner, Case, false>
+{
+public:
+  typedef typename MatrixType::Index Index;
+  void allocate(const JacobiSVD<MatrixType, QRPreconditioner>&) {}
+  bool run(JacobiSVD<MatrixType, QRPreconditioner>&, const MatrixType&)
+  {
+    return false;
+  }
+};
+
+/*** preconditioner using FullPivHouseholderQR ***/
+
+template<typename MatrixType>
+class qr_preconditioner_impl<MatrixType, FullPivHouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
+{
+public:
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::Scalar Scalar;
+  enum
+  {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime
+  };
+  typedef Matrix<Scalar, 1, RowsAtCompileTime, RowMajor, 1, MaxRowsAtCompileTime> WorkspaceType;
+
+  void allocate(const JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd)
+  {
+    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
+    {
+      m_qr.~QRType();
+      ::new (&m_qr) QRType(svd.rows(), svd.cols());
+    }
+    if (svd.m_computeFullU) m_workspace.resize(svd.rows());
+  }
+
+  bool run(JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
+  {
+    if(matrix.rows() > matrix.cols())
+    {
+      m_qr.compute(matrix);
+      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.cols(),matrix.cols()).template triangularView<Upper>();
+      if(svd.m_computeFullU) m_qr.matrixQ().evalTo(svd.m_matrixU, m_workspace);
+      if(svd.computeV()) svd.m_matrixV = m_qr.colsPermutation();
+      return true;
+    }
+    return false;
+  }
+private:
+  typedef FullPivHouseholderQR<MatrixType> QRType;
+  QRType m_qr;
+  WorkspaceType m_workspace;
+};
+
+template<typename MatrixType>
+class qr_preconditioner_impl<MatrixType, FullPivHouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
+{
+public:
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::Scalar Scalar;
+  enum
+  {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+    Options = MatrixType::Options
+  };
+  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
+          TransposeTypeWithSameStorageOrder;
+
+  void allocate(const JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd)
+  {
+    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols())
+    {
+      m_qr.~QRType();
+      ::new (&m_qr) QRType(svd.cols(), svd.rows());
+    }
+    m_adjoint.resize(svd.cols(), svd.rows());
+    if (svd.m_computeFullV) m_workspace.resize(svd.cols());
+  }
+
+  bool run(JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
+  {
+    if(matrix.cols() > matrix.rows())
+    {
+      m_adjoint = matrix.adjoint();
+      m_qr.compute(m_adjoint);
+      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.rows(),matrix.rows()).template triangularView<Upper>().adjoint();
+      if(svd.m_computeFullV) m_qr.matrixQ().evalTo(svd.m_matrixV, m_workspace);
+      if(svd.computeU()) svd.m_matrixU = m_qr.colsPermutation();
+      return true;
+    }
+    else return false;
+  }
+private:
+  typedef FullPivHouseholderQR<TransposeTypeWithSameStorageOrder> QRType;
+  QRType m_qr;
+  TransposeTypeWithSameStorageOrder m_adjoint;
+  typename internal::plain_row_type<MatrixType>::type m_workspace;
+};
+
+/*** preconditioner using ColPivHouseholderQR ***/
+
+template<typename MatrixType>
+class qr_preconditioner_impl<MatrixType, ColPivHouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
+{
+public:
+  typedef typename MatrixType::Index Index;
+
+  void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd)
+  {
+    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
+    {
+      m_qr.~QRType();
+      ::new (&m_qr) QRType(svd.rows(), svd.cols());
+    }
+    if (svd.m_computeFullU) m_workspace.resize(svd.rows());
+    else if (svd.m_computeThinU) m_workspace.resize(svd.cols());
+  }
+
+  bool run(JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
+  {
+    if(matrix.rows() > matrix.cols())
+    {
+      m_qr.compute(matrix);
+      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.cols(),matrix.cols()).template triangularView<Upper>();
+      if(svd.m_computeFullU) m_qr.householderQ().evalTo(svd.m_matrixU, m_workspace);
+      else if(svd.m_computeThinU)
+      {
+        svd.m_matrixU.setIdentity(matrix.rows(), matrix.cols());
+        m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixU, m_workspace);
+      }
+      if(svd.computeV()) svd.m_matrixV = m_qr.colsPermutation();
+      return true;
+    }
+    return false;
+  }
+
+private:
+  typedef ColPivHouseholderQR<MatrixType> QRType;
+  QRType m_qr;
+  typename internal::plain_col_type<MatrixType>::type m_workspace;
+};
+
+template<typename MatrixType>
+class qr_preconditioner_impl<MatrixType, ColPivHouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
+{
+public:
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::Scalar Scalar;
+  enum
+  {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+    Options = MatrixType::Options
+  };
+
+  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
+          TransposeTypeWithSameStorageOrder;
+
+  void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd)
+  {
+    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols())
+    {
+      m_qr.~QRType();
+      ::new (&m_qr) QRType(svd.cols(), svd.rows());
+    }
+    if (svd.m_computeFullV) m_workspace.resize(svd.cols());
+    else if (svd.m_computeThinV) m_workspace.resize(svd.rows());
+    m_adjoint.resize(svd.cols(), svd.rows());
+  }
+
+  bool run(JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
+  {
+    if(matrix.cols() > matrix.rows())
+    {
+      m_adjoint = matrix.adjoint();
+      m_qr.compute(m_adjoint);
+
+      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.rows(),matrix.rows()).template triangularView<Upper>().adjoint();
+      if(svd.m_computeFullV) m_qr.householderQ().evalTo(svd.m_matrixV, m_workspace);
+      else if(svd.m_computeThinV)
+      {
+        svd.m_matrixV.setIdentity(matrix.cols(), matrix.rows());
+        m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixV, m_workspace);
+      }
+      if(svd.computeU()) svd.m_matrixU = m_qr.colsPermutation();
+      return true;
+    }
+    else return false;
+  }
+
+private:
+  typedef ColPivHouseholderQR<TransposeTypeWithSameStorageOrder> QRType;
+  QRType m_qr;
+  TransposeTypeWithSameStorageOrder m_adjoint;
+  typename internal::plain_row_type<MatrixType>::type m_workspace;
+};
+
+/*** preconditioner using HouseholderQR ***/
+
+template<typename MatrixType>
+class qr_preconditioner_impl<MatrixType, HouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
+{
+public:
+  typedef typename MatrixType::Index Index;
+
+  void allocate(const JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd)
+  {
+    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
+    {
+      m_qr.~QRType();
+      ::new (&m_qr) QRType(svd.rows(), svd.cols());
+    }
+    if (svd.m_computeFullU) m_workspace.resize(svd.rows());
+    else if (svd.m_computeThinU) m_workspace.resize(svd.cols());
+  }
+
+  bool run(JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd, const MatrixType& matrix)
+  {
+    if(matrix.rows() > matrix.cols())
+    {
+      m_qr.compute(matrix);
+      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.cols(),matrix.cols()).template triangularView<Upper>();
+      if(svd.m_computeFullU) m_qr.householderQ().evalTo(svd.m_matrixU, m_workspace);
+      else if(svd.m_computeThinU)
+      {
+        svd.m_matrixU.setIdentity(matrix.rows(), matrix.cols());
+        m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixU, m_workspace);
+      }
+      if(svd.computeV()) svd.m_matrixV.setIdentity(matrix.cols(), matrix.cols());
+      return true;
+    }
+    return false;
+  }
+private:
+  typedef HouseholderQR<MatrixType> QRType;
+  QRType m_qr;
+  typename internal::plain_col_type<MatrixType>::type m_workspace;
+};
+
+template<typename MatrixType>
+class qr_preconditioner_impl<MatrixType, HouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
+{
+public:
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::Scalar Scalar;
+  enum
+  {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+    Options = MatrixType::Options
+  };
+
+  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
+          TransposeTypeWithSameStorageOrder;
+
+  void allocate(const JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd)
+  {
+    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols())
+    {
+      m_qr.~QRType();
+      ::new (&m_qr) QRType(svd.cols(), svd.rows());
+    }
+    if (svd.m_computeFullV) m_workspace.resize(svd.cols());
+    else if (svd.m_computeThinV) m_workspace.resize(svd.rows());
+    m_adjoint.resize(svd.cols(), svd.rows());
+  }
+
+  bool run(JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd, const MatrixType& matrix)
+  {
+    if(matrix.cols() > matrix.rows())
+    {
+      m_adjoint = matrix.adjoint();
+      m_qr.compute(m_adjoint);
+
+      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.rows(),matrix.rows()).template triangularView<Upper>().adjoint();
+      if(svd.m_computeFullV) m_qr.householderQ().evalTo(svd.m_matrixV, m_workspace);
+      else if(svd.m_computeThinV)
+      {
+        svd.m_matrixV.setIdentity(matrix.cols(), matrix.rows());
+        m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixV, m_workspace);
+      }
+      if(svd.computeU()) svd.m_matrixU.setIdentity(matrix.rows(), matrix.rows());
+      return true;
+    }
+    else return false;
+  }
+
+private:
+  typedef HouseholderQR<TransposeTypeWithSameStorageOrder> QRType;
+  QRType m_qr;
+  TransposeTypeWithSameStorageOrder m_adjoint;
+  typename internal::plain_row_type<MatrixType>::type m_workspace;
+};
+
+/*** 2x2 SVD implementation
+ ***
+ *** JacobiSVD consists in performing a series of 2x2 SVD subproblems
+ ***/
+
+template<typename MatrixType, int QRPreconditioner>
+struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, false>
+{
+  typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
+  typedef typename SVD::Index Index;
+  static void run(typename SVD::WorkMatrixType&, SVD&, Index, Index) {}
+};
+
+template<typename MatrixType, int QRPreconditioner>
+struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
+{
+  typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename SVD::Index Index;
+  static void run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q)
+  {
+    using std::sqrt;
+    Scalar z;
+    JacobiRotation<Scalar> rot;
+    RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p,p)) + numext::abs2(work_matrix.coeff(q,p)));
+    
+    if(n==0)
+    {
+      z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
+      work_matrix.row(p) *= z;
+      if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z);
+      if(work_matrix.coeff(q,q)!=Scalar(0))
+      {
+        z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
+        work_matrix.row(q) *= z;
+        if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
+      }
+      // otherwise the second row is already zero, so we have nothing to do.
+    }
+    else
+    {
+      rot.c() = conj(work_matrix.coeff(p,p)) / n;
+      rot.s() = work_matrix.coeff(q,p) / n;
+      work_matrix.applyOnTheLeft(p,q,rot);
+      if(svd.computeU()) svd.m_matrixU.applyOnTheRight(p,q,rot.adjoint());
+      if(work_matrix.coeff(p,q) != Scalar(0))
+      {
+        Scalar z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
+        work_matrix.col(q) *= z;
+        if(svd.computeV()) svd.m_matrixV.col(q) *= z;
+      }
+      if(work_matrix.coeff(q,q) != Scalar(0))
+      {
+        z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
+        work_matrix.row(q) *= z;
+        if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
+      }
+    }
+  }
+};
+
+template<typename MatrixType, typename RealScalar, typename Index>
+void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
+                            JacobiRotation<RealScalar> *j_left,
+                            JacobiRotation<RealScalar> *j_right)
+{
+  using std::sqrt;
+  using std::abs;
+  Matrix<RealScalar,2,2> m;
+  m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
+       numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
+  JacobiRotation<RealScalar> rot1;
+  RealScalar t = m.coeff(0,0) + m.coeff(1,1);
+  RealScalar d = m.coeff(1,0) - m.coeff(0,1);
+  if(t == RealScalar(0))
+  {
+    rot1.c() = RealScalar(0);
+    rot1.s() = d > RealScalar(0) ? RealScalar(1) : RealScalar(-1);
+  }
+  else
+  {
+    RealScalar t2d2 = numext::hypot(t,d);
+    rot1.c() = abs(t)/t2d2;
+    rot1.s() = d/t2d2;
+    if(t<RealScalar(0))
+      rot1.s() = -rot1.s();
+  }
+  m.applyOnTheLeft(0,1,rot1);
+  j_right->makeJacobi(m,0,1);
+  *j_left  = rot1 * j_right->transpose();
+}
+
+} // end namespace internal
+
+/** \ingroup SVD_Module
+  *
+  *
+  * \class JacobiSVD
+  *
+  * \brief Two-sided Jacobi SVD decomposition of a rectangular matrix
+  *
+  * \param MatrixType the type of the matrix of which we are computing the SVD decomposition
+  * \param QRPreconditioner this optional parameter allows to specify the type of QR decomposition that will be used internally
+  *                        for the R-SVD step for non-square matrices. See discussion of possible values below.
+  *
+  * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product
+  *   \f[ A = U S V^* \f]
+  * where \a U is a n-by-n unitary, \a V is a p-by-p unitary, and \a S is a n-by-p real positive matrix which is zero outside of its main diagonal;
+  * the diagonal entries of S are known as the \em singular \em values of \a A and the columns of \a U and \a V are known as the left
+  * and right \em singular \em vectors of \a A respectively.
+  *
+  * Singular values are always sorted in decreasing order.
+  *
+  * This JacobiSVD decomposition computes only the singular values by default. If you want \a U or \a V, you need to ask for them explicitly.
+  *
+  * You can ask for only \em thin \a U or \a V to be computed, meaning the following. In case of a rectangular n-by-p matrix, letting \a m be the
+  * smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual
+  * singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix,
+  * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving.
+  *
+  * Here's an example demonstrating basic usage:
+  * \include JacobiSVD_basic.cpp
+  * Output: \verbinclude JacobiSVD_basic.out
+  *
+  * This JacobiSVD class is a two-sided Jacobi R-SVD decomposition, ensuring optimal reliability and accuracy. The downside is that it's slower than
+  * bidiagonalizing SVD algorithms for large square matrices; however its complexity is still \f$ O(n^2p) \f$ where \a n is the smaller dimension and
+  * \a p is the greater dimension, meaning that it is still of the same order of complexity as the faster bidiagonalizing R-SVD algorithms.
+  * In particular, like any R-SVD, it takes advantage of non-squareness in that its complexity is only linear in the greater dimension.
+  *
+  * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to
+  * terminate in finite (and reasonable) time.
+  *
+  * The possible values for QRPreconditioner are:
+  * \li ColPivHouseholderQRPreconditioner is the default. In practice it's very safe. It uses column-pivoting QR.
+  * \li FullPivHouseholderQRPreconditioner, is the safest and slowest. It uses full-pivoting QR.
+  *     Contrary to other QRs, it doesn't allow computing thin unitaries.
+  * \li HouseholderQRPreconditioner is the fastest, and less safe and accurate than the pivoting variants. It uses non-pivoting QR.
+  *     This is very similar in safety and accuracy to the bidiagonalization process used by bidiagonalizing SVD algorithms (since bidiagonalization
+  *     is inherently non-pivoting). However the resulting SVD is still more reliable than bidiagonalizing SVDs because the Jacobi-based iterarive
+  *     process is more reliable than the optimized bidiagonal SVD iterations.
+  * \li NoQRPreconditioner allows not to use a QR preconditioner at all. This is useful if you know that you will only be computing
+  *     JacobiSVD decompositions of square matrices. Non-square matrices require a QR preconditioner. Using this option will result in
+  *     faster compilation and smaller executable code. It won't significantly speed up computation, since JacobiSVD is always checking
+  *     if QR preconditioning is needed before applying it anyway.
+  *
+  * \sa MatrixBase::jacobiSvd()
+  */
+template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
+{
+  public:
+
+    typedef _MatrixType MatrixType;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+    typedef typename MatrixType::Index Index;
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime,ColsAtCompileTime),
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+      MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime,MaxColsAtCompileTime),
+      MatrixOptions = MatrixType::Options
+    };
+
+    typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime,
+                   MatrixOptions, MaxRowsAtCompileTime, MaxRowsAtCompileTime>
+            MatrixUType;
+    typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime,
+                   MatrixOptions, MaxColsAtCompileTime, MaxColsAtCompileTime>
+            MatrixVType;
+    typedef typename internal::plain_diag_type<MatrixType, RealScalar>::type SingularValuesType;
+    typedef typename internal::plain_row_type<MatrixType>::type RowType;
+    typedef typename internal::plain_col_type<MatrixType>::type ColType;
+    typedef Matrix<Scalar, DiagSizeAtCompileTime, DiagSizeAtCompileTime,
+                   MatrixOptions, MaxDiagSizeAtCompileTime, MaxDiagSizeAtCompileTime>
+            WorkMatrixType;
+
+    /** \brief Default Constructor.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via JacobiSVD::compute(const MatrixType&).
+      */
+    JacobiSVD()
+      : m_isInitialized(false),
+        m_isAllocated(false),
+        m_usePrescribedThreshold(false),
+        m_computationOptions(0),
+        m_rows(-1), m_cols(-1), m_diagSize(0)
+    {}
+
+
+    /** \brief Default Constructor with memory preallocation
+      *
+      * Like the default constructor but with preallocation of the internal data
+      * according to the specified problem size.
+      * \sa JacobiSVD()
+      */
+    JacobiSVD(Index rows, Index cols, unsigned int computationOptions = 0)
+      : m_isInitialized(false),
+        m_isAllocated(false),
+        m_usePrescribedThreshold(false),
+        m_computationOptions(0),
+        m_rows(-1), m_cols(-1)
+    {
+      allocate(rows, cols, computationOptions);
+    }
+
+    /** \brief Constructor performing the decomposition of given matrix.
+     *
+     * \param matrix the matrix to decompose
+     * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
+     *                           By default, none is computed. This is a bit-field, the possible bits are #ComputeFullU, #ComputeThinU,
+     *                           #ComputeFullV, #ComputeThinV.
+     *
+     * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
+     * available with the (non-default) FullPivHouseholderQR preconditioner.
+     */
+    JacobiSVD(const MatrixType& matrix, unsigned int computationOptions = 0)
+      : m_isInitialized(false),
+        m_isAllocated(false),
+        m_usePrescribedThreshold(false),
+        m_computationOptions(0),
+        m_rows(-1), m_cols(-1)
+    {
+      compute(matrix, computationOptions);
+    }
+
+    /** \brief Method performing the decomposition of given matrix using custom options.
+     *
+     * \param matrix the matrix to decompose
+     * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
+     *                           By default, none is computed. This is a bit-field, the possible bits are #ComputeFullU, #ComputeThinU,
+     *                           #ComputeFullV, #ComputeThinV.
+     *
+     * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
+     * available with the (non-default) FullPivHouseholderQR preconditioner.
+     */
+    JacobiSVD& compute(const MatrixType& matrix, unsigned int computationOptions);
+
+    /** \brief Method performing the decomposition of given matrix using current options.
+     *
+     * \param matrix the matrix to decompose
+     *
+     * This method uses the current \a computationOptions, as already passed to the constructor or to compute(const MatrixType&, unsigned int).
+     */
+    JacobiSVD& compute(const MatrixType& matrix)
+    {
+      return compute(matrix, m_computationOptions);
+    }
+
+    /** \returns the \a U matrix.
+     *
+     * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
+     * the U matrix is n-by-n if you asked for #ComputeFullU, and is n-by-m if you asked for #ComputeThinU.
+     *
+     * The \a m first columns of \a U are the left singular vectors of the matrix being decomposed.
+     *
+     * This method asserts that you asked for \a U to be computed.
+     */
+    const MatrixUType& matrixU() const
+    {
+      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
+      eigen_assert(computeU() && "This JacobiSVD decomposition didn't compute U. Did you ask for it?");
+      return m_matrixU;
+    }
+
+    /** \returns the \a V matrix.
+     *
+     * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
+     * the V matrix is p-by-p if you asked for #ComputeFullV, and is p-by-m if you asked for ComputeThinV.
+     *
+     * The \a m first columns of \a V are the right singular vectors of the matrix being decomposed.
+     *
+     * This method asserts that you asked for \a V to be computed.
+     */
+    const MatrixVType& matrixV() const
+    {
+      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
+      eigen_assert(computeV() && "This JacobiSVD decomposition didn't compute V. Did you ask for it?");
+      return m_matrixV;
+    }
+
+    /** \returns the vector of singular values.
+     *
+     * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p, the
+     * returned vector has size \a m.  Singular values are always sorted in decreasing order.
+     */
+    const SingularValuesType& singularValues() const
+    {
+      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
+      return m_singularValues;
+    }
+
+    /** \returns true if \a U (full or thin) is asked for in this SVD decomposition */
+    inline bool computeU() const { return m_computeFullU || m_computeThinU; }
+    /** \returns true if \a V (full or thin) is asked for in this SVD decomposition */
+    inline bool computeV() const { return m_computeFullV || m_computeThinV; }
+
+    /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
+      *
+      * \param b the right-hand-side of the equation to solve.
+      *
+      * \note Solving requires both U and V to be computed. Thin U and V are enough, there is no need for full U or V.
+      *
+      * \note SVD solving is implicitly least-squares. Thus, this method serves both purposes of exact solving and least-squares solving.
+      * In other words, the returned solution is guaranteed to minimize the Euclidean norm \f$ \Vert A x - b \Vert \f$.
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<JacobiSVD, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
+      eigen_assert(computeU() && computeV() && "JacobiSVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
+      return internal::solve_retval<JacobiSVD, Rhs>(*this, b.derived());
+    }
+
+    /** \returns the number of singular values that are not exactly 0 */
+    Index nonzeroSingularValues() const
+    {
+      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
+      return m_nonzeroSingularValues;
+    }
+    
+    /** \returns the rank of the matrix of which \c *this is the SVD.
+      *
+      * \note This method has to determine which singular values should be considered nonzero.
+      *       For that, it uses the threshold value that you can control by calling
+      *       setThreshold(const RealScalar&).
+      */
+    inline Index rank() const
+    {
+      using std::abs;
+      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
+      if(m_singularValues.size()==0) return 0;
+      RealScalar premultiplied_threshold = m_singularValues.coeff(0) * threshold();
+      Index i = m_nonzeroSingularValues-1;
+      while(i>=0 && m_singularValues.coeff(i) < premultiplied_threshold) --i;
+      return i+1;
+    }
+    
+    /** Allows to prescribe a threshold to be used by certain methods, such as rank() and solve(),
+      * which need to determine when singular values are to be considered nonzero.
+      * This is not used for the SVD decomposition itself.
+      *
+      * When it needs to get the threshold value, Eigen calls threshold().
+      * The default is \c NumTraits<Scalar>::epsilon()
+      *
+      * \param threshold The new value to use as the threshold.
+      *
+      * A singular value will be considered nonzero if its value is strictly greater than
+      *  \f$ \vert singular value \vert \leqslant threshold \times \vert max singular value \vert \f$.
+      *
+      * If you want to come back to the default behavior, call setThreshold(Default_t)
+      */
+    JacobiSVD& setThreshold(const RealScalar& threshold)
+    {
+      m_usePrescribedThreshold = true;
+      m_prescribedThreshold = threshold;
+      return *this;
+    }
+
+    /** Allows to come back to the default behavior, letting Eigen use its default formula for
+      * determining the threshold.
+      *
+      * You should pass the special object Eigen::Default as parameter here.
+      * \code svd.setThreshold(Eigen::Default); \endcode
+      *
+      * See the documentation of setThreshold(const RealScalar&).
+      */
+    JacobiSVD& setThreshold(Default_t)
+    {
+      m_usePrescribedThreshold = false;
+      return *this;
+    }
+
+    /** Returns the threshold that will be used by certain methods such as rank().
+      *
+      * See the documentation of setThreshold(const RealScalar&).
+      */
+    RealScalar threshold() const
+    {
+      eigen_assert(m_isInitialized || m_usePrescribedThreshold);
+      return m_usePrescribedThreshold ? m_prescribedThreshold
+                                      : (std::max<Index>)(1,m_diagSize)*NumTraits<Scalar>::epsilon();
+    }
+
+    inline Index rows() const { return m_rows; }
+    inline Index cols() const { return m_cols; }
+
+  private:
+    void allocate(Index rows, Index cols, unsigned int computationOptions);
+
+  protected:
+    MatrixUType m_matrixU;
+    MatrixVType m_matrixV;
+    SingularValuesType m_singularValues;
+    WorkMatrixType m_workMatrix;
+    bool m_isInitialized, m_isAllocated, m_usePrescribedThreshold;
+    bool m_computeFullU, m_computeThinU;
+    bool m_computeFullV, m_computeThinV;
+    unsigned int m_computationOptions;
+    Index m_nonzeroSingularValues, m_rows, m_cols, m_diagSize;
+    RealScalar m_prescribedThreshold;
+
+    template<typename __MatrixType, int _QRPreconditioner, bool _IsComplex>
+    friend struct internal::svd_precondition_2x2_block_to_be_real;
+    template<typename __MatrixType, int _QRPreconditioner, int _Case, bool _DoAnything>
+    friend struct internal::qr_preconditioner_impl;
+
+    internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreColsThanRows> m_qr_precond_morecols;
+    internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols> m_qr_precond_morerows;
+    MatrixType m_scaledMatrix;
+};
+
+template<typename MatrixType, int QRPreconditioner>
+void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, unsigned int computationOptions)
+{
+  eigen_assert(rows >= 0 && cols >= 0);
+
+  if (m_isAllocated &&
+      rows == m_rows &&
+      cols == m_cols &&
+      computationOptions == m_computationOptions)
+  {
+    return;
+  }
+
+  m_rows = rows;
+  m_cols = cols;
+  m_isInitialized = false;
+  m_isAllocated = true;
+  m_computationOptions = computationOptions;
+  m_computeFullU = (computationOptions & ComputeFullU) != 0;
+  m_computeThinU = (computationOptions & ComputeThinU) != 0;
+  m_computeFullV = (computationOptions & ComputeFullV) != 0;
+  m_computeThinV = (computationOptions & ComputeThinV) != 0;
+  eigen_assert(!(m_computeFullU && m_computeThinU) && "JacobiSVD: you can't ask for both full and thin U");
+  eigen_assert(!(m_computeFullV && m_computeThinV) && "JacobiSVD: you can't ask for both full and thin V");
+  eigen_assert(EIGEN_IMPLIES(m_computeThinU || m_computeThinV, MatrixType::ColsAtCompileTime==Dynamic) &&
+              "JacobiSVD: thin U and V are only available when your matrix has a dynamic number of columns.");
+  if (QRPreconditioner == FullPivHouseholderQRPreconditioner)
+  {
+      eigen_assert(!(m_computeThinU || m_computeThinV) &&
+              "JacobiSVD: can't compute thin U or thin V with the FullPivHouseholderQR preconditioner. "
+              "Use the ColPivHouseholderQR preconditioner instead.");
+  }
+  m_diagSize = (std::min)(m_rows, m_cols);
+  m_singularValues.resize(m_diagSize);
+  if(RowsAtCompileTime==Dynamic)
+    m_matrixU.resize(m_rows, m_computeFullU ? m_rows
+                            : m_computeThinU ? m_diagSize
+                            : 0);
+  if(ColsAtCompileTime==Dynamic)
+    m_matrixV.resize(m_cols, m_computeFullV ? m_cols
+                            : m_computeThinV ? m_diagSize
+                            : 0);
+  m_workMatrix.resize(m_diagSize, m_diagSize);
+  
+  if(m_cols>m_rows)   m_qr_precond_morecols.allocate(*this);
+  if(m_rows>m_cols)   m_qr_precond_morerows.allocate(*this);
+  if(m_cols!=m_cols)  m_scaledMatrix.resize(rows,cols);
+}
+
+template<typename MatrixType, int QRPreconditioner>
+JacobiSVD<MatrixType, QRPreconditioner>&
+JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsigned int computationOptions)
+{
+  using std::abs;
+  allocate(matrix.rows(), matrix.cols(), computationOptions);
+
+  // currently we stop when we reach precision 2*epsilon as the last bit of precision can require an unreasonable number of iterations,
+  // only worsening the precision of U and V as we accumulate more rotations
+  const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
+
+  // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
+  const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
+
+  // Scaling factor to reduce over/under-flows
+  RealScalar scale = matrix.cwiseAbs().maxCoeff();
+  if(scale==RealScalar(0)) scale = RealScalar(1);
+  
+  /*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */
+
+  if(m_rows!=m_cols)
+  {
+    m_scaledMatrix = matrix / scale;
+    m_qr_precond_morecols.run(*this, m_scaledMatrix);
+    m_qr_precond_morerows.run(*this, m_scaledMatrix);
+  }
+  else
+  {
+    m_workMatrix = matrix.block(0,0,m_diagSize,m_diagSize) / scale;
+    if(m_computeFullU) m_matrixU.setIdentity(m_rows,m_rows);
+    if(m_computeThinU) m_matrixU.setIdentity(m_rows,m_diagSize);
+    if(m_computeFullV) m_matrixV.setIdentity(m_cols,m_cols);
+    if(m_computeThinV) m_matrixV.setIdentity(m_cols, m_diagSize);
+  }
+
+  /*** step 2. The main Jacobi SVD iteration. ***/
+
+  bool finished = false;
+  while(!finished)
+  {
+    finished = true;
+
+    // do a sweep: for all index pairs (p,q), perform SVD of the corresponding 2x2 sub-matrix
+
+    for(Index p = 1; p < m_diagSize; ++p)
+    {
+      for(Index q = 0; q < p; ++q)
+      {
+        // if this 2x2 sub-matrix is not diagonal already...
+        // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
+        // keep us iterating forever. Similarly, small denormal numbers are considered zero.
+        using std::max;
+        RealScalar threshold = (max)(considerAsZero, precision * (max)(abs(m_workMatrix.coeff(p,p)),
+                                                                       abs(m_workMatrix.coeff(q,q))));
+        // We compare both values to threshold instead of calling max to be robust to NaN (See bug 791)
+        if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold)
+        {
+          finished = false;
+
+          // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal
+          internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q);
+          JacobiRotation<RealScalar> j_left, j_right;
+          internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
+
+          // accumulate resulting Jacobi rotations
+          m_workMatrix.applyOnTheLeft(p,q,j_left);
+          if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose());
+
+          m_workMatrix.applyOnTheRight(p,q,j_right);
+          if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right);
+        }
+      }
+    }
+  }
+
+  /*** step 3. The work matrix is now diagonal, so ensure it's positive so its diagonal entries are the singular values ***/
+
+  for(Index i = 0; i < m_diagSize; ++i)
+  {
+    RealScalar a = abs(m_workMatrix.coeff(i,i));
+    m_singularValues.coeffRef(i) = a;
+    if(computeU() && (a!=RealScalar(0))) m_matrixU.col(i) *= m_workMatrix.coeff(i,i)/a;
+  }
+
+  /*** step 4. Sort singular values in descending order and compute the number of nonzero singular values ***/
+
+  m_nonzeroSingularValues = m_diagSize;
+  for(Index i = 0; i < m_diagSize; i++)
+  {
+    Index pos;
+    RealScalar maxRemainingSingularValue = m_singularValues.tail(m_diagSize-i).maxCoeff(&pos);
+    if(maxRemainingSingularValue == RealScalar(0))
+    {
+      m_nonzeroSingularValues = i;
+      break;
+    }
+    if(pos)
+    {
+      pos += i;
+      std::swap(m_singularValues.coeffRef(i), m_singularValues.coeffRef(pos));
+      if(computeU()) m_matrixU.col(pos).swap(m_matrixU.col(i));
+      if(computeV()) m_matrixV.col(pos).swap(m_matrixV.col(i));
+    }
+  }
+  
+  m_singularValues *= scale;
+
+  m_isInitialized = true;
+  return *this;
+}
+
+namespace internal {
+template<typename _MatrixType, int QRPreconditioner, typename Rhs>
+struct solve_retval<JacobiSVD<_MatrixType, QRPreconditioner>, Rhs>
+  : solve_retval_base<JacobiSVD<_MatrixType, QRPreconditioner>, Rhs>
+{
+  typedef JacobiSVD<_MatrixType, QRPreconditioner> JacobiSVDType;
+  EIGEN_MAKE_SOLVE_HELPERS(JacobiSVDType,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    eigen_assert(rhs().rows() == dec().rows());
+
+    // A = U S V^*
+    // So A^{-1} = V S^{-1} U^*
+
+    Matrix<Scalar, Dynamic, Rhs::ColsAtCompileTime, 0, _MatrixType::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime> tmp;
+    Index rank = dec().rank();
+    
+    tmp.noalias() = dec().matrixU().leftCols(rank).adjoint() * rhs();
+    tmp = dec().singularValues().head(rank).asDiagonal().inverse() * tmp;
+    dst = dec().matrixV().leftCols(rank) * tmp;
+  }
+};
+} // end namespace internal
+
+/** \svd_module
+  *
+  * \return the singular value decomposition of \c *this computed by two-sided
+  * Jacobi transformations.
+  *
+  * \sa class JacobiSVD
+  */
+template<typename Derived>
+JacobiSVD<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::jacobiSvd(unsigned int computationOptions) const
+{
+  return JacobiSVD<PlainObject>(*this, computationOptions);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_JACOBISVD_H
diff --git a/include/eigen3/Eigen/src/SVD/JacobiSVD_MKL.h b/include/eigen3/Eigen/src/SVD/JacobiSVD_MKL.h
new file mode 100644
index 0000000..decda75
--- /dev/null
+++ b/include/eigen3/Eigen/src/SVD/JacobiSVD_MKL.h
@@ -0,0 +1,92 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *    Singular Value Decomposition - SVD.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_JACOBISVD_MKL_H
+#define EIGEN_JACOBISVD_MKL_H
+
+#include "Eigen/src/Core/util/MKL_support.h"
+
+namespace Eigen { 
+
+/** \internal Specialization for the data types supported by MKL */
+
+#define EIGEN_MKL_SVD(EIGTYPE, MKLTYPE, MKLRTYPE, MKLPREFIX, EIGCOLROW, MKLCOLROW) \
+template<> inline \
+JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPivHouseholderQRPreconditioner>& \
+JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPivHouseholderQRPreconditioner>::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix, unsigned int computationOptions) \
+{ \
+  typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> MatrixType; \
+  typedef MatrixType::Scalar Scalar; \
+  typedef MatrixType::RealScalar RealScalar; \
+  allocate(matrix.rows(), matrix.cols(), computationOptions); \
+\
+  /*const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();*/ \
+  m_nonzeroSingularValues = m_diagSize; \
+\
+  lapack_int lda = matrix.outerStride(), ldu, ldvt; \
+  lapack_int matrix_order = MKLCOLROW; \
+  char jobu, jobvt; \
+  MKLTYPE *u, *vt, dummy; \
+  jobu  = (m_computeFullU) ? 'A' : (m_computeThinU) ? 'S' : 'N'; \
+  jobvt = (m_computeFullV) ? 'A' : (m_computeThinV) ? 'S' : 'N'; \
+  if (computeU()) { \
+    ldu  = m_matrixU.outerStride(); \
+    u    = (MKLTYPE*)m_matrixU.data(); \
+  } else { ldu=1; u=&dummy; }\
+  MatrixType localV; \
+  ldvt = (m_computeFullV) ? m_cols : (m_computeThinV) ? m_diagSize : 1; \
+  if (computeV()) { \
+    localV.resize(ldvt, m_cols); \
+    vt   = (MKLTYPE*)localV.data(); \
+  } else { ldvt=1; vt=&dummy; }\
+  Matrix<MKLRTYPE, Dynamic, Dynamic> superb; superb.resize(m_diagSize, 1); \
+  MatrixType m_temp; m_temp = matrix; \
+  LAPACKE_##MKLPREFIX##gesvd( matrix_order, jobu, jobvt, m_rows, m_cols, (MKLTYPE*)m_temp.data(), lda, (MKLRTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \
+  if (computeV()) m_matrixV = localV.adjoint(); \
+ /* for(int i=0;i<m_diagSize;i++) if (m_singularValues.coeffRef(i) < precision) { m_nonzeroSingularValues--; m_singularValues.coeffRef(i)=RealScalar(0);}*/ \
+  m_isInitialized = true; \
+  return *this; \
+}
+
+EIGEN_MKL_SVD(double,   double,        double, d, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_MKL_SVD(float,    float,         float , s, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_MKL_SVD(dcomplex, MKL_Complex16, double, z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_MKL_SVD(scomplex, MKL_Complex8,  float , c, ColMajor, LAPACK_COL_MAJOR)
+
+EIGEN_MKL_SVD(double,   double,        double, d, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_MKL_SVD(float,    float,         float , s, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_MKL_SVD(dcomplex, MKL_Complex16, double, z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_MKL_SVD(scomplex, MKL_Complex8,  float , c, RowMajor, LAPACK_ROW_MAJOR)
+
+} // end namespace Eigen
+
+#endif // EIGEN_JACOBISVD_MKL_H
diff --git a/include/eigen3/Eigen/src/SVD/UpperBidiagonalization.h b/include/eigen3/Eigen/src/SVD/UpperBidiagonalization.h
new file mode 100644
index 0000000..587de37
--- /dev/null
+++ b/include/eigen3/Eigen/src/SVD/UpperBidiagonalization.h
@@ -0,0 +1,148 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BIDIAGONALIZATION_H
+#define EIGEN_BIDIAGONALIZATION_H
+
+namespace Eigen { 
+
+namespace internal {
+// UpperBidiagonalization will probably be replaced by a Bidiagonalization class, don't want to make it stable API.
+// At the same time, it's useful to keep for now as it's about the only thing that is testing the BandMatrix class.
+
+template<typename _MatrixType> class UpperBidiagonalization
+{
+  public:
+
+    typedef _MatrixType MatrixType;
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      ColsAtCompileTimeMinusOne = internal::decrement_size<ColsAtCompileTime>::ret
+    };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+    typedef Matrix<Scalar, 1, ColsAtCompileTime> RowVectorType;
+    typedef Matrix<Scalar, RowsAtCompileTime, 1> ColVectorType;
+    typedef BandMatrix<RealScalar, ColsAtCompileTime, ColsAtCompileTime, 1, 0> BidiagonalType;
+    typedef Matrix<Scalar, ColsAtCompileTime, 1> DiagVectorType;
+    typedef Matrix<Scalar, ColsAtCompileTimeMinusOne, 1> SuperDiagVectorType;
+    typedef HouseholderSequence<
+              const MatrixType,
+              CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Diagonal<const MatrixType,0> >
+            > HouseholderUSequenceType;
+    typedef HouseholderSequence<
+              const typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type,
+              Diagonal<const MatrixType,1>,
+              OnTheRight
+            > HouseholderVSequenceType;
+    
+    /**
+    * \brief Default Constructor.
+    *
+    * The default constructor is useful in cases in which the user intends to
+    * perform decompositions via Bidiagonalization::compute(const MatrixType&).
+    */
+    UpperBidiagonalization() : m_householder(), m_bidiagonal(), m_isInitialized(false) {}
+
+    UpperBidiagonalization(const MatrixType& matrix)
+      : m_householder(matrix.rows(), matrix.cols()),
+        m_bidiagonal(matrix.cols(), matrix.cols()),
+        m_isInitialized(false)
+    {
+      compute(matrix);
+    }
+    
+    UpperBidiagonalization& compute(const MatrixType& matrix);
+    
+    const MatrixType& householder() const { return m_householder; }
+    const BidiagonalType& bidiagonal() const { return m_bidiagonal; }
+    
+    const HouseholderUSequenceType householderU() const
+    {
+      eigen_assert(m_isInitialized && "UpperBidiagonalization is not initialized.");
+      return HouseholderUSequenceType(m_householder, m_householder.diagonal().conjugate());
+    }
+
+    const HouseholderVSequenceType householderV() // const here gives nasty errors and i'm lazy
+    {
+      eigen_assert(m_isInitialized && "UpperBidiagonalization is not initialized.");
+      return HouseholderVSequenceType(m_householder.conjugate(), m_householder.const_derived().template diagonal<1>())
+             .setLength(m_householder.cols()-1)
+             .setShift(1);
+    }
+    
+  protected:
+    MatrixType m_householder;
+    BidiagonalType m_bidiagonal;
+    bool m_isInitialized;
+};
+
+template<typename _MatrixType>
+UpperBidiagonalization<_MatrixType>& UpperBidiagonalization<_MatrixType>::compute(const _MatrixType& matrix)
+{
+  Index rows = matrix.rows();
+  Index cols = matrix.cols();
+  
+  eigen_assert(rows >= cols && "UpperBidiagonalization is only for matrices satisfying rows>=cols.");
+  
+  m_householder = matrix;
+
+  ColVectorType temp(rows);
+
+  for (Index k = 0; /* breaks at k==cols-1 below */ ; ++k)
+  {
+    Index remainingRows = rows - k;
+    Index remainingCols = cols - k - 1;
+
+    // construct left householder transform in-place in m_householder
+    m_householder.col(k).tail(remainingRows)
+                 .makeHouseholderInPlace(m_householder.coeffRef(k,k),
+                                         m_bidiagonal.template diagonal<0>().coeffRef(k));
+    // apply householder transform to remaining part of m_householder on the left
+    m_householder.bottomRightCorner(remainingRows, remainingCols)
+                 .applyHouseholderOnTheLeft(m_householder.col(k).tail(remainingRows-1),
+                                            m_householder.coeff(k,k),
+                                            temp.data());
+
+    if(k == cols-1) break;
+    
+    // construct right householder transform in-place in m_householder
+    m_householder.row(k).tail(remainingCols)
+                 .makeHouseholderInPlace(m_householder.coeffRef(k,k+1),
+                                         m_bidiagonal.template diagonal<1>().coeffRef(k));
+    // apply householder transform to remaining part of m_householder on the left
+    m_householder.bottomRightCorner(remainingRows-1, remainingCols)
+                 .applyHouseholderOnTheRight(m_householder.row(k).tail(remainingCols-1).transpose(),
+                                             m_householder.coeff(k,k+1),
+                                             temp.data());
+  }
+  m_isInitialized = true;
+  return *this;
+}
+
+#if 0
+/** \return the Householder QR decomposition of \c *this.
+  *
+  * \sa class Bidiagonalization
+  */
+template<typename Derived>
+const UpperBidiagonalization<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::bidiagonalization() const
+{
+  return UpperBidiagonalization<PlainObject>(eval());
+}
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BIDIAGONALIZATION_H
diff --git a/include/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky.h b/include/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky.h
new file mode 100644
index 0000000..e1f96ba
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky.h
@@ -0,0 +1,671 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2012 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SIMPLICIAL_CHOLESKY_H
+#define EIGEN_SIMPLICIAL_CHOLESKY_H
+
+namespace Eigen { 
+
+enum SimplicialCholeskyMode {
+  SimplicialCholeskyLLT,
+  SimplicialCholeskyLDLT
+};
+
+/** \ingroup SparseCholesky_Module
+  * \brief A direct sparse Cholesky factorizations
+  *
+  * These classes provide LL^T and LDL^T Cholesky factorizations of sparse matrices that are
+  * selfadjoint and positive definite. The factorization allows for solving A.X = B where
+  * X and B can be either dense or sparse.
+  * 
+  * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
+  * such that the factorized matrix is P A P^-1.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
+  *               or Upper. Default is Lower.
+  *
+  */
+template<typename Derived>
+class SimplicialCholeskyBase : internal::noncopyable
+{
+  public:
+    typedef typename internal::traits<Derived>::MatrixType MatrixType;
+    typedef typename internal::traits<Derived>::OrderingType OrderingType;
+    enum { UpLo = internal::traits<Derived>::UpLo };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
+    typedef Matrix<Scalar,Dynamic,1> VectorType;
+
+  public:
+
+    /** Default constructor */
+    SimplicialCholeskyBase()
+      : m_info(Success), m_isInitialized(false), m_shiftOffset(0), m_shiftScale(1)
+    {}
+
+    SimplicialCholeskyBase(const MatrixType& matrix)
+      : m_info(Success), m_isInitialized(false), m_shiftOffset(0), m_shiftScale(1)
+    {
+      derived().compute(matrix);
+    }
+
+    ~SimplicialCholeskyBase()
+    {
+    }
+
+    Derived& derived() { return *static_cast<Derived*>(this); }
+    const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    
+    inline Index cols() const { return m_matrix.cols(); }
+    inline Index rows() const { return m_matrix.rows(); }
+    
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful,
+      *          \c NumericalIssue if the matrix.appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return m_info;
+    }
+    
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<SimplicialCholeskyBase, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "Simplicial LLT or LDLT is not initialized.");
+      eigen_assert(rows()==b.rows()
+                && "SimplicialCholeskyBase::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::solve_retval<SimplicialCholeskyBase, Rhs>(*this, b.derived());
+    }
+    
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::sparse_solve_retval<SimplicialCholeskyBase, Rhs>
+    solve(const SparseMatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "Simplicial LLT or LDLT is not initialized.");
+      eigen_assert(rows()==b.rows()
+                && "SimplicialCholesky::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::sparse_solve_retval<SimplicialCholeskyBase, Rhs>(*this, b.derived());
+    }
+    
+    /** \returns the permutation P
+      * \sa permutationPinv() */
+    const PermutationMatrix<Dynamic,Dynamic,Index>& permutationP() const
+    { return m_P; }
+    
+    /** \returns the inverse P^-1 of the permutation P
+      * \sa permutationP() */
+    const PermutationMatrix<Dynamic,Dynamic,Index>& permutationPinv() const
+    { return m_Pinv; }
+
+    /** Sets the shift parameters that will be used to adjust the diagonal coefficients during the numerical factorization.
+      *
+      * During the numerical factorization, the diagonal coefficients are transformed by the following linear model:\n
+      * \c d_ii = \a offset + \a scale * \c d_ii
+      *
+      * The default is the identity transformation with \a offset=0, and \a scale=1.
+      *
+      * \returns a reference to \c *this.
+      */
+    Derived& setShift(const RealScalar& offset, const RealScalar& scale = 1)
+    {
+      m_shiftOffset = offset;
+      m_shiftScale = scale;
+      return derived();
+    }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal */
+    template<typename Stream>
+    void dumpMemory(Stream& s)
+    {
+      int total = 0;
+      s << "  L:        " << ((total+=(m_matrix.cols()+1) * sizeof(int) + m_matrix.nonZeros()*(sizeof(int)+sizeof(Scalar))) >> 20) << "Mb" << "\n";
+      s << "  diag:     " << ((total+=m_diag.size() * sizeof(Scalar)) >> 20) << "Mb" << "\n";
+      s << "  tree:     " << ((total+=m_parent.size() * sizeof(int)) >> 20) << "Mb" << "\n";
+      s << "  nonzeros: " << ((total+=m_nonZerosPerCol.size() * sizeof(int)) >> 20) << "Mb" << "\n";
+      s << "  perm:     " << ((total+=m_P.size() * sizeof(int)) >> 20) << "Mb" << "\n";
+      s << "  perm^-1:  " << ((total+=m_Pinv.size() * sizeof(int)) >> 20) << "Mb" << "\n";
+      s << "  TOTAL:    " << (total>> 20) << "Mb" << "\n";
+    }
+
+    /** \internal */
+    template<typename Rhs,typename Dest>
+    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    {
+      eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
+      eigen_assert(m_matrix.rows()==b.rows());
+
+      if(m_info!=Success)
+        return;
+
+      if(m_P.size()>0)
+        dest = m_P * b;
+      else
+        dest = b;
+
+      if(m_matrix.nonZeros()>0) // otherwise L==I
+        derived().matrixL().solveInPlace(dest);
+
+      if(m_diag.size()>0)
+        dest = m_diag.asDiagonal().inverse() * dest;
+
+      if (m_matrix.nonZeros()>0) // otherwise U==I
+        derived().matrixU().solveInPlace(dest);
+
+      if(m_P.size()>0)
+        dest = m_Pinv * dest;
+    }
+
+#endif // EIGEN_PARSED_BY_DOXYGEN
+
+  protected:
+    
+    /** Computes the sparse Cholesky decomposition of \a matrix */
+    template<bool DoLDLT>
+    void compute(const MatrixType& matrix)
+    {
+      eigen_assert(matrix.rows()==matrix.cols());
+      Index size = matrix.cols();
+      CholMatrixType ap(size,size);
+      ordering(matrix, ap);
+      analyzePattern_preordered(ap, DoLDLT);
+      factorize_preordered<DoLDLT>(ap);
+    }
+    
+    template<bool DoLDLT>
+    void factorize(const MatrixType& a)
+    {
+      eigen_assert(a.rows()==a.cols());
+      int size = a.cols();
+      CholMatrixType ap(size,size);
+      ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
+      factorize_preordered<DoLDLT>(ap);
+    }
+
+    template<bool DoLDLT>
+    void factorize_preordered(const CholMatrixType& a);
+
+    void analyzePattern(const MatrixType& a, bool doLDLT)
+    {
+      eigen_assert(a.rows()==a.cols());
+      int size = a.cols();
+      CholMatrixType ap(size,size);
+      ordering(a, ap);
+      analyzePattern_preordered(ap,doLDLT);
+    }
+    void analyzePattern_preordered(const CholMatrixType& a, bool doLDLT);
+    
+    void ordering(const MatrixType& a, CholMatrixType& ap);
+
+    /** keeps off-diagonal entries; drops diagonal entries */
+    struct keep_diag {
+      inline bool operator() (const Index& row, const Index& col, const Scalar&) const
+      {
+        return row!=col;
+      }
+    };
+
+    mutable ComputationInfo m_info;
+    bool m_isInitialized;
+    bool m_factorizationIsOk;
+    bool m_analysisIsOk;
+    
+    CholMatrixType m_matrix;
+    VectorType m_diag;                                // the diagonal coefficients (LDLT mode)
+    VectorXi m_parent;                                // elimination tree
+    VectorXi m_nonZerosPerCol;
+    PermutationMatrix<Dynamic,Dynamic,Index> m_P;     // the permutation
+    PermutationMatrix<Dynamic,Dynamic,Index> m_Pinv;  // the inverse permutation
+
+    RealScalar m_shiftOffset;
+    RealScalar m_shiftScale;
+};
+
+template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::Index> > class SimplicialLLT;
+template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::Index> > class SimplicialLDLT;
+template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::Index> > class SimplicialCholesky;
+
+namespace internal {
+
+template<typename _MatrixType, int _UpLo, typename _Ordering> struct traits<SimplicialLLT<_MatrixType,_UpLo,_Ordering> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Ordering OrderingType;
+  enum { UpLo = _UpLo };
+  typedef typename MatrixType::Scalar                         Scalar;
+  typedef typename MatrixType::Index                          Index;
+  typedef SparseMatrix<Scalar, ColMajor, Index>               CholMatrixType;
+  typedef SparseTriangularView<CholMatrixType, Eigen::Lower>  MatrixL;
+  typedef SparseTriangularView<typename CholMatrixType::AdjointReturnType, Eigen::Upper>   MatrixU;
+  static inline MatrixL getL(const MatrixType& m) { return m; }
+  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+};
+
+template<typename _MatrixType,int _UpLo, typename _Ordering> struct traits<SimplicialLDLT<_MatrixType,_UpLo,_Ordering> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Ordering OrderingType;
+  enum { UpLo = _UpLo };
+  typedef typename MatrixType::Scalar                             Scalar;
+  typedef typename MatrixType::Index                              Index;
+  typedef SparseMatrix<Scalar, ColMajor, Index>                   CholMatrixType;
+  typedef SparseTriangularView<CholMatrixType, Eigen::UnitLower>  MatrixL;
+  typedef SparseTriangularView<typename CholMatrixType::AdjointReturnType, Eigen::UnitUpper> MatrixU;
+  static inline MatrixL getL(const MatrixType& m) { return m; }
+  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+};
+
+template<typename _MatrixType, int _UpLo, typename _Ordering> struct traits<SimplicialCholesky<_MatrixType,_UpLo,_Ordering> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Ordering OrderingType;
+  enum { UpLo = _UpLo };
+};
+
+}
+
+/** \ingroup SparseCholesky_Module
+  * \class SimplicialLLT
+  * \brief A direct sparse LLT Cholesky factorizations
+  *
+  * This class provides a LL^T Cholesky factorizations of sparse matrices that are
+  * selfadjoint and positive definite. The factorization allows for solving A.X = B where
+  * X and B can be either dense or sparse.
+  * 
+  * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
+  * such that the factorized matrix is P A P^-1.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
+  *               or Upper. Default is Lower.
+  * \tparam _Ordering The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<>
+  *
+  * \sa class SimplicialLDLT, class AMDOrdering, class NaturalOrdering
+  */
+template<typename _MatrixType, int _UpLo, typename _Ordering>
+    class SimplicialLLT : public SimplicialCholeskyBase<SimplicialLLT<_MatrixType,_UpLo,_Ordering> >
+{
+public:
+    typedef _MatrixType MatrixType;
+    enum { UpLo = _UpLo };
+    typedef SimplicialCholeskyBase<SimplicialLLT> Base;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
+    typedef Matrix<Scalar,Dynamic,1> VectorType;
+    typedef internal::traits<SimplicialLLT> Traits;
+    typedef typename Traits::MatrixL  MatrixL;
+    typedef typename Traits::MatrixU  MatrixU;
+public:
+    /** Default constructor */
+    SimplicialLLT() : Base() {}
+    /** Constructs and performs the LLT factorization of \a matrix */
+    SimplicialLLT(const MatrixType& matrix)
+        : Base(matrix) {}
+
+    /** \returns an expression of the factor L */
+    inline const MatrixL matrixL() const {
+        eigen_assert(Base::m_factorizationIsOk && "Simplicial LLT not factorized");
+        return Traits::getL(Base::m_matrix);
+    }
+
+    /** \returns an expression of the factor U (= L^*) */
+    inline const MatrixU matrixU() const {
+        eigen_assert(Base::m_factorizationIsOk && "Simplicial LLT not factorized");
+        return Traits::getU(Base::m_matrix);
+    }
+    
+    /** Computes the sparse Cholesky decomposition of \a matrix */
+    SimplicialLLT& compute(const MatrixType& matrix)
+    {
+      Base::template compute<false>(matrix);
+      return *this;
+    }
+
+    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
+      *
+      * \sa factorize()
+      */
+    void analyzePattern(const MatrixType& a)
+    {
+      Base::analyzePattern(a, false);
+    }
+
+    /** Performs a numeric decomposition of \a matrix
+      *
+      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+      *
+      * \sa analyzePattern()
+      */
+    void factorize(const MatrixType& a)
+    {
+      Base::template factorize<false>(a);
+    }
+
+    /** \returns the determinant of the underlying matrix from the current factorization */
+    Scalar determinant() const
+    {
+      Scalar detL = Base::m_matrix.diagonal().prod();
+      return numext::abs2(detL);
+    }
+};
+
+/** \ingroup SparseCholesky_Module
+  * \class SimplicialLDLT
+  * \brief A direct sparse LDLT Cholesky factorizations without square root.
+  *
+  * This class provides a LDL^T Cholesky factorizations without square root of sparse matrices that are
+  * selfadjoint and positive definite. The factorization allows for solving A.X = B where
+  * X and B can be either dense or sparse.
+  * 
+  * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
+  * such that the factorized matrix is P A P^-1.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
+  *               or Upper. Default is Lower.
+  * \tparam _Ordering The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<>
+  *
+  * \sa class SimplicialLLT, class AMDOrdering, class NaturalOrdering
+  */
+template<typename _MatrixType, int _UpLo, typename _Ordering>
+    class SimplicialLDLT : public SimplicialCholeskyBase<SimplicialLDLT<_MatrixType,_UpLo,_Ordering> >
+{
+public:
+    typedef _MatrixType MatrixType;
+    enum { UpLo = _UpLo };
+    typedef SimplicialCholeskyBase<SimplicialLDLT> Base;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
+    typedef Matrix<Scalar,Dynamic,1> VectorType;
+    typedef internal::traits<SimplicialLDLT> Traits;
+    typedef typename Traits::MatrixL  MatrixL;
+    typedef typename Traits::MatrixU  MatrixU;
+public:
+    /** Default constructor */
+    SimplicialLDLT() : Base() {}
+
+    /** Constructs and performs the LLT factorization of \a matrix */
+    SimplicialLDLT(const MatrixType& matrix)
+        : Base(matrix) {}
+
+    /** \returns a vector expression of the diagonal D */
+    inline const VectorType vectorD() const {
+        eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
+        return Base::m_diag;
+    }
+    /** \returns an expression of the factor L */
+    inline const MatrixL matrixL() const {
+        eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
+        return Traits::getL(Base::m_matrix);
+    }
+
+    /** \returns an expression of the factor U (= L^*) */
+    inline const MatrixU matrixU() const {
+        eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
+        return Traits::getU(Base::m_matrix);
+    }
+
+    /** Computes the sparse Cholesky decomposition of \a matrix */
+    SimplicialLDLT& compute(const MatrixType& matrix)
+    {
+      Base::template compute<true>(matrix);
+      return *this;
+    }
+    
+    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
+      *
+      * \sa factorize()
+      */
+    void analyzePattern(const MatrixType& a)
+    {
+      Base::analyzePattern(a, true);
+    }
+
+    /** Performs a numeric decomposition of \a matrix
+      *
+      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+      *
+      * \sa analyzePattern()
+      */
+    void factorize(const MatrixType& a)
+    {
+      Base::template factorize<true>(a);
+    }
+
+    /** \returns the determinant of the underlying matrix from the current factorization */
+    Scalar determinant() const
+    {
+      return Base::m_diag.prod();
+    }
+};
+
+/** \deprecated use SimplicialLDLT or class SimplicialLLT
+  * \ingroup SparseCholesky_Module
+  * \class SimplicialCholesky
+  *
+  * \sa class SimplicialLDLT, class SimplicialLLT
+  */
+template<typename _MatrixType, int _UpLo, typename _Ordering>
+    class SimplicialCholesky : public SimplicialCholeskyBase<SimplicialCholesky<_MatrixType,_UpLo,_Ordering> >
+{
+public:
+    typedef _MatrixType MatrixType;
+    enum { UpLo = _UpLo };
+    typedef SimplicialCholeskyBase<SimplicialCholesky> Base;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
+    typedef Matrix<Scalar,Dynamic,1> VectorType;
+    typedef internal::traits<SimplicialCholesky> Traits;
+    typedef internal::traits<SimplicialLDLT<MatrixType,UpLo> > LDLTTraits;
+    typedef internal::traits<SimplicialLLT<MatrixType,UpLo>  > LLTTraits;
+  public:
+    SimplicialCholesky() : Base(), m_LDLT(true) {}
+
+    SimplicialCholesky(const MatrixType& matrix)
+      : Base(), m_LDLT(true)
+    {
+      compute(matrix);
+    }
+
+    SimplicialCholesky& setMode(SimplicialCholeskyMode mode)
+    {
+      switch(mode)
+      {
+      case SimplicialCholeskyLLT:
+        m_LDLT = false;
+        break;
+      case SimplicialCholeskyLDLT:
+        m_LDLT = true;
+        break;
+      default:
+        break;
+      }
+
+      return *this;
+    }
+
+    inline const VectorType vectorD() const {
+        eigen_assert(Base::m_factorizationIsOk && "Simplicial Cholesky not factorized");
+        return Base::m_diag;
+    }
+    inline const CholMatrixType rawMatrix() const {
+        eigen_assert(Base::m_factorizationIsOk && "Simplicial Cholesky not factorized");
+        return Base::m_matrix;
+    }
+    
+    /** Computes the sparse Cholesky decomposition of \a matrix */
+    SimplicialCholesky& compute(const MatrixType& matrix)
+    {
+      if(m_LDLT)
+        Base::template compute<true>(matrix);
+      else
+        Base::template compute<false>(matrix);
+      return *this;
+    }
+
+    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
+      *
+      * \sa factorize()
+      */
+    void analyzePattern(const MatrixType& a)
+    {
+      Base::analyzePattern(a, m_LDLT);
+    }
+
+    /** Performs a numeric decomposition of \a matrix
+      *
+      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+      *
+      * \sa analyzePattern()
+      */
+    void factorize(const MatrixType& a)
+    {
+      if(m_LDLT)
+        Base::template factorize<true>(a);
+      else
+        Base::template factorize<false>(a);
+    }
+
+    /** \internal */
+    template<typename Rhs,typename Dest>
+    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    {
+      eigen_assert(Base::m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
+      eigen_assert(Base::m_matrix.rows()==b.rows());
+
+      if(Base::m_info!=Success)
+        return;
+
+      if(Base::m_P.size()>0)
+        dest = Base::m_P * b;
+      else
+        dest = b;
+
+      if(Base::m_matrix.nonZeros()>0) // otherwise L==I
+      {
+        if(m_LDLT)
+          LDLTTraits::getL(Base::m_matrix).solveInPlace(dest);
+        else
+          LLTTraits::getL(Base::m_matrix).solveInPlace(dest);
+      }
+
+      if(Base::m_diag.size()>0)
+        dest = Base::m_diag.asDiagonal().inverse() * dest;
+
+      if (Base::m_matrix.nonZeros()>0) // otherwise I==I
+      {
+        if(m_LDLT)
+          LDLTTraits::getU(Base::m_matrix).solveInPlace(dest);
+        else
+          LLTTraits::getU(Base::m_matrix).solveInPlace(dest);
+      }
+
+      if(Base::m_P.size()>0)
+        dest = Base::m_Pinv * dest;
+    }
+    
+    Scalar determinant() const
+    {
+      if(m_LDLT)
+      {
+        return Base::m_diag.prod();
+      }
+      else
+      {
+        Scalar detL = Diagonal<const CholMatrixType>(Base::m_matrix).prod();
+        return numext::abs2(detL);
+      }
+    }
+    
+  protected:
+    bool m_LDLT;
+};
+
+template<typename Derived>
+void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, CholMatrixType& ap)
+{
+  eigen_assert(a.rows()==a.cols());
+  const Index size = a.rows();
+  // Note that amd compute the inverse permutation
+  {
+    CholMatrixType C;
+    C = a.template selfadjointView<UpLo>();
+    
+    OrderingType ordering;
+    ordering(C,m_Pinv);
+  }
+
+  if(m_Pinv.size()>0)
+    m_P = m_Pinv.inverse();
+  else
+    m_P.resize(0);
+
+  ap.resize(size,size);
+  ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
+}
+
+namespace internal {
+  
+template<typename Derived, typename Rhs>
+struct solve_retval<SimplicialCholeskyBase<Derived>, Rhs>
+  : solve_retval_base<SimplicialCholeskyBase<Derived>, Rhs>
+{
+  typedef SimplicialCholeskyBase<Derived> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec().derived()._solve(rhs(),dst);
+  }
+};
+
+template<typename Derived, typename Rhs>
+struct sparse_solve_retval<SimplicialCholeskyBase<Derived>, Rhs>
+  : sparse_solve_retval_base<SimplicialCholeskyBase<Derived>, Rhs>
+{
+  typedef SimplicialCholeskyBase<Derived> Dec;
+  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    this->defaultEvalTo(dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SIMPLICIAL_CHOLESKY_H
diff --git a/include/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/include/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
new file mode 100644
index 0000000..7aaf702
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
@@ -0,0 +1,199 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2012 Gael Guennebaud <gael.guennebaud at inria.fr>
+
+/*
+
+NOTE: thes functions vave been adapted from the LDL library:
+
+LDL Copyright (c) 2005 by Timothy A. Davis.  All Rights Reserved.
+
+LDL License:
+
+    Your use or distribution of LDL or any modified version of
+    LDL implies that you agree to this License.
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+    USA
+
+    Permission is hereby granted to use or copy this program under the
+    terms of the GNU LGPL, provided that the Copyright, this License,
+    and the Availability of the original version is retained on all copies.
+    User documentation of any code that uses this code or any modified
+    version of this code must cite the Copyright, this License, the
+    Availability note, and "Used by permission." Permission to modify
+    the code and to distribute modified code is granted, provided the
+    Copyright, this License, and the Availability note are retained,
+    and a notice that the code was modified is included.
+ */
+
+#include "../Core/util/NonMPL2.h"
+
+#ifndef EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H
+#define EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H
+
+namespace Eigen {
+
+template<typename Derived>
+void SimplicialCholeskyBase<Derived>::analyzePattern_preordered(const CholMatrixType& ap, bool doLDLT)
+{
+  const Index size = ap.rows();
+  m_matrix.resize(size, size);
+  m_parent.resize(size);
+  m_nonZerosPerCol.resize(size);
+
+  ei_declare_aligned_stack_constructed_variable(Index, tags, size, 0);
+
+  for(Index k = 0; k < size; ++k)
+  {
+    /* L(k,:) pattern: all nodes reachable in etree from nz in A(0:k-1,k) */
+    m_parent[k] = -1;             /* parent of k is not yet known */
+    tags[k] = k;                  /* mark node k as visited */
+    m_nonZerosPerCol[k] = 0;      /* count of nonzeros in column k of L */
+    for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it)
+    {
+      Index i = it.index();
+      if(i < k)
+      {
+        /* follow path from i to root of etree, stop at flagged node */
+        for(; tags[i] != k; i = m_parent[i])
+        {
+          /* find parent of i if not yet determined */
+          if (m_parent[i] == -1)
+            m_parent[i] = k;
+          m_nonZerosPerCol[i]++;        /* L (k,i) is nonzero */
+          tags[i] = k;                  /* mark i as visited */
+        }
+      }
+    }
+  }
+
+  /* construct Lp index array from m_nonZerosPerCol column counts */
+  Index* Lp = m_matrix.outerIndexPtr();
+  Lp[0] = 0;
+  for(Index k = 0; k < size; ++k)
+    Lp[k+1] = Lp[k] + m_nonZerosPerCol[k] + (doLDLT ? 0 : 1);
+
+  m_matrix.resizeNonZeros(Lp[size]);
+
+  m_isInitialized     = true;
+  m_info              = Success;
+  m_analysisIsOk      = true;
+  m_factorizationIsOk = false;
+}
+
+
+template<typename Derived>
+template<bool DoLDLT>
+void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType& ap)
+{
+  using std::sqrt;
+
+  eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
+  eigen_assert(ap.rows()==ap.cols());
+  const Index size = ap.rows();
+  eigen_assert(m_parent.size()==size);
+  eigen_assert(m_nonZerosPerCol.size()==size);
+
+  const Index* Lp = m_matrix.outerIndexPtr();
+  Index* Li = m_matrix.innerIndexPtr();
+  Scalar* Lx = m_matrix.valuePtr();
+
+  ei_declare_aligned_stack_constructed_variable(Scalar, y, size, 0);
+  ei_declare_aligned_stack_constructed_variable(Index,  pattern, size, 0);
+  ei_declare_aligned_stack_constructed_variable(Index,  tags, size, 0);
+
+  bool ok = true;
+  m_diag.resize(DoLDLT ? size : 0);
+
+  for(Index k = 0; k < size; ++k)
+  {
+    // compute nonzero pattern of kth row of L, in topological order
+    y[k] = 0.0;                     // Y(0:k) is now all zero
+    Index top = size;               // stack for pattern is empty
+    tags[k] = k;                    // mark node k as visited
+    m_nonZerosPerCol[k] = 0;        // count of nonzeros in column k of L
+    for(typename MatrixType::InnerIterator it(ap,k); it; ++it)
+    {
+      Index i = it.index();
+      if(i <= k)
+      {
+        y[i] += numext::conj(it.value());            /* scatter A(i,k) into Y (sum duplicates) */
+        Index len;
+        for(len = 0; tags[i] != k; i = m_parent[i])
+        {
+          pattern[len++] = i;     /* L(k,i) is nonzero */
+          tags[i] = k;            /* mark i as visited */
+        }
+        while(len > 0)
+          pattern[--top] = pattern[--len];
+      }
+    }
+
+    /* compute numerical values kth row of L (a sparse triangular solve) */
+
+    RealScalar d = numext::real(y[k]) * m_shiftScale + m_shiftOffset;    // get D(k,k), apply the shift function, and clear Y(k)
+    y[k] = 0.0;
+    for(; top < size; ++top)
+    {
+      Index i = pattern[top];       /* pattern[top:n-1] is pattern of L(:,k) */
+      Scalar yi = y[i];             /* get and clear Y(i) */
+      y[i] = 0.0;
+
+      /* the nonzero entry L(k,i) */
+      Scalar l_ki;
+      if(DoLDLT)
+        l_ki = yi / m_diag[i];
+      else
+        yi = l_ki = yi / Lx[Lp[i]];
+
+      Index p2 = Lp[i] + m_nonZerosPerCol[i];
+      Index p;
+      for(p = Lp[i] + (DoLDLT ? 0 : 1); p < p2; ++p)
+        y[Li[p]] -= numext::conj(Lx[p]) * yi;
+      d -= numext::real(l_ki * numext::conj(yi));
+      Li[p] = k;                          /* store L(k,i) in column form of L */
+      Lx[p] = l_ki;
+      ++m_nonZerosPerCol[i];              /* increment count of nonzeros in col i */
+    }
+    if(DoLDLT)
+    {
+      m_diag[k] = d;
+      if(d == RealScalar(0))
+      {
+        ok = false;                         /* failure, D(k,k) is zero */
+        break;
+      }
+    }
+    else
+    {
+      Index p = Lp[k] + m_nonZerosPerCol[k]++;
+      Li[p] = k ;                /* store L(k,k) = sqrt (d) in column k */
+      if(d <= RealScalar(0)) {
+        ok = false;              /* failure, matrix is not positive definite */
+        break;
+      }
+      Lx[p] = sqrt(d) ;
+    }
+  }
+
+  m_info = ok ? Success : NumericalIssue;
+  m_factorizationIsOk = true;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H
diff --git a/include/eigen3/Eigen/src/SparseCore/AmbiVector.h b/include/eigen3/Eigen/src/SparseCore/AmbiVector.h
new file mode 100644
index 0000000..220c645
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/AmbiVector.h
@@ -0,0 +1,373 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_AMBIVECTOR_H
+#define EIGEN_AMBIVECTOR_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal
+  * Hybrid sparse/dense vector class designed for intensive read-write operations.
+  *
+  * See BasicSparseLLT and SparseProduct for usage examples.
+  */
+template<typename _Scalar, typename _Index>
+class AmbiVector
+{
+  public:
+    typedef _Scalar Scalar;
+    typedef _Index Index;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    AmbiVector(Index size)
+      : m_buffer(0), m_zero(0), m_size(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1)
+    {
+      resize(size);
+    }
+
+    void init(double estimatedDensity);
+    void init(int mode);
+
+    Index nonZeros() const;
+
+    /** Specifies a sub-vector to work on */
+    void setBounds(Index start, Index end) { m_start = start; m_end = end; }
+
+    void setZero();
+
+    void restart();
+    Scalar& coeffRef(Index i);
+    Scalar& coeff(Index i);
+
+    class Iterator;
+
+    ~AmbiVector() { delete[] m_buffer; }
+
+    void resize(Index size)
+    {
+      if (m_allocatedSize < size)
+        reallocate(size);
+      m_size = size;
+    }
+
+    Index size() const { return m_size; }
+
+  protected:
+
+    void reallocate(Index size)
+    {
+      // if the size of the matrix is not too large, let's allocate a bit more than needed such
+      // that we can handle dense vector even in sparse mode.
+      delete[] m_buffer;
+      if (size<1000)
+      {
+        Index allocSize = (size * sizeof(ListEl) + sizeof(Scalar) - 1)/sizeof(Scalar);
+        m_allocatedElements = (allocSize*sizeof(Scalar))/sizeof(ListEl);
+        m_buffer = new Scalar[allocSize];
+      }
+      else
+      {
+        m_allocatedElements = (size*sizeof(Scalar))/sizeof(ListEl);
+        m_buffer = new Scalar[size];
+      }
+      m_size = size;
+      m_start = 0;
+      m_end = m_size;
+    }
+
+    void reallocateSparse()
+    {
+      Index copyElements = m_allocatedElements;
+      m_allocatedElements = (std::min)(Index(m_allocatedElements*1.5),m_size);
+      Index allocSize = m_allocatedElements * sizeof(ListEl);
+      allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar);
+      Scalar* newBuffer = new Scalar[allocSize];
+      memcpy(newBuffer,  m_buffer,  copyElements * sizeof(ListEl));
+      delete[] m_buffer;
+      m_buffer = newBuffer;
+    }
+
+  protected:
+    // element type of the linked list
+    struct ListEl
+    {
+      Index next;
+      Index index;
+      Scalar value;
+    };
+
+    // used to store data in both mode
+    Scalar* m_buffer;
+    Scalar m_zero;
+    Index m_size;
+    Index m_start;
+    Index m_end;
+    Index m_allocatedSize;
+    Index m_allocatedElements;
+    Index m_mode;
+
+    // linked list mode
+    Index m_llStart;
+    Index m_llCurrent;
+    Index m_llSize;
+};
+
+/** \returns the number of non zeros in the current sub vector */
+template<typename _Scalar,typename _Index>
+_Index AmbiVector<_Scalar,_Index>::nonZeros() const
+{
+  if (m_mode==IsSparse)
+    return m_llSize;
+  else
+    return m_end - m_start;
+}
+
+template<typename _Scalar,typename _Index>
+void AmbiVector<_Scalar,_Index>::init(double estimatedDensity)
+{
+  if (estimatedDensity>0.1)
+    init(IsDense);
+  else
+    init(IsSparse);
+}
+
+template<typename _Scalar,typename _Index>
+void AmbiVector<_Scalar,_Index>::init(int mode)
+{
+  m_mode = mode;
+  if (m_mode==IsSparse)
+  {
+    m_llSize = 0;
+    m_llStart = -1;
+  }
+}
+
+/** Must be called whenever we might perform a write access
+  * with an index smaller than the previous one.
+  *
+  * Don't worry, this function is extremely cheap.
+  */
+template<typename _Scalar,typename _Index>
+void AmbiVector<_Scalar,_Index>::restart()
+{
+  m_llCurrent = m_llStart;
+}
+
+/** Set all coefficients of current subvector to zero */
+template<typename _Scalar,typename _Index>
+void AmbiVector<_Scalar,_Index>::setZero()
+{
+  if (m_mode==IsDense)
+  {
+    for (Index i=m_start; i<m_end; ++i)
+      m_buffer[i] = Scalar(0);
+  }
+  else
+  {
+    eigen_assert(m_mode==IsSparse);
+    m_llSize = 0;
+    m_llStart = -1;
+  }
+}
+
+template<typename _Scalar,typename _Index>
+_Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
+{
+  if (m_mode==IsDense)
+    return m_buffer[i];
+  else
+  {
+    ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_buffer);
+    // TODO factorize the following code to reduce code generation
+    eigen_assert(m_mode==IsSparse);
+    if (m_llSize==0)
+    {
+      // this is the first element
+      m_llStart = 0;
+      m_llCurrent = 0;
+      ++m_llSize;
+      llElements[0].value = Scalar(0);
+      llElements[0].index = i;
+      llElements[0].next = -1;
+      return llElements[0].value;
+    }
+    else if (i<llElements[m_llStart].index)
+    {
+      // this is going to be the new first element of the list
+      ListEl& el = llElements[m_llSize];
+      el.value = Scalar(0);
+      el.index = i;
+      el.next = m_llStart;
+      m_llStart = m_llSize;
+      ++m_llSize;
+      m_llCurrent = m_llStart;
+      return el.value;
+    }
+    else
+    {
+      Index nextel = llElements[m_llCurrent].next;
+      eigen_assert(i>=llElements[m_llCurrent].index && "you must call restart() before inserting an element with lower or equal index");
+      while (nextel >= 0 && llElements[nextel].index<=i)
+      {
+        m_llCurrent = nextel;
+        nextel = llElements[nextel].next;
+      }
+
+      if (llElements[m_llCurrent].index==i)
+      {
+        // the coefficient already exists and we found it !
+        return llElements[m_llCurrent].value;
+      }
+      else
+      {
+        if (m_llSize>=m_allocatedElements)
+        {
+          reallocateSparse();
+          llElements = reinterpret_cast<ListEl*>(m_buffer);
+        }
+        eigen_internal_assert(m_llSize<m_allocatedElements && "internal error: overflow in sparse mode");
+        // let's insert a new coefficient
+        ListEl& el = llElements[m_llSize];
+        el.value = Scalar(0);
+        el.index = i;
+        el.next = llElements[m_llCurrent].next;
+        llElements[m_llCurrent].next = m_llSize;
+        ++m_llSize;
+        return el.value;
+      }
+    }
+  }
+}
+
+template<typename _Scalar,typename _Index>
+_Scalar& AmbiVector<_Scalar,_Index>::coeff(_Index i)
+{
+  if (m_mode==IsDense)
+    return m_buffer[i];
+  else
+  {
+    ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_buffer);
+    eigen_assert(m_mode==IsSparse);
+    if ((m_llSize==0) || (i<llElements[m_llStart].index))
+    {
+      return m_zero;
+    }
+    else
+    {
+      Index elid = m_llStart;
+      while (elid >= 0 && llElements[elid].index<i)
+        elid = llElements[elid].next;
+
+      if (llElements[elid].index==i)
+        return llElements[m_llCurrent].value;
+      else
+        return m_zero;
+    }
+  }
+}
+
+/** Iterator over the nonzero coefficients */
+template<typename _Scalar,typename _Index>
+class AmbiVector<_Scalar,_Index>::Iterator
+{
+  public:
+    typedef _Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    /** Default constructor
+      * \param vec the vector on which we iterate
+      * \param epsilon the minimal value used to prune zero coefficients.
+      * In practice, all coefficients having a magnitude smaller than \a epsilon
+      * are skipped.
+      */
+    Iterator(const AmbiVector& vec, const RealScalar& epsilon = 0)
+      : m_vector(vec)
+    {
+      using std::abs;
+      m_epsilon = epsilon;
+      m_isDense = m_vector.m_mode==IsDense;
+      if (m_isDense)
+      {
+        m_currentEl = 0;   // this is to avoid a compilation warning
+        m_cachedValue = 0; // this is to avoid a compilation warning
+        m_cachedIndex = m_vector.m_start-1;
+        ++(*this);
+      }
+      else
+      {
+        ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_vector.m_buffer);
+        m_currentEl = m_vector.m_llStart;
+        while (m_currentEl>=0 && abs(llElements[m_currentEl].value)<=m_epsilon)
+          m_currentEl = llElements[m_currentEl].next;
+        if (m_currentEl<0)
+        {
+          m_cachedValue = 0; // this is to avoid a compilation warning
+          m_cachedIndex = -1;
+        }
+        else
+        {
+          m_cachedIndex = llElements[m_currentEl].index;
+          m_cachedValue = llElements[m_currentEl].value;
+        }
+      }
+    }
+
+    Index index() const { return m_cachedIndex; }
+    Scalar value() const { return m_cachedValue; }
+
+    operator bool() const { return m_cachedIndex>=0; }
+
+    Iterator& operator++()
+    {
+      using std::abs;
+      if (m_isDense)
+      {
+        do {
+          ++m_cachedIndex;
+        } while (m_cachedIndex<m_vector.m_end && abs(m_vector.m_buffer[m_cachedIndex])<m_epsilon);
+        if (m_cachedIndex<m_vector.m_end)
+          m_cachedValue = m_vector.m_buffer[m_cachedIndex];
+        else
+          m_cachedIndex=-1;
+      }
+      else
+      {
+        ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_vector.m_buffer);
+        do {
+          m_currentEl = llElements[m_currentEl].next;
+        } while (m_currentEl>=0 && abs(llElements[m_currentEl].value)<m_epsilon);
+        if (m_currentEl<0)
+        {
+          m_cachedIndex = -1;
+        }
+        else
+        {
+          m_cachedIndex = llElements[m_currentEl].index;
+          m_cachedValue = llElements[m_currentEl].value;
+        }
+      }
+      return *this;
+    }
+
+  protected:
+    const AmbiVector& m_vector; // the target vector
+    Index m_currentEl;            // the current element in sparse/linked-list mode
+    RealScalar m_epsilon;       // epsilon used to prune zero coefficients
+    Index m_cachedIndex;          // current coordinate
+    Scalar m_cachedValue;       // current value
+    bool m_isDense;             // mode of the vector
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_AMBIVECTOR_H
diff --git a/include/eigen3/Eigen/src/SparseCore/CompressedStorage.h b/include/eigen3/Eigen/src/SparseCore/CompressedStorage.h
new file mode 100644
index 0000000..a667cb5
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/CompressedStorage.h
@@ -0,0 +1,233 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPRESSED_STORAGE_H
+#define EIGEN_COMPRESSED_STORAGE_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal
+  * Stores a sparse set of values as a list of values and a list of indices.
+  *
+  */
+template<typename _Scalar,typename _Index>
+class CompressedStorage
+{
+  public:
+
+    typedef _Scalar Scalar;
+    typedef _Index Index;
+
+  protected:
+
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  public:
+
+    CompressedStorage()
+      : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0)
+    {}
+
+    CompressedStorage(size_t size)
+      : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0)
+    {
+      resize(size);
+    }
+
+    CompressedStorage(const CompressedStorage& other)
+      : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0)
+    {
+      *this = other;
+    }
+
+    CompressedStorage& operator=(const CompressedStorage& other)
+    {
+      resize(other.size());
+      internal::smart_copy(other.m_values,  other.m_values  + m_size, m_values);
+      internal::smart_copy(other.m_indices, other.m_indices + m_size, m_indices);
+      return *this;
+    }
+
+    void swap(CompressedStorage& other)
+    {
+      std::swap(m_values, other.m_values);
+      std::swap(m_indices, other.m_indices);
+      std::swap(m_size, other.m_size);
+      std::swap(m_allocatedSize, other.m_allocatedSize);
+    }
+
+    ~CompressedStorage()
+    {
+      delete[] m_values;
+      delete[] m_indices;
+    }
+
+    void reserve(size_t size)
+    {
+      size_t newAllocatedSize = m_size + size;
+      if (newAllocatedSize > m_allocatedSize)
+        reallocate(newAllocatedSize);
+    }
+
+    void squeeze()
+    {
+      if (m_allocatedSize>m_size)
+        reallocate(m_size);
+    }
+
+    void resize(size_t size, double reserveSizeFactor = 0)
+    {
+      if (m_allocatedSize<size)
+        reallocate(size + size_t(reserveSizeFactor*double(size)));
+      m_size = size;
+    }
+
+    void append(const Scalar& v, Index i)
+    {
+      Index id = static_cast<Index>(m_size);
+      resize(m_size+1, 1);
+      m_values[id] = v;
+      m_indices[id] = i;
+    }
+
+    inline size_t size() const { return m_size; }
+    inline size_t allocatedSize() const { return m_allocatedSize; }
+    inline void clear() { m_size = 0; }
+
+    inline Scalar& value(size_t i) { return m_values[i]; }
+    inline const Scalar& value(size_t i) const { return m_values[i]; }
+
+    inline Index& index(size_t i) { return m_indices[i]; }
+    inline const Index& index(size_t i) const { return m_indices[i]; }
+
+    static CompressedStorage Map(Index* indices, Scalar* values, size_t size)
+    {
+      CompressedStorage res;
+      res.m_indices = indices;
+      res.m_values = values;
+      res.m_allocatedSize = res.m_size = size;
+      return res;
+    }
+
+    /** \returns the largest \c k such that for all \c j in [0,k) index[\c j]\<\a key */
+    inline Index searchLowerIndex(Index key) const
+    {
+      return searchLowerIndex(0, m_size, key);
+    }
+
+    /** \returns the largest \c k in [start,end) such that for all \c j in [start,k) index[\c j]\<\a key */
+    inline Index searchLowerIndex(size_t start, size_t end, Index key) const
+    {
+      while(end>start)
+      {
+        size_t mid = (end+start)>>1;
+        if (m_indices[mid]<key)
+          start = mid+1;
+        else
+          end = mid;
+      }
+      return static_cast<Index>(start);
+    }
+
+    /** \returns the stored value at index \a key
+      * If the value does not exist, then the value \a defaultValue is returned without any insertion. */
+    inline Scalar at(Index key, const Scalar& defaultValue = Scalar(0)) const
+    {
+      if (m_size==0)
+        return defaultValue;
+      else if (key==m_indices[m_size-1])
+        return m_values[m_size-1];
+      // ^^  optimization: let's first check if it is the last coefficient
+      // (very common in high level algorithms)
+      const size_t id = searchLowerIndex(0,m_size-1,key);
+      return ((id<m_size) && (m_indices[id]==key)) ? m_values[id] : defaultValue;
+    }
+
+    /** Like at(), but the search is performed in the range [start,end) */
+    inline Scalar atInRange(size_t start, size_t end, Index key, const Scalar& defaultValue = Scalar(0)) const
+    {
+      if (start>=end)
+        return Scalar(0);
+      else if (end>start && key==m_indices[end-1])
+        return m_values[end-1];
+      // ^^  optimization: let's first check if it is the last coefficient
+      // (very common in high level algorithms)
+      const size_t id = searchLowerIndex(start,end-1,key);
+      return ((id<end) && (m_indices[id]==key)) ? m_values[id] : defaultValue;
+    }
+
+    /** \returns a reference to the value at index \a key
+      * If the value does not exist, then the value \a defaultValue is inserted
+      * such that the keys are sorted. */
+    inline Scalar& atWithInsertion(Index key, const Scalar& defaultValue = Scalar(0))
+    {
+      size_t id = searchLowerIndex(0,m_size,key);
+      if (id>=m_size || m_indices[id]!=key)
+      {
+        resize(m_size+1,1);
+        for (size_t j=m_size-1; j>id; --j)
+        {
+          m_indices[j] = m_indices[j-1];
+          m_values[j] = m_values[j-1];
+        }
+        m_indices[id] = key;
+        m_values[id] = defaultValue;
+      }
+      return m_values[id];
+    }
+
+    void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
+    {
+      size_t k = 0;
+      size_t n = size();
+      for (size_t i=0; i<n; ++i)
+      {
+        if (!internal::isMuchSmallerThan(value(i), reference, epsilon))
+        {
+          value(k) = value(i);
+          index(k) = index(i);
+          ++k;
+        }
+      }
+      resize(k,0);
+    }
+
+  protected:
+
+    inline void reallocate(size_t size)
+    {
+      Scalar* newValues  = new Scalar[size];
+      Index* newIndices = new Index[size];
+      size_t copySize = (std::min)(size, m_size);
+      // copy
+      internal::smart_copy(m_values, m_values+copySize, newValues);
+      internal::smart_copy(m_indices, m_indices+copySize, newIndices);
+      // delete old stuff
+      delete[] m_values;
+      delete[] m_indices;
+      m_values = newValues;
+      m_indices = newIndices;
+      m_allocatedSize = size;
+    }
+
+  protected:
+    Scalar* m_values;
+    Index* m_indices;
+    size_t m_size;
+    size_t m_allocatedSize;
+
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPRESSED_STORAGE_H
diff --git a/include/eigen3/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/include/eigen3/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
new file mode 100644
index 0000000..5c320e2
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
@@ -0,0 +1,245 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
+#define EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, typename ResultType>
+static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+{
+  typedef typename remove_all<Lhs>::type::Scalar Scalar;
+  typedef typename remove_all<Lhs>::type::Index Index;
+
+  // make sure to call innerSize/outerSize since we fake the storage order.
+  Index rows = lhs.innerSize();
+  Index cols = rhs.outerSize();
+  eigen_assert(lhs.outerSize() == rhs.innerSize());
+
+  std::vector<bool> mask(rows,false);
+  Matrix<Scalar,Dynamic,1> values(rows);
+  Matrix<Index,Dynamic,1>  indices(rows);
+
+  // estimate the number of non zero entries
+  // given a rhs column containing Y non zeros, we assume that the respective Y columns
+  // of the lhs differs in average of one non zeros, thus the number of non zeros for
+  // the product of a rhs column with the lhs is X+Y where X is the average number of non zero
+  // per column of the lhs.
+  // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
+  Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros();
+
+  res.setZero();
+  res.reserve(Index(estimated_nnz_prod));
+  // we compute each column of the result, one after the other
+  for (Index j=0; j<cols; ++j)
+  {
+
+    res.startVec(j);
+    Index nnz = 0;
+    for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
+    {
+      Scalar y = rhsIt.value();
+      Index k = rhsIt.index();
+      for (typename Lhs::InnerIterator lhsIt(lhs, k); lhsIt; ++lhsIt)
+      {
+        Index i = lhsIt.index();
+        Scalar x = lhsIt.value();
+        if(!mask[i])
+        {
+          mask[i] = true;
+          values[i] = x * y;
+          indices[nnz] = i;
+          ++nnz;
+        }
+        else
+          values[i] += x * y;
+      }
+    }
+
+    // unordered insertion
+    for(Index k=0; k<nnz; ++k)
+    {
+      Index i = indices[k];
+      res.insertBackByOuterInnerUnordered(j,i) = values[i];
+      mask[i] = false;
+    }
+
+#if 0
+    // alternative ordered insertion code:
+
+    Index t200 = rows/(log2(200)*1.39);
+    Index t = (rows*100)/139;
+
+    // FIXME reserve nnz non zeros
+    // FIXME implement fast sort algorithms for very small nnz
+    // if the result is sparse enough => use a quick sort
+    // otherwise => loop through the entire vector
+    // In order to avoid to perform an expensive log2 when the
+    // result is clearly very sparse we use a linear bound up to 200.
+    //if((nnz<200 && nnz<t200) || nnz * log2(nnz) < t)
+    //res.startVec(j);
+    if(true)
+    {
+      if(nnz>1) std::sort(indices.data(),indices.data()+nnz);
+      for(Index k=0; k<nnz; ++k)
+      {
+        Index i = indices[k];
+        res.insertBackByOuterInner(j,i) = values[i];
+        mask[i] = false;
+      }
+    }
+    else
+    {
+      // dense path
+      for(Index i=0; i<rows; ++i)
+      {
+        if(mask[i])
+        {
+          mask[i] = false;
+          res.insertBackByOuterInner(j,i) = values[i];
+        }
+      }
+    }
+#endif
+
+  }
+  res.finalize();
+}
+
+
+} // end namespace internal
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, typename ResultType,
+  int LhsStorageOrder = (traits<Lhs>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+  int RhsStorageOrder = (traits<Rhs>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+  int ResStorageOrder = (traits<ResultType>::Flags&RowMajorBit) ? RowMajor : ColMajor>
+struct conservative_sparse_sparse_product_selector;
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,ColMajor>
+{
+  typedef typename remove_all<Lhs>::type LhsCleaned;
+  typedef typename LhsCleaned::Scalar Scalar;
+
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
+    ColMajorMatrix resCol(lhs.rows(),rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol);
+    // sort the non zeros:
+    RowMajorMatrix resRow(resCol);
+    res = resRow;
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor,ColMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+     typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
+     RowMajorMatrix rhsRow = rhs;
+     RowMajorMatrix resRow(lhs.rows(), rhs.cols());
+     internal::conservative_sparse_sparse_product_impl<RowMajorMatrix,Lhs,RowMajorMatrix>(rhsRow, lhs, resRow);
+     res = resRow;
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor,ColMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
+    RowMajorMatrix lhsRow = lhs;
+    RowMajorMatrix resRow(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorMatrix,RowMajorMatrix>(rhs, lhsRow, resRow);
+    res = resRow;
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor,ColMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
+    RowMajorMatrix resRow(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorMatrix>(rhs, lhs, resRow);
+    res = resRow;
+  }
+};
+
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,RowMajor>
+{
+  typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
+
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
+    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol);
+    res = resCol;
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor,RowMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
+    ColMajorMatrix lhsCol = lhs;
+    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<ColMajorMatrix,Rhs,ColMajorMatrix>(lhsCol, rhs, resCol);
+    res = resCol;
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor,RowMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
+    ColMajorMatrix rhsCol = rhs;
+    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorMatrix,ColMajorMatrix>(lhs, rhsCol, resCol);
+    res = resCol;
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor,RowMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
+    RowMajorMatrix resRow(lhs.rows(),rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorMatrix>(rhs, lhs, resRow);
+    // sort the non zeros:
+    ColMajorMatrix resCol(resRow);
+    res = resCol;
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
diff --git a/include/eigen3/Eigen/src/SparseCore/MappedSparseMatrix.h b/include/eigen3/Eigen/src/SparseCore/MappedSparseMatrix.h
new file mode 100644
index 0000000..ab1a266
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/MappedSparseMatrix.h
@@ -0,0 +1,181 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MAPPED_SPARSEMATRIX_H
+#define EIGEN_MAPPED_SPARSEMATRIX_H
+
+namespace Eigen { 
+
+/** \class MappedSparseMatrix
+  *
+  * \brief Sparse matrix
+  *
+  * \param _Scalar the scalar type, i.e. the type of the coefficients
+  *
+  * See http://www.netlib.org/linalg/html_templates/node91.html for details on the storage scheme.
+  *
+  */
+namespace internal {
+template<typename _Scalar, int _Flags, typename _Index>
+struct traits<MappedSparseMatrix<_Scalar, _Flags, _Index> > : traits<SparseMatrix<_Scalar, _Flags, _Index> >
+{};
+}
+
+template<typename _Scalar, int _Flags, typename _Index>
+class MappedSparseMatrix
+  : public SparseMatrixBase<MappedSparseMatrix<_Scalar, _Flags, _Index> >
+{
+  public:
+    EIGEN_SPARSE_PUBLIC_INTERFACE(MappedSparseMatrix)
+    enum { IsRowMajor = Base::IsRowMajor };
+
+  protected:
+
+    Index   m_outerSize;
+    Index   m_innerSize;
+    Index   m_nnz;
+    Index*  m_outerIndex;
+    Index*  m_innerIndices;
+    Scalar* m_values;
+
+  public:
+
+    inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
+    inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
+    inline Index innerSize() const { return m_innerSize; }
+    inline Index outerSize() const { return m_outerSize; }
+    
+    bool isCompressed() const { return true; }
+
+    //----------------------------------------
+    // direct access interface
+    inline const Scalar* valuePtr() const { return m_values; }
+    inline Scalar* valuePtr() { return m_values; }
+
+    inline const Index* innerIndexPtr() const { return m_innerIndices; }
+    inline Index* innerIndexPtr() { return m_innerIndices; }
+
+    inline const Index* outerIndexPtr() const { return m_outerIndex; }
+    inline Index* outerIndexPtr() { return m_outerIndex; }
+    //----------------------------------------
+
+    inline Scalar coeff(Index row, Index col) const
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+
+      Index start = m_outerIndex[outer];
+      Index end = m_outerIndex[outer+1];
+      if (start==end)
+        return Scalar(0);
+      else if (end>0 && inner==m_innerIndices[end-1])
+        return m_values[end-1];
+      // ^^  optimization: let's first check if it is the last coefficient
+      // (very common in high level algorithms)
+
+      const Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner);
+      const Index id = r-&m_innerIndices[0];
+      return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0);
+    }
+
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+
+      Index start = m_outerIndex[outer];
+      Index end = m_outerIndex[outer+1];
+      eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
+      eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient");
+      Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end],inner);
+      const Index id = r-&m_innerIndices[0];
+      eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient");
+      return m_values[id];
+    }
+
+    class InnerIterator;
+    class ReverseInnerIterator;
+
+    /** \returns the number of non zero coefficients */
+    inline Index nonZeros() const  { return m_nnz; }
+
+    inline MappedSparseMatrix(Index rows, Index cols, Index nnz, Index* outerIndexPtr, Index* innerIndexPtr, Scalar* valuePtr)
+      : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr),
+        m_innerIndices(innerIndexPtr), m_values(valuePtr)
+    {}
+
+    /** Empty destructor */
+    inline ~MappedSparseMatrix() {}
+};
+
+template<typename Scalar, int _Flags, typename _Index>
+class MappedSparseMatrix<Scalar,_Flags,_Index>::InnerIterator
+{
+  public:
+    InnerIterator(const MappedSparseMatrix& mat, Index outer)
+      : m_matrix(mat),
+        m_outer(outer),
+        m_id(mat.outerIndexPtr()[outer]),
+        m_start(m_id),
+        m_end(mat.outerIndexPtr()[outer+1])
+    {}
+
+    inline InnerIterator& operator++() { m_id++; return *this; }
+
+    inline Scalar value() const { return m_matrix.valuePtr()[m_id]; }
+    inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id]); }
+
+    inline Index index() const { return m_matrix.innerIndexPtr()[m_id]; }
+    inline Index row() const { return IsRowMajor ? m_outer : index(); }
+    inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+    inline operator bool() const { return (m_id < m_end) && (m_id>=m_start); }
+
+  protected:
+    const MappedSparseMatrix& m_matrix;
+    const Index m_outer;
+    Index m_id;
+    const Index m_start;
+    const Index m_end;
+};
+
+template<typename Scalar, int _Flags, typename _Index>
+class MappedSparseMatrix<Scalar,_Flags,_Index>::ReverseInnerIterator
+{
+  public:
+    ReverseInnerIterator(const MappedSparseMatrix& mat, Index outer)
+      : m_matrix(mat),
+        m_outer(outer),
+        m_id(mat.outerIndexPtr()[outer+1]),
+        m_start(mat.outerIndexPtr()[outer]),
+        m_end(m_id)
+    {}
+
+    inline ReverseInnerIterator& operator--() { m_id--; return *this; }
+
+    inline Scalar value() const { return m_matrix.valuePtr()[m_id-1]; }
+    inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id-1]); }
+
+    inline Index index() const { return m_matrix.innerIndexPtr()[m_id-1]; }
+    inline Index row() const { return IsRowMajor ? m_outer : index(); }
+    inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+    inline operator bool() const { return (m_id <= m_end) && (m_id>m_start); }
+
+  protected:
+    const MappedSparseMatrix& m_matrix;
+    const Index m_outer;
+    Index m_id;
+    const Index m_start;
+    const Index m_end;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_MAPPED_SPARSEMATRIX_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseBlock.h b/include/eigen3/Eigen/src/SparseCore/SparseBlock.h
new file mode 100644
index 0000000..0ede034
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseBlock.h
@@ -0,0 +1,499 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_BLOCK_H
+#define EIGEN_SPARSE_BLOCK_H
+
+namespace Eigen { 
+
+template<typename XprType, int BlockRows, int BlockCols>
+class BlockImpl<XprType,BlockRows,BlockCols,true,Sparse>
+  : public SparseMatrixBase<Block<XprType,BlockRows,BlockCols,true> >
+{
+    typedef typename internal::remove_all<typename XprType::Nested>::type _MatrixTypeNested;
+    typedef Block<XprType, BlockRows, BlockCols, true> BlockType;
+public:
+    enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
+protected:
+    enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
+public:
+    EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
+    
+    class InnerIterator: public XprType::InnerIterator
+    {
+        typedef typename BlockImpl::Index Index;
+      public:
+        inline InnerIterator(const BlockType& xpr, Index outer)
+          : XprType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
+        {}
+        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
+        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
+      protected:
+        Index m_outer;
+    };
+    class ReverseInnerIterator: public XprType::ReverseInnerIterator
+    {
+        typedef typename BlockImpl::Index Index;
+      public:
+        inline ReverseInnerIterator(const BlockType& xpr, Index outer)
+          : XprType::ReverseInnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
+        {}
+        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
+        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
+      protected:
+        Index m_outer;
+    };
+
+    inline BlockImpl(const XprType& xpr, int i)
+      : m_matrix(xpr), m_outerStart(i), m_outerSize(OuterSize)
+    {}
+
+    inline BlockImpl(const XprType& xpr, int startRow, int startCol, int blockRows, int blockCols)
+      : m_matrix(xpr), m_outerStart(IsRowMajor ? startRow : startCol), m_outerSize(IsRowMajor ? blockRows : blockCols)
+    {}
+
+    EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+
+  protected:
+
+    typename XprType::Nested m_matrix;
+    Index m_outerStart;
+    const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
+  private:
+    Index nonZeros() const;
+};
+
+
+/***************************************************************************
+* specialisation for SparseMatrix
+***************************************************************************/
+
+template<typename _Scalar, int _Options, typename _Index, int BlockRows, int BlockCols>
+class BlockImpl<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true,Sparse>
+  : public SparseMatrixBase<Block<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true> >
+{
+    typedef SparseMatrix<_Scalar, _Options, _Index> SparseMatrixType;
+    typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _MatrixTypeNested;
+    typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
+    typedef Block<const SparseMatrixType, BlockRows, BlockCols, true> ConstBlockType;
+public:
+    enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
+    EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
+protected:
+    enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
+public:
+    
+    class InnerIterator: public SparseMatrixType::InnerIterator
+    {
+      public:
+        inline InnerIterator(const BlockType& xpr, Index outer)
+          : SparseMatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
+        {}
+        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
+        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
+      protected:
+        Index m_outer;
+    };
+    class ReverseInnerIterator: public SparseMatrixType::ReverseInnerIterator
+    {
+      public:
+        inline ReverseInnerIterator(const BlockType& xpr, Index outer)
+          : SparseMatrixType::ReverseInnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
+        {}
+        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
+        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
+      protected:
+        Index m_outer;
+    };
+
+    inline BlockImpl(const SparseMatrixType& xpr, int i)
+      : m_matrix(xpr), m_outerStart(i), m_outerSize(OuterSize)
+    {}
+
+    inline BlockImpl(const SparseMatrixType& xpr, int startRow, int startCol, int blockRows, int blockCols)
+      : m_matrix(xpr), m_outerStart(IsRowMajor ? startRow : startCol), m_outerSize(IsRowMajor ? blockRows : blockCols)
+    {}
+
+    template<typename OtherDerived>
+    inline BlockType& operator=(const SparseMatrixBase<OtherDerived>& other)
+    {
+      typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _NestedMatrixType;
+      _NestedMatrixType& matrix = const_cast<_NestedMatrixType&>(m_matrix);;
+      // This assignement is slow if this vector set is not empty
+      // and/or it is not at the end of the nonzeros of the underlying matrix.
+
+      // 1 - eval to a temporary to avoid transposition and/or aliasing issues
+      SparseMatrix<Scalar, IsRowMajor ? RowMajor : ColMajor, Index> tmp(other);
+
+      // 2 - let's check whether there is enough allocated memory
+      Index nnz           = tmp.nonZeros();
+      Index start         = m_outerStart==0 ? 0 : matrix.outerIndexPtr()[m_outerStart]; // starting position of the current block
+      Index end           = m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()]; // ending posiiton of the current block
+      Index block_size    = end - start;                                                // available room in the current block
+      Index tail_size     = m_matrix.outerIndexPtr()[m_matrix.outerSize()] - end;
+      
+      Index free_size     = m_matrix.isCompressed()
+                          ? Index(matrix.data().allocatedSize()) + block_size
+                          : block_size;
+
+      if(nnz>free_size) 
+      {
+        // realloc manually to reduce copies
+        typename SparseMatrixType::Storage newdata(m_matrix.data().allocatedSize() - block_size + nnz);
+
+        std::memcpy(&newdata.value(0), &m_matrix.data().value(0), start*sizeof(Scalar));
+        std::memcpy(&newdata.index(0), &m_matrix.data().index(0), start*sizeof(Index));
+
+        std::memcpy(&newdata.value(start), &tmp.data().value(0), nnz*sizeof(Scalar));
+        std::memcpy(&newdata.index(start), &tmp.data().index(0), nnz*sizeof(Index));
+
+        std::memcpy(&newdata.value(start+nnz), &matrix.data().value(end), tail_size*sizeof(Scalar));
+        std::memcpy(&newdata.index(start+nnz), &matrix.data().index(end), tail_size*sizeof(Index));
+        
+        newdata.resize(m_matrix.outerIndexPtr()[m_matrix.outerSize()] - block_size + nnz);
+
+        matrix.data().swap(newdata);
+      }
+      else
+      {
+        // no need to realloc, simply copy the tail at its respective position and insert tmp
+        matrix.data().resize(start + nnz + tail_size);
+
+        std::memmove(&matrix.data().value(start+nnz), &matrix.data().value(end), tail_size*sizeof(Scalar));
+        std::memmove(&matrix.data().index(start+nnz), &matrix.data().index(end), tail_size*sizeof(Index));
+
+        std::memcpy(&matrix.data().value(start), &tmp.data().value(0), nnz*sizeof(Scalar));
+        std::memcpy(&matrix.data().index(start), &tmp.data().index(0), nnz*sizeof(Index));
+      }
+      
+      // update innerNonZeros
+      if(!m_matrix.isCompressed())
+        for(Index j=0; j<m_outerSize.value(); ++j)
+          matrix.innerNonZeroPtr()[m_outerStart+j] = tmp.innerVector(j).nonZeros();
+
+      // update outer index pointers
+      Index p = start;
+      for(Index k=0; k<m_outerSize.value(); ++k)
+      {
+        matrix.outerIndexPtr()[m_outerStart+k] = p;
+        p += tmp.innerVector(k).nonZeros();
+      }
+      std::ptrdiff_t offset = nnz - block_size;
+      for(Index k = m_outerStart + m_outerSize.value(); k<=matrix.outerSize(); ++k)
+      {
+        matrix.outerIndexPtr()[k] += offset;
+      }
+
+      return derived();
+    }
+
+    inline BlockType& operator=(const BlockType& other)
+    {
+      return operator=<BlockType>(other);
+    }
+
+    inline const Scalar* valuePtr() const
+    { return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+    inline Scalar* valuePtr()
+    { return m_matrix.const_cast_derived().valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+
+    inline const Index* innerIndexPtr() const
+    { return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+    inline Index* innerIndexPtr()
+    { return m_matrix.const_cast_derived().innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+
+    inline const Index* outerIndexPtr() const
+    { return m_matrix.outerIndexPtr() + m_outerStart; }
+    inline Index* outerIndexPtr()
+    { return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
+
+    Index nonZeros() const
+    {
+      if(m_matrix.isCompressed())
+        return  std::size_t(m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()])
+              - std::size_t(m_matrix.outerIndexPtr()[m_outerStart]);
+      else if(m_outerSize.value()==0)
+        return 0;
+      else
+        return Map<const Matrix<Index,OuterSize,1> >(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
+    }
+
+    const Scalar& lastCoeff() const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(BlockImpl);
+      eigen_assert(nonZeros()>0);
+      if(m_matrix.isCompressed())
+        return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart+1]-1];
+      else
+        return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart]+m_matrix.innerNonZeroPtr()[m_outerStart]-1];
+    }
+
+    EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+
+  protected:
+
+    typename SparseMatrixType::Nested m_matrix;
+    Index m_outerStart;
+    const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
+
+};
+
+
+template<typename _Scalar, int _Options, typename _Index, int BlockRows, int BlockCols>
+class BlockImpl<const SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true,Sparse>
+  : public SparseMatrixBase<Block<const SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true> >
+{
+    typedef SparseMatrix<_Scalar, _Options, _Index> SparseMatrixType;
+    typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _MatrixTypeNested;
+    typedef Block<const SparseMatrixType, BlockRows, BlockCols, true> BlockType;
+public:
+    enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
+    EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
+protected:
+    enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
+public:
+    
+    class InnerIterator: public SparseMatrixType::InnerIterator
+    {
+      public:
+        inline InnerIterator(const BlockType& xpr, Index outer)
+          : SparseMatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
+        {}
+        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
+        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
+      protected:
+        Index m_outer;
+    };
+    class ReverseInnerIterator: public SparseMatrixType::ReverseInnerIterator
+    {
+      public:
+        inline ReverseInnerIterator(const BlockType& xpr, Index outer)
+          : SparseMatrixType::ReverseInnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
+        {}
+        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
+        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
+      protected:
+        Index m_outer;
+    };
+
+    inline BlockImpl(const SparseMatrixType& xpr, int i)
+      : m_matrix(xpr), m_outerStart(i), m_outerSize(OuterSize)
+    {}
+
+    inline BlockImpl(const SparseMatrixType& xpr, int startRow, int startCol, int blockRows, int blockCols)
+      : m_matrix(xpr), m_outerStart(IsRowMajor ? startRow : startCol), m_outerSize(IsRowMajor ? blockRows : blockCols)
+    {}
+
+    inline const Scalar* valuePtr() const
+    { return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+
+    inline const Index* innerIndexPtr() const
+    { return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+
+    inline const Index* outerIndexPtr() const
+    { return m_matrix.outerIndexPtr() + m_outerStart; }
+
+    Index nonZeros() const
+    {
+      if(m_matrix.isCompressed())
+        return  std::size_t(m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()])
+              - std::size_t(m_matrix.outerIndexPtr()[m_outerStart]);
+      else if(m_outerSize.value()==0)
+        return 0;
+      else
+        return Map<const Matrix<Index,OuterSize,1> >(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
+    }
+
+    const Scalar& lastCoeff() const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(BlockImpl);
+      eigen_assert(nonZeros()>0);
+      if(m_matrix.isCompressed())
+        return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart+1]-1];
+      else
+        return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart]+m_matrix.innerNonZeroPtr()[m_outerStart]-1];
+    }
+
+    EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+
+  protected:
+
+    typename SparseMatrixType::Nested m_matrix;
+    Index m_outerStart;
+    const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
+
+};
+
+//----------
+
+/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+  * is col-major (resp. row-major).
+  */
+template<typename Derived>
+typename SparseMatrixBase<Derived>::InnerVectorReturnType SparseMatrixBase<Derived>::innerVector(Index outer)
+{ return InnerVectorReturnType(derived(), outer); }
+
+/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+  * is col-major (resp. row-major). Read-only.
+  */
+template<typename Derived>
+const typename SparseMatrixBase<Derived>::ConstInnerVectorReturnType SparseMatrixBase<Derived>::innerVector(Index outer) const
+{ return ConstInnerVectorReturnType(derived(), outer); }
+
+/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+  * is col-major (resp. row-major).
+  */
+template<typename Derived>
+Block<Derived,Dynamic,Dynamic,true> SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize)
+{
+  return Block<Derived,Dynamic,Dynamic,true>(derived(),
+                                             IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+                                             IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+  
+}
+
+/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+  * is col-major (resp. row-major). Read-only.
+  */
+template<typename Derived>
+const Block<const Derived,Dynamic,Dynamic,true> SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
+{
+  return Block<const Derived,Dynamic,Dynamic,true>(derived(),
+                                                  IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+                                                  IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+  
+}
+
+/** Generic implementation of sparse Block expression.
+  * Real-only. 
+  */
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+class BlockImpl<XprType,BlockRows,BlockCols,InnerPanel,Sparse>
+  : public SparseMatrixBase<Block<XprType,BlockRows,BlockCols,InnerPanel> >, internal::no_assignment_operator
+{
+  typedef typename internal::remove_all<typename XprType::Nested>::type _MatrixTypeNested;
+  typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+public:
+    enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
+    EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
+
+    /** Column or Row constructor
+      */
+    inline BlockImpl(const XprType& xpr, int i)
+      : m_matrix(xpr),
+        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0),
+        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0),
+        m_blockRows(xpr.rows()),
+        m_blockCols(xpr.cols())
+    {}
+
+    /** Dynamic-size constructor
+      */
+    inline BlockImpl(const XprType& xpr, int startRow, int startCol, int blockRows, int blockCols)
+      : m_matrix(xpr), m_startRow(startRow), m_startCol(startCol), m_blockRows(blockRows), m_blockCols(blockCols)
+    {}
+
+    inline int rows() const { return m_blockRows.value(); }
+    inline int cols() const { return m_blockCols.value(); }
+
+    inline Scalar& coeffRef(int row, int col)
+    {
+      return m_matrix.const_cast_derived()
+               .coeffRef(row + m_startRow.value(), col + m_startCol.value());
+    }
+
+    inline const Scalar coeff(int row, int col) const
+    {
+      return m_matrix.coeff(row + m_startRow.value(), col + m_startCol.value());
+    }
+
+    inline Scalar& coeffRef(int index)
+    {
+      return m_matrix.const_cast_derived()
+             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+    }
+
+    inline const Scalar coeff(int index) const
+    {
+      return m_matrix
+             .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                    m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+    }
+    
+    inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; }
+    
+    class InnerIterator : public _MatrixTypeNested::InnerIterator
+    {
+      typedef typename _MatrixTypeNested::InnerIterator Base;
+      const BlockType& m_block;
+      Index m_end;
+    public:
+
+      EIGEN_STRONG_INLINE InnerIterator(const BlockType& block, Index outer)
+        : Base(block.derived().nestedExpression(), outer + (IsRowMajor ? block.m_startRow.value() : block.m_startCol.value())),
+          m_block(block),
+          m_end(IsRowMajor ? block.m_startCol.value()+block.m_blockCols.value() : block.m_startRow.value()+block.m_blockRows.value())
+      {
+        while( (Base::operator bool()) && (Base::index() < (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value())) )
+          Base::operator++();
+      }
+
+      inline Index index()  const { return Base::index() - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
+      inline Index outer()  const { return Base::outer() - (IsRowMajor ? m_block.m_startRow.value() : m_block.m_startCol.value()); }
+      inline Index row()    const { return Base::row()   - m_block.m_startRow.value(); }
+      inline Index col()    const { return Base::col()   - m_block.m_startCol.value(); }
+      
+      inline operator bool() const { return Base::operator bool() && Base::index() < m_end; }
+    };
+    class ReverseInnerIterator : public _MatrixTypeNested::ReverseInnerIterator
+    {
+      typedef typename _MatrixTypeNested::ReverseInnerIterator Base;
+      const BlockType& m_block;
+      Index m_begin;
+    public:
+
+      EIGEN_STRONG_INLINE ReverseInnerIterator(const BlockType& block, Index outer)
+        : Base(block.derived().nestedExpression(), outer + (IsRowMajor ? block.m_startRow.value() : block.m_startCol.value())),
+          m_block(block),
+          m_begin(IsRowMajor ? block.m_startCol.value() : block.m_startRow.value())
+      {
+        while( (Base::operator bool()) && (Base::index() >= (IsRowMajor ? m_block.m_startCol.value()+block.m_blockCols.value() : m_block.m_startRow.value()+block.m_blockRows.value())) )
+          Base::operator--();
+      }
+
+      inline Index index()  const { return Base::index() - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
+      inline Index outer()  const { return Base::outer() - (IsRowMajor ? m_block.m_startRow.value() : m_block.m_startCol.value()); }
+      inline Index row()    const { return Base::row()   - m_block.m_startRow.value(); }
+      inline Index col()    const { return Base::col()   - m_block.m_startCol.value(); }
+      
+      inline operator bool() const { return Base::operator bool() && Base::index() >= m_begin; }
+    };
+  protected:
+    friend class InnerIterator;
+    friend class ReverseInnerIterator;
+    
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
+
+    typename XprType::Nested m_matrix;
+    const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
+    const internal::variable_if_dynamic<Index, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
+    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
+    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_blockCols;
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_BLOCK_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseColEtree.h b/include/eigen3/Eigen/src/SparseCore/SparseColEtree.h
new file mode 100644
index 0000000..f8745f4
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseColEtree.h
@@ -0,0 +1,206 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+/* 
+ 
+ * NOTE: This file is the modified version of sp_coletree.c file in SuperLU 
+ 
+ * -- SuperLU routine (version 3.1) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * August 1, 2008
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ */
+#ifndef SPARSE_COLETREE_H
+#define SPARSE_COLETREE_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** Find the root of the tree/set containing the vertex i : Use Path halving */ 
+template<typename Index, typename IndexVector>
+Index etree_find (Index i, IndexVector& pp)
+{
+  Index p = pp(i); // Parent 
+  Index gp = pp(p); // Grand parent 
+  while (gp != p) 
+  {
+    pp(i) = gp; // Parent pointer on find path is changed to former grand parent
+    i = gp; 
+    p = pp(i);
+    gp = pp(p);
+  }
+  return p; 
+}
+
+/** Compute the column elimination tree of a sparse matrix
+  * \param mat The matrix in column-major format. 
+  * \param parent The elimination tree
+  * \param firstRowElt The column index of the first element in each row
+  * \param perm The permutation to apply to the column of \b mat
+  */
+template <typename MatrixType, typename IndexVector>
+int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowElt, typename MatrixType::Index *perm=0)
+{
+  typedef typename MatrixType::Index Index;
+  Index nc = mat.cols(); // Number of columns 
+  Index m = mat.rows();
+  Index diagSize = (std::min)(nc,m);
+  IndexVector root(nc); // root of subtree of etree 
+  root.setZero();
+  IndexVector pp(nc); // disjoint sets 
+  pp.setZero(); // Initialize disjoint sets 
+  parent.resize(mat.cols());
+  //Compute first nonzero column in each row 
+  Index row,col; 
+  firstRowElt.resize(m);
+  firstRowElt.setConstant(nc);
+  firstRowElt.segment(0, diagSize).setLinSpaced(diagSize, 0, diagSize-1);
+  bool found_diag;
+  for (col = 0; col < nc; col++)
+  {
+    Index pcol = col;
+    if(perm) pcol  = perm[col];
+    for (typename MatrixType::InnerIterator it(mat, pcol); it; ++it)
+    { 
+      row = it.row();
+      firstRowElt(row) = (std::min)(firstRowElt(row), col);
+    }
+  }
+  /* Compute etree by Liu's algorithm for symmetric matrices,
+          except use (firstRowElt[r],c) in place of an edge (r,c) of A.
+    Thus each row clique in A'*A is replaced by a star
+    centered at its first vertex, which has the same fill. */
+  Index rset, cset, rroot; 
+  for (col = 0; col < nc; col++) 
+  {
+    found_diag = col>=m;
+    pp(col) = col; 
+    cset = col; 
+    root(cset) = col; 
+    parent(col) = nc; 
+    /* The diagonal element is treated here even if it does not exist in the matrix
+     * hence the loop is executed once more */ 
+    Index pcol = col;
+    if(perm) pcol  = perm[col];
+    for (typename MatrixType::InnerIterator it(mat, pcol); it||!found_diag; ++it)
+    { //  A sequence of interleaved find and union is performed 
+      Index i = col;
+      if(it) i = it.index();
+      if (i == col) found_diag = true;
+      
+      row = firstRowElt(i);
+      if (row >= col) continue; 
+      rset = internal::etree_find(row, pp); // Find the name of the set containing row
+      rroot = root(rset);
+      if (rroot != col) 
+      {
+        parent(rroot) = col; 
+        pp(cset) = rset; 
+        cset = rset; 
+        root(cset) = col; 
+      }
+    }
+  }
+  return 0;  
+}
+
+/** 
+  * Depth-first search from vertex n.  No recursion.
+  * This routine was contributed by Cédric Doucet, CEDRAT Group, Meylan, France.
+*/
+template <typename Index, typename IndexVector>
+void nr_etdfs (Index n, IndexVector& parent, IndexVector& first_kid, IndexVector& next_kid, IndexVector& post, Index postnum)
+{
+  Index current = n, first, next;
+  while (postnum != n) 
+  {
+    // No kid for the current node
+    first = first_kid(current);
+    
+    // no kid for the current node
+    if (first == -1) 
+    {
+      // Numbering this node because it has no kid 
+      post(current) = postnum++;
+      
+      // looking for the next kid 
+      next = next_kid(current); 
+      while (next == -1) 
+      {
+        // No more kids : back to the parent node
+        current = parent(current); 
+        // numbering the parent node 
+        post(current) = postnum++;
+        
+        // Get the next kid 
+        next = next_kid(current); 
+      }
+      // stopping criterion 
+      if (postnum == n+1) return; 
+      
+      // Updating current node 
+      current = next; 
+    }
+    else 
+    {
+      current = first; 
+    }
+  }
+}
+
+
+/**
+  * \brief Post order a tree 
+  * \param n the number of nodes
+  * \param parent Input tree
+  * \param post postordered tree
+  */
+template <typename Index, typename IndexVector>
+void treePostorder(Index n, IndexVector& parent, IndexVector& post)
+{
+  IndexVector first_kid, next_kid; // Linked list of children 
+  Index postnum; 
+  // Allocate storage for working arrays and results 
+  first_kid.resize(n+1); 
+  next_kid.setZero(n+1);
+  post.setZero(n+1);
+  
+  // Set up structure describing children
+  Index v, dad; 
+  first_kid.setConstant(-1); 
+  for (v = n-1; v >= 0; v--) 
+  {
+    dad = parent(v);
+    next_kid(v) = first_kid(dad); 
+    first_kid(dad) = v; 
+  }
+  
+  // Depth-first search from dummy root vertex #n
+  postnum = 0; 
+  internal::nr_etdfs(n, parent, first_kid, next_kid, post, postnum);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // SPARSE_COLETREE_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/include/eigen3/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
new file mode 100644
index 0000000..60ca769
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -0,0 +1,325 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_CWISE_BINARY_OP_H
+#define EIGEN_SPARSE_CWISE_BINARY_OP_H
+
+namespace Eigen { 
+
+// Here we have to handle 3 cases:
+//  1 - sparse op dense
+//  2 - dense op sparse
+//  3 - sparse op sparse
+// We also need to implement a 4th iterator for:
+//  4 - dense op dense
+// Finally, we also need to distinguish between the product and other operations :
+//                configuration      returned mode
+//  1 - sparse op dense    product      sparse
+//                         generic      dense
+//  2 - dense op sparse    product      sparse
+//                         generic      dense
+//  3 - sparse op sparse   product      sparse
+//                         generic      sparse
+//  4 - dense op dense     product      dense
+//                         generic      dense
+
+namespace internal {
+
+template<> struct promote_storage_type<Dense,Sparse>
+{ typedef Sparse ret; };
+
+template<> struct promote_storage_type<Sparse,Dense>
+{ typedef Sparse ret; };
+
+template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived,
+  typename _LhsStorageMode = typename traits<Lhs>::StorageKind,
+  typename _RhsStorageMode = typename traits<Rhs>::StorageKind>
+class sparse_cwise_binary_op_inner_iterator_selector;
+
+} // end namespace internal
+
+template<typename BinaryOp, typename Lhs, typename Rhs>
+class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Sparse>
+  : public SparseMatrixBase<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+  public:
+    class InnerIterator;
+    class ReverseInnerIterator;
+    typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> Derived;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
+    CwiseBinaryOpImpl()
+    {
+      typedef typename internal::traits<Lhs>::StorageKind LhsStorageKind;
+      typedef typename internal::traits<Rhs>::StorageKind RhsStorageKind;
+      EIGEN_STATIC_ASSERT((
+                (!internal::is_same<LhsStorageKind,RhsStorageKind>::value)
+            ||  ((Lhs::Flags&RowMajorBit) == (Rhs::Flags&RowMajorBit))),
+            THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH);
+    }
+};
+
+template<typename BinaryOp, typename Lhs, typename Rhs>
+class CwiseBinaryOpImpl<BinaryOp,Lhs,Rhs,Sparse>::InnerIterator
+  : public internal::sparse_cwise_binary_op_inner_iterator_selector<BinaryOp,Lhs,Rhs,typename CwiseBinaryOpImpl<BinaryOp,Lhs,Rhs,Sparse>::InnerIterator>
+{
+  public:
+    typedef typename Lhs::Index Index;
+    typedef internal::sparse_cwise_binary_op_inner_iterator_selector<
+      BinaryOp,Lhs,Rhs, InnerIterator> Base;
+
+    // NOTE: we have to prefix Index by "typename Lhs::" to avoid an ICE with VC11
+    EIGEN_STRONG_INLINE InnerIterator(const CwiseBinaryOpImpl& binOp, typename Lhs::Index outer)
+      : Base(binOp.derived(),outer)
+    {}
+};
+
+/***************************************************************************
+* Implementation of inner-iterators
+***************************************************************************/
+
+// template<typename T> struct internal::func_is_conjunction { enum { ret = false }; };
+// template<typename T> struct internal::func_is_conjunction<internal::scalar_product_op<T> > { enum { ret = true }; };
+
+// TODO generalize the internal::scalar_product_op specialization to all conjunctions if any !
+
+namespace internal {
+
+// sparse - sparse  (generic)
+template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived>
+class sparse_cwise_binary_op_inner_iterator_selector<BinaryOp, Lhs, Rhs, Derived, Sparse, Sparse>
+{
+    typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> CwiseBinaryXpr;
+    typedef typename traits<CwiseBinaryXpr>::Scalar Scalar;
+    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
+    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
+    typedef typename _LhsNested::InnerIterator LhsIterator;
+    typedef typename _RhsNested::InnerIterator RhsIterator;
+    typedef typename Lhs::Index Index;
+
+  public:
+
+    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
+      : m_lhsIter(xpr.lhs(),outer), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor())
+    {
+      this->operator++();
+    }
+
+    EIGEN_STRONG_INLINE Derived& operator++()
+    {
+      if (m_lhsIter && m_rhsIter && (m_lhsIter.index() == m_rhsIter.index()))
+      {
+        m_id = m_lhsIter.index();
+        m_value = m_functor(m_lhsIter.value(), m_rhsIter.value());
+        ++m_lhsIter;
+        ++m_rhsIter;
+      }
+      else if (m_lhsIter && (!m_rhsIter || (m_lhsIter.index() < m_rhsIter.index())))
+      {
+        m_id = m_lhsIter.index();
+        m_value = m_functor(m_lhsIter.value(), Scalar(0));
+        ++m_lhsIter;
+      }
+      else if (m_rhsIter && (!m_lhsIter || (m_lhsIter.index() > m_rhsIter.index())))
+      {
+        m_id = m_rhsIter.index();
+        m_value = m_functor(Scalar(0), m_rhsIter.value());
+        ++m_rhsIter;
+      }
+      else
+      {
+        m_value = 0; // this is to avoid a compilation warning
+        m_id = -1;
+      }
+      return *static_cast<Derived*>(this);
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const { return m_value; }
+
+    EIGEN_STRONG_INLINE Index index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index row() const { return Lhs::IsRowMajor ? m_lhsIter.row() : index(); }
+    EIGEN_STRONG_INLINE Index col() const { return Lhs::IsRowMajor ? index() : m_lhsIter.col(); }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_id>=0; }
+
+  protected:
+    LhsIterator m_lhsIter;
+    RhsIterator m_rhsIter;
+    const BinaryOp& m_functor;
+    Scalar m_value;
+    Index m_id;
+};
+
+// sparse - sparse  (product)
+template<typename T, typename Lhs, typename Rhs, typename Derived>
+class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Sparse, Sparse>
+{
+    typedef scalar_product_op<T> BinaryFunc;
+    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
+    typedef typename CwiseBinaryXpr::Scalar Scalar;
+    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
+    typedef typename _LhsNested::InnerIterator LhsIterator;
+    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
+    typedef typename _RhsNested::InnerIterator RhsIterator;
+    typedef typename Lhs::Index Index;
+  public:
+
+    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
+      : m_lhsIter(xpr.lhs(),outer), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor())
+    {
+      while (m_lhsIter && m_rhsIter && (m_lhsIter.index() != m_rhsIter.index()))
+      {
+        if (m_lhsIter.index() < m_rhsIter.index())
+          ++m_lhsIter;
+        else
+          ++m_rhsIter;
+      }
+    }
+
+    EIGEN_STRONG_INLINE Derived& operator++()
+    {
+      ++m_lhsIter;
+      ++m_rhsIter;
+      while (m_lhsIter && m_rhsIter && (m_lhsIter.index() != m_rhsIter.index()))
+      {
+        if (m_lhsIter.index() < m_rhsIter.index())
+          ++m_lhsIter;
+        else
+          ++m_rhsIter;
+      }
+      return *static_cast<Derived*>(this);
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const { return m_functor(m_lhsIter.value(), m_rhsIter.value()); }
+
+    EIGEN_STRONG_INLINE Index index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
+    EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
+
+    EIGEN_STRONG_INLINE operator bool() const { return (m_lhsIter && m_rhsIter); }
+
+  protected:
+    LhsIterator m_lhsIter;
+    RhsIterator m_rhsIter;
+    const BinaryFunc& m_functor;
+};
+
+// sparse - dense  (product)
+template<typename T, typename Lhs, typename Rhs, typename Derived>
+class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Sparse, Dense>
+{
+    typedef scalar_product_op<T> BinaryFunc;
+    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
+    typedef typename CwiseBinaryXpr::Scalar Scalar;
+    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
+    typedef typename traits<CwiseBinaryXpr>::RhsNested RhsNested;
+    typedef typename _LhsNested::InnerIterator LhsIterator;
+    typedef typename Lhs::Index Index;
+    enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit };
+  public:
+
+    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
+      : m_rhs(xpr.rhs()), m_lhsIter(xpr.lhs(),outer), m_functor(xpr.functor()), m_outer(outer)
+    {}
+
+    EIGEN_STRONG_INLINE Derived& operator++()
+    {
+      ++m_lhsIter;
+      return *static_cast<Derived*>(this);
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const
+    { return m_functor(m_lhsIter.value(),
+                       m_rhs.coeff(IsRowMajor?m_outer:m_lhsIter.index(),IsRowMajor?m_lhsIter.index():m_outer)); }
+
+    EIGEN_STRONG_INLINE Index index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
+    EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_lhsIter; }
+
+  protected:
+    RhsNested m_rhs;
+    LhsIterator m_lhsIter;
+    const BinaryFunc m_functor;
+    const Index m_outer;
+};
+
+// sparse - dense  (product)
+template<typename T, typename Lhs, typename Rhs, typename Derived>
+class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Dense, Sparse>
+{
+    typedef scalar_product_op<T> BinaryFunc;
+    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
+    typedef typename CwiseBinaryXpr::Scalar Scalar;
+    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
+    typedef typename _RhsNested::InnerIterator RhsIterator;
+    typedef typename Lhs::Index Index;
+
+    enum { IsRowMajor = (int(Rhs::Flags)&RowMajorBit)==RowMajorBit };
+  public:
+
+    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
+      : m_xpr(xpr), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor()), m_outer(outer)
+    {}
+
+    EIGEN_STRONG_INLINE Derived& operator++()
+    {
+      ++m_rhsIter;
+      return *static_cast<Derived*>(this);
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const
+    { return m_functor(m_xpr.lhs().coeff(IsRowMajor?m_outer:m_rhsIter.index(),IsRowMajor?m_rhsIter.index():m_outer), m_rhsIter.value()); }
+
+    EIGEN_STRONG_INLINE Index index() const { return m_rhsIter.index(); }
+    EIGEN_STRONG_INLINE Index row() const { return m_rhsIter.row(); }
+    EIGEN_STRONG_INLINE Index col() const { return m_rhsIter.col(); }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_rhsIter; }
+
+  protected:
+    const CwiseBinaryXpr& m_xpr;
+    RhsIterator m_rhsIter;
+    const BinaryFunc& m_functor;
+    const Index m_outer;
+};
+
+} // end namespace internal
+
+/***************************************************************************
+* Implementation of SparseMatrixBase and SparseCwise functions/operators
+***************************************************************************/
+
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE Derived &
+SparseMatrixBase<Derived>::operator-=(const SparseMatrixBase<OtherDerived> &other)
+{
+  return derived() = derived() - other.derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE Derived &
+SparseMatrixBase<Derived>::operator+=(const SparseMatrixBase<OtherDerived>& other)
+{
+  return derived() = derived() + other.derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE const EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE
+SparseMatrixBase<Derived>::cwiseProduct(const MatrixBase<OtherDerived> &other) const
+{
+  return EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE(derived(), other.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_CWISE_BINARY_OP_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/include/eigen3/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
new file mode 100644
index 0000000..5a50c78
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
@@ -0,0 +1,163 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_CWISE_UNARY_OP_H
+#define EIGEN_SPARSE_CWISE_UNARY_OP_H
+
+namespace Eigen { 
+
+template<typename UnaryOp, typename MatrixType>
+class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>
+  : public SparseMatrixBase<CwiseUnaryOp<UnaryOp, MatrixType> >
+{
+  public:
+
+    class InnerIterator;
+    class ReverseInnerIterator;
+
+    typedef CwiseUnaryOp<UnaryOp, MatrixType> Derived;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
+
+  protected:
+    typedef typename internal::traits<Derived>::_XprTypeNested _MatrixTypeNested;
+    typedef typename _MatrixTypeNested::InnerIterator MatrixTypeIterator;
+    typedef typename _MatrixTypeNested::ReverseInnerIterator MatrixTypeReverseIterator;
+};
+
+template<typename UnaryOp, typename MatrixType>
+class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::InnerIterator
+    : public CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeIterator
+{
+    typedef typename CwiseUnaryOpImpl::Scalar Scalar;
+    typedef typename CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeIterator Base;
+  public:
+
+    EIGEN_STRONG_INLINE InnerIterator(const CwiseUnaryOpImpl& unaryOp, typename CwiseUnaryOpImpl::Index outer)
+      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
+    {}
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
+    { Base::operator++(); return *this; }
+
+    EIGEN_STRONG_INLINE typename CwiseUnaryOpImpl::Scalar value() const { return m_functor(Base::value()); }
+
+  protected:
+    const UnaryOp m_functor;
+  private:
+    typename CwiseUnaryOpImpl::Scalar& valueRef();
+};
+
+template<typename UnaryOp, typename MatrixType>
+class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::ReverseInnerIterator
+    : public CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeReverseIterator
+{
+    typedef typename CwiseUnaryOpImpl::Scalar Scalar;
+    typedef typename CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeReverseIterator Base;
+  public:
+
+    EIGEN_STRONG_INLINE ReverseInnerIterator(const CwiseUnaryOpImpl& unaryOp, typename CwiseUnaryOpImpl::Index outer)
+      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
+    {}
+
+    EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
+    { Base::operator--(); return *this; }
+
+    EIGEN_STRONG_INLINE typename CwiseUnaryOpImpl::Scalar value() const { return m_functor(Base::value()); }
+
+  protected:
+    const UnaryOp m_functor;
+  private:
+    typename CwiseUnaryOpImpl::Scalar& valueRef();
+};
+
+template<typename ViewOp, typename MatrixType>
+class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>
+  : public SparseMatrixBase<CwiseUnaryView<ViewOp, MatrixType> >
+{
+  public:
+
+    class InnerIterator;
+    class ReverseInnerIterator;
+
+    typedef CwiseUnaryView<ViewOp, MatrixType> Derived;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
+
+  protected:
+    typedef typename internal::traits<Derived>::_MatrixTypeNested _MatrixTypeNested;
+    typedef typename _MatrixTypeNested::InnerIterator MatrixTypeIterator;
+    typedef typename _MatrixTypeNested::ReverseInnerIterator MatrixTypeReverseIterator;
+};
+
+template<typename ViewOp, typename MatrixType>
+class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::InnerIterator
+    : public CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeIterator
+{
+    typedef typename CwiseUnaryViewImpl::Scalar Scalar;
+    typedef typename CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeIterator Base;
+  public:
+
+    EIGEN_STRONG_INLINE InnerIterator(const CwiseUnaryViewImpl& unaryOp, typename CwiseUnaryViewImpl::Index outer)
+      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
+    {}
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
+    { Base::operator++(); return *this; }
+
+    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar value() const { return m_functor(Base::value()); }
+    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar& valueRef() { return m_functor(Base::valueRef()); }
+
+  protected:
+    const ViewOp m_functor;
+};
+
+template<typename ViewOp, typename MatrixType>
+class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::ReverseInnerIterator
+    : public CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeReverseIterator
+{
+    typedef typename CwiseUnaryViewImpl::Scalar Scalar;
+    typedef typename CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeReverseIterator Base;
+  public:
+
+    EIGEN_STRONG_INLINE ReverseInnerIterator(const CwiseUnaryViewImpl& unaryOp, typename CwiseUnaryViewImpl::Index outer)
+      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
+    {}
+
+    EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
+    { Base::operator--(); return *this; }
+
+    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar value() const { return m_functor(Base::value()); }
+    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar& valueRef() { return m_functor(Base::valueRef()); }
+
+  protected:
+    const ViewOp m_functor;
+};
+
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+SparseMatrixBase<Derived>::operator*=(const Scalar& other)
+{
+  for (Index j=0; j<outerSize(); ++j)
+    for (typename Derived::InnerIterator i(derived(),j); i; ++i)
+      i.valueRef() *= other;
+  return derived();
+}
+
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+SparseMatrixBase<Derived>::operator/=(const Scalar& other)
+{
+  for (Index j=0; j<outerSize(); ++j)
+    for (typename Derived::InnerIterator i(derived(),j); i; ++i)
+      i.valueRef() /= other;
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_CWISE_UNARY_OP_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseDenseProduct.h b/include/eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
new file mode 100644
index 0000000..a908419
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -0,0 +1,311 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEDENSEPRODUCT_H
+#define EIGEN_SPARSEDENSEPRODUCT_H
+
+namespace Eigen { 
+
+template<typename Lhs, typename Rhs, int InnerSize> struct SparseDenseProductReturnType
+{
+  typedef SparseTimeDenseProduct<Lhs,Rhs> Type;
+};
+
+template<typename Lhs, typename Rhs> struct SparseDenseProductReturnType<Lhs,Rhs,1>
+{
+  typedef typename internal::conditional<
+    Lhs::IsRowMajor,
+    SparseDenseOuterProduct<Rhs,Lhs,true>,
+    SparseDenseOuterProduct<Lhs,Rhs,false> >::type Type;
+};
+
+template<typename Lhs, typename Rhs, int InnerSize> struct DenseSparseProductReturnType
+{
+  typedef DenseTimeSparseProduct<Lhs,Rhs> Type;
+};
+
+template<typename Lhs, typename Rhs> struct DenseSparseProductReturnType<Lhs,Rhs,1>
+{
+  typedef typename internal::conditional<
+    Rhs::IsRowMajor,
+    SparseDenseOuterProduct<Rhs,Lhs,true>,
+    SparseDenseOuterProduct<Lhs,Rhs,false> >::type Type;
+};
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, bool Tr>
+struct traits<SparseDenseOuterProduct<Lhs,Rhs,Tr> >
+{
+  typedef Sparse StorageKind;
+  typedef typename scalar_product_traits<typename traits<Lhs>::Scalar,
+                                         typename traits<Rhs>::Scalar>::ReturnType Scalar;
+  typedef typename Lhs::Index Index;
+  typedef typename Lhs::Nested LhsNested;
+  typedef typename Rhs::Nested RhsNested;
+  typedef typename remove_all<LhsNested>::type _LhsNested;
+  typedef typename remove_all<RhsNested>::type _RhsNested;
+
+  enum {
+    LhsCoeffReadCost = traits<_LhsNested>::CoeffReadCost,
+    RhsCoeffReadCost = traits<_RhsNested>::CoeffReadCost,
+
+    RowsAtCompileTime    = Tr ? int(traits<Rhs>::RowsAtCompileTime)     : int(traits<Lhs>::RowsAtCompileTime),
+    ColsAtCompileTime    = Tr ? int(traits<Lhs>::ColsAtCompileTime)     : int(traits<Rhs>::ColsAtCompileTime),
+    MaxRowsAtCompileTime = Tr ? int(traits<Rhs>::MaxRowsAtCompileTime)  : int(traits<Lhs>::MaxRowsAtCompileTime),
+    MaxColsAtCompileTime = Tr ? int(traits<Lhs>::MaxColsAtCompileTime)  : int(traits<Rhs>::MaxColsAtCompileTime),
+
+    Flags = Tr ? RowMajorBit : 0,
+
+    CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + NumTraits<Scalar>::MulCost
+  };
+};
+
+} // end namespace internal
+
+template<typename Lhs, typename Rhs, bool Tr>
+class SparseDenseOuterProduct
+ : public SparseMatrixBase<SparseDenseOuterProduct<Lhs,Rhs,Tr> >
+{
+  public:
+
+    typedef SparseMatrixBase<SparseDenseOuterProduct> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(SparseDenseOuterProduct)
+    typedef internal::traits<SparseDenseOuterProduct> Traits;
+
+  private:
+
+    typedef typename Traits::LhsNested LhsNested;
+    typedef typename Traits::RhsNested RhsNested;
+    typedef typename Traits::_LhsNested _LhsNested;
+    typedef typename Traits::_RhsNested _RhsNested;
+
+  public:
+
+    class InnerIterator;
+
+    EIGEN_STRONG_INLINE SparseDenseOuterProduct(const Lhs& lhs, const Rhs& rhs)
+      : m_lhs(lhs), m_rhs(rhs)
+    {
+      EIGEN_STATIC_ASSERT(!Tr,YOU_MADE_A_PROGRAMMING_MISTAKE);
+    }
+
+    EIGEN_STRONG_INLINE SparseDenseOuterProduct(const Rhs& rhs, const Lhs& lhs)
+      : m_lhs(lhs), m_rhs(rhs)
+    {
+      EIGEN_STATIC_ASSERT(Tr,YOU_MADE_A_PROGRAMMING_MISTAKE);
+    }
+
+    EIGEN_STRONG_INLINE Index rows() const { return Tr ? m_rhs.rows() : m_lhs.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return Tr ? m_lhs.cols() : m_rhs.cols(); }
+
+    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
+    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
+
+  protected:
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+};
+
+template<typename Lhs, typename Rhs, bool Transpose>
+class SparseDenseOuterProduct<Lhs,Rhs,Transpose>::InnerIterator : public _LhsNested::InnerIterator
+{
+    typedef typename _LhsNested::InnerIterator Base;
+    typedef typename SparseDenseOuterProduct::Index Index;
+  public:
+    EIGEN_STRONG_INLINE InnerIterator(const SparseDenseOuterProduct& prod, Index outer)
+      : Base(prod.lhs(), 0), m_outer(outer), m_factor(get(prod.rhs(), outer, typename internal::traits<Rhs>::StorageKind() ))
+    { }
+
+    inline Index outer() const { return m_outer; }
+    inline Index row() const { return Transpose ? m_outer : Base::index(); }
+    inline Index col() const { return Transpose ? Base::index() : m_outer; }
+
+    inline Scalar value() const { return Base::value() * m_factor; }
+
+  protected:
+    static Scalar get(const _RhsNested &rhs, Index outer, Dense = Dense())
+    {
+      return rhs.coeff(outer);
+    }
+    
+    static Scalar get(const _RhsNested &rhs, Index outer, Sparse = Sparse())
+    {
+      typename Traits::_RhsNested::InnerIterator it(rhs, outer);
+      if (it && it.index()==0)
+        return it.value();
+      
+      return Scalar(0);
+    }
+    
+    Index m_outer;
+    Scalar m_factor;
+};
+
+namespace internal {
+template<typename Lhs, typename Rhs>
+struct traits<SparseTimeDenseProduct<Lhs,Rhs> >
+ : traits<ProductBase<SparseTimeDenseProduct<Lhs,Rhs>, Lhs, Rhs> >
+{
+  typedef Dense StorageKind;
+  typedef MatrixXpr XprKind;
+};
+
+template<typename SparseLhsType, typename DenseRhsType, typename DenseResType,
+         int LhsStorageOrder = ((SparseLhsType::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor,
+         bool ColPerCol = ((DenseRhsType::Flags&RowMajorBit)==0) || DenseRhsType::ColsAtCompileTime==1>
+struct sparse_time_dense_product_impl;
+
+template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
+struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, RowMajor, true>
+{
+  typedef typename internal::remove_all<SparseLhsType>::type Lhs;
+  typedef typename internal::remove_all<DenseRhsType>::type Rhs;
+  typedef typename internal::remove_all<DenseResType>::type Res;
+  typedef typename Lhs::Index Index;
+  typedef typename Lhs::InnerIterator LhsInnerIterator;
+  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
+  {
+    for(Index c=0; c<rhs.cols(); ++c)
+    {
+      Index n = lhs.outerSize();
+      for(Index j=0; j<n; ++j)
+      {
+        typename Res::Scalar tmp(0);
+        for(LhsInnerIterator it(lhs,j); it ;++it)
+          tmp += it.value() * rhs.coeff(it.index(),c);
+        res.coeffRef(j,c) = alpha * tmp;
+      }
+    }
+  }
+};
+
+template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
+struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, ColMajor, true>
+{
+  typedef typename internal::remove_all<SparseLhsType>::type Lhs;
+  typedef typename internal::remove_all<DenseRhsType>::type Rhs;
+  typedef typename internal::remove_all<DenseResType>::type Res;
+  typedef typename Lhs::InnerIterator LhsInnerIterator;
+  typedef typename Lhs::Index Index;
+  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
+  {
+    for(Index c=0; c<rhs.cols(); ++c)
+    {
+      for(Index j=0; j<lhs.outerSize(); ++j)
+      {
+        typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
+        for(LhsInnerIterator it(lhs,j); it ;++it)
+          res.coeffRef(it.index(),c) += it.value() * rhs_j;
+      }
+    }
+  }
+};
+
+template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
+struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, RowMajor, false>
+{
+  typedef typename internal::remove_all<SparseLhsType>::type Lhs;
+  typedef typename internal::remove_all<DenseRhsType>::type Rhs;
+  typedef typename internal::remove_all<DenseResType>::type Res;
+  typedef typename Lhs::InnerIterator LhsInnerIterator;
+  typedef typename Lhs::Index Index;
+  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
+  {
+    for(Index j=0; j<lhs.outerSize(); ++j)
+    {
+      typename Res::RowXpr res_j(res.row(j));
+      for(LhsInnerIterator it(lhs,j); it ;++it)
+        res_j += (alpha*it.value()) * rhs.row(it.index());
+    }
+  }
+};
+
+template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
+struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, ColMajor, false>
+{
+  typedef typename internal::remove_all<SparseLhsType>::type Lhs;
+  typedef typename internal::remove_all<DenseRhsType>::type Rhs;
+  typedef typename internal::remove_all<DenseResType>::type Res;
+  typedef typename Lhs::InnerIterator LhsInnerIterator;
+  typedef typename Lhs::Index Index;
+  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
+  {
+    for(Index j=0; j<lhs.outerSize(); ++j)
+    {
+      typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
+      for(LhsInnerIterator it(lhs,j); it ;++it)
+        res.row(it.index()) += (alpha*it.value()) * rhs_j;
+    }
+  }
+};
+
+template<typename SparseLhsType, typename DenseRhsType, typename DenseResType,typename AlphaType>
+inline void sparse_time_dense_product(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
+{
+  sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType>::run(lhs, rhs, res, alpha);
+}
+
+} // end namespace internal
+
+template<typename Lhs, typename Rhs>
+class SparseTimeDenseProduct
+  : public ProductBase<SparseTimeDenseProduct<Lhs,Rhs>, Lhs, Rhs>
+{
+  public:
+    EIGEN_PRODUCT_PUBLIC_INTERFACE(SparseTimeDenseProduct)
+
+    SparseTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+    {}
+
+    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+    {
+      internal::sparse_time_dense_product(m_lhs, m_rhs, dest, alpha);
+    }
+
+  private:
+    SparseTimeDenseProduct& operator=(const SparseTimeDenseProduct&);
+};
+
+
+// dense = dense * sparse
+namespace internal {
+template<typename Lhs, typename Rhs>
+struct traits<DenseTimeSparseProduct<Lhs,Rhs> >
+ : traits<ProductBase<DenseTimeSparseProduct<Lhs,Rhs>, Lhs, Rhs> >
+{
+  typedef Dense StorageKind;
+};
+} // end namespace internal
+
+template<typename Lhs, typename Rhs>
+class DenseTimeSparseProduct
+  : public ProductBase<DenseTimeSparseProduct<Lhs,Rhs>, Lhs, Rhs>
+{
+  public:
+    EIGEN_PRODUCT_PUBLIC_INTERFACE(DenseTimeSparseProduct)
+
+    DenseTimeSparseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+    {}
+
+    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+    {
+      Transpose<const _LhsNested> lhs_t(m_lhs);
+      Transpose<const _RhsNested> rhs_t(m_rhs);
+      Transpose<Dest> dest_t(dest);
+      internal::sparse_time_dense_product(rhs_t, lhs_t, dest_t, alpha);
+    }
+
+  private:
+    DenseTimeSparseProduct& operator=(const DenseTimeSparseProduct&);
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSEDENSEPRODUCT_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseDiagonalProduct.h b/include/eigen3/Eigen/src/SparseCore/SparseDiagonalProduct.h
new file mode 100644
index 0000000..1bb590e
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseDiagonalProduct.h
@@ -0,0 +1,196 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_DIAGONAL_PRODUCT_H
+#define EIGEN_SPARSE_DIAGONAL_PRODUCT_H
+
+namespace Eigen { 
+
+// The product of a diagonal matrix with a sparse matrix can be easily
+// implemented using expression template.
+// We have two consider very different cases:
+// 1 - diag * row-major sparse
+//     => each inner vector <=> scalar * sparse vector product
+//     => so we can reuse CwiseUnaryOp::InnerIterator
+// 2 - diag * col-major sparse
+//     => each inner vector <=> densevector * sparse vector cwise product
+//     => again, we can reuse specialization of CwiseBinaryOp::InnerIterator
+//        for that particular case
+// The two other cases are symmetric.
+
+namespace internal {
+
+template<typename Lhs, typename Rhs>
+struct traits<SparseDiagonalProduct<Lhs, Rhs> >
+{
+  typedef typename remove_all<Lhs>::type _Lhs;
+  typedef typename remove_all<Rhs>::type _Rhs;
+  typedef typename _Lhs::Scalar Scalar;
+  typedef typename promote_index_type<typename traits<Lhs>::Index,
+                                         typename traits<Rhs>::Index>::type Index;
+  typedef Sparse StorageKind;
+  typedef MatrixXpr XprKind;
+  enum {
+    RowsAtCompileTime = _Lhs::RowsAtCompileTime,
+    ColsAtCompileTime = _Rhs::ColsAtCompileTime,
+
+    MaxRowsAtCompileTime = _Lhs::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = _Rhs::MaxColsAtCompileTime,
+
+    SparseFlags = is_diagonal<_Lhs>::ret ? int(_Rhs::Flags) : int(_Lhs::Flags),
+    Flags = (SparseFlags&RowMajorBit),
+    CoeffReadCost = Dynamic
+  };
+};
+
+enum {SDP_IsDiagonal, SDP_IsSparseRowMajor, SDP_IsSparseColMajor};
+template<typename Lhs, typename Rhs, typename SparseDiagonalProductType, int RhsMode, int LhsMode>
+class sparse_diagonal_product_inner_iterator_selector;
+
+} // end namespace internal
+
+template<typename Lhs, typename Rhs>
+class SparseDiagonalProduct
+  : public SparseMatrixBase<SparseDiagonalProduct<Lhs,Rhs> >,
+    internal::no_assignment_operator
+{
+    typedef typename Lhs::Nested LhsNested;
+    typedef typename Rhs::Nested RhsNested;
+
+    typedef typename internal::remove_all<LhsNested>::type _LhsNested;
+    typedef typename internal::remove_all<RhsNested>::type _RhsNested;
+
+    enum {
+      LhsMode = internal::is_diagonal<_LhsNested>::ret ? internal::SDP_IsDiagonal
+              : (_LhsNested::Flags&RowMajorBit) ? internal::SDP_IsSparseRowMajor : internal::SDP_IsSparseColMajor,
+      RhsMode = internal::is_diagonal<_RhsNested>::ret ? internal::SDP_IsDiagonal
+              : (_RhsNested::Flags&RowMajorBit) ? internal::SDP_IsSparseRowMajor : internal::SDP_IsSparseColMajor
+    };
+
+  public:
+
+    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseDiagonalProduct)
+
+    typedef internal::sparse_diagonal_product_inner_iterator_selector
+                      <_LhsNested,_RhsNested,SparseDiagonalProduct,LhsMode,RhsMode> InnerIterator;
+    
+    // We do not want ReverseInnerIterator for diagonal-sparse products,
+    // but this dummy declaration is needed to make diag * sparse * diag compile.
+    class ReverseInnerIterator;
+
+    EIGEN_STRONG_INLINE SparseDiagonalProduct(const Lhs& lhs, const Rhs& rhs)
+      : m_lhs(lhs), m_rhs(rhs)
+    {
+      eigen_assert(lhs.cols() == rhs.rows() && "invalid sparse matrix * diagonal matrix product");
+    }
+
+    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
+
+    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
+    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
+
+  protected:
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+};
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
+class sparse_diagonal_product_inner_iterator_selector
+<Lhs,Rhs,SparseDiagonalProductType,SDP_IsDiagonal,SDP_IsSparseRowMajor>
+  : public CwiseUnaryOp<scalar_multiple_op<typename Lhs::Scalar>,const Rhs>::InnerIterator
+{
+    typedef typename CwiseUnaryOp<scalar_multiple_op<typename Lhs::Scalar>,const Rhs>::InnerIterator Base;
+    typedef typename Lhs::Index Index;
+  public:
+    inline sparse_diagonal_product_inner_iterator_selector(
+              const SparseDiagonalProductType& expr, Index outer)
+      : Base(expr.rhs()*(expr.lhs().diagonal().coeff(outer)), outer)
+    {}
+};
+
+template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
+class sparse_diagonal_product_inner_iterator_selector
+<Lhs,Rhs,SparseDiagonalProductType,SDP_IsDiagonal,SDP_IsSparseColMajor>
+  : public CwiseBinaryOp<
+      scalar_product_op<typename Lhs::Scalar>,
+      const typename Rhs::ConstInnerVectorReturnType,
+      const typename Lhs::DiagonalVectorType>::InnerIterator
+{
+    typedef typename CwiseBinaryOp<
+      scalar_product_op<typename Lhs::Scalar>,
+      const typename Rhs::ConstInnerVectorReturnType,
+      const typename Lhs::DiagonalVectorType>::InnerIterator Base;
+    typedef typename Lhs::Index Index;
+    Index m_outer;
+  public:
+    inline sparse_diagonal_product_inner_iterator_selector(
+              const SparseDiagonalProductType& expr, Index outer)
+      : Base(expr.rhs().innerVector(outer) .cwiseProduct(expr.lhs().diagonal()), 0), m_outer(outer)
+    {}
+    
+    inline Index outer() const { return m_outer; }
+    inline Index col() const { return m_outer; }
+};
+
+template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
+class sparse_diagonal_product_inner_iterator_selector
+<Lhs,Rhs,SparseDiagonalProductType,SDP_IsSparseColMajor,SDP_IsDiagonal>
+  : public CwiseUnaryOp<scalar_multiple_op<typename Rhs::Scalar>,const Lhs>::InnerIterator
+{
+    typedef typename CwiseUnaryOp<scalar_multiple_op<typename Rhs::Scalar>,const Lhs>::InnerIterator Base;
+    typedef typename Lhs::Index Index;
+  public:
+    inline sparse_diagonal_product_inner_iterator_selector(
+              const SparseDiagonalProductType& expr, Index outer)
+      : Base(expr.lhs()*expr.rhs().diagonal().coeff(outer), outer)
+    {}
+};
+
+template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
+class sparse_diagonal_product_inner_iterator_selector
+<Lhs,Rhs,SparseDiagonalProductType,SDP_IsSparseRowMajor,SDP_IsDiagonal>
+  : public CwiseBinaryOp<
+      scalar_product_op<typename Rhs::Scalar>,
+      const typename Lhs::ConstInnerVectorReturnType,
+      const Transpose<const typename Rhs::DiagonalVectorType> >::InnerIterator
+{
+    typedef typename CwiseBinaryOp<
+      scalar_product_op<typename Rhs::Scalar>,
+      const typename Lhs::ConstInnerVectorReturnType,
+      const Transpose<const typename Rhs::DiagonalVectorType> >::InnerIterator Base;
+    typedef typename Lhs::Index Index;
+    Index m_outer;
+  public:
+    inline sparse_diagonal_product_inner_iterator_selector(
+              const SparseDiagonalProductType& expr, Index outer)
+      : Base(expr.lhs().innerVector(outer) .cwiseProduct(expr.rhs().diagonal().transpose()), 0), m_outer(outer)
+    {}
+    
+    inline Index outer() const { return m_outer; }
+    inline Index row() const { return m_outer; }
+};
+
+} // end namespace internal
+
+// SparseMatrixBase functions
+
+template<typename Derived>
+template<typename OtherDerived>
+const SparseDiagonalProduct<Derived,OtherDerived>
+SparseMatrixBase<Derived>::operator*(const DiagonalBase<OtherDerived> &other) const
+{
+  return SparseDiagonalProduct<Derived,OtherDerived>(this->derived(), other.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_DIAGONAL_PRODUCT_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseDot.h b/include/eigen3/Eigen/src/SparseCore/SparseDot.h
new file mode 100644
index 0000000..db39c9a
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseDot.h
@@ -0,0 +1,101 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_DOT_H
+#define EIGEN_SPARSE_DOT_H
+
+namespace Eigen { 
+
+template<typename Derived>
+template<typename OtherDerived>
+typename internal::traits<Derived>::Scalar
+SparseMatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+  eigen_assert(size() == other.size());
+  eigen_assert(other.size()>0 && "you are using a non initialized vector");
+
+  typename Derived::InnerIterator i(derived(),0);
+  Scalar res(0);
+  while (i)
+  {
+    res += numext::conj(i.value()) * other.coeff(i.index());
+    ++i;
+  }
+  return res;
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+typename internal::traits<Derived>::Scalar
+SparseMatrixBase<Derived>::dot(const SparseMatrixBase<OtherDerived>& other) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+  eigen_assert(size() == other.size());
+
+  typedef typename Derived::Nested  Nested;
+  typedef typename OtherDerived::Nested  OtherNested;
+  typedef typename internal::remove_all<Nested>::type  NestedCleaned;
+  typedef typename internal::remove_all<OtherNested>::type  OtherNestedCleaned;
+
+  Nested nthis(derived());
+  OtherNested nother(other.derived());
+
+  typename NestedCleaned::InnerIterator i(nthis,0);
+  typename OtherNestedCleaned::InnerIterator j(nother,0);
+  Scalar res(0);
+  while (i && j)
+  {
+    if (i.index()==j.index())
+    {
+      res += numext::conj(i.value()) * j.value();
+      ++i; ++j;
+    }
+    else if (i.index()<j.index())
+      ++i;
+    else
+      ++j;
+  }
+  return res;
+}
+
+template<typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+SparseMatrixBase<Derived>::squaredNorm() const
+{
+  return numext::real((*this).cwiseAbs2().sum());
+}
+
+template<typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+SparseMatrixBase<Derived>::norm() const
+{
+  using std::sqrt;
+  return sqrt(squaredNorm());
+}
+
+template<typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+SparseMatrixBase<Derived>::blueNorm() const
+{
+  return internal::blueNorm_impl(*this);
+}
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_DOT_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseFuzzy.h b/include/eigen3/Eigen/src/SparseCore/SparseFuzzy.h
new file mode 100644
index 0000000..45f36e9
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseFuzzy.h
@@ -0,0 +1,26 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_FUZZY_H
+#define EIGEN_SPARSE_FUZZY_H
+
+// template<typename Derived>
+// template<typename OtherDerived>
+// bool SparseMatrixBase<Derived>::isApprox(
+//   const OtherDerived& other,
+//   typename NumTraits<Scalar>::Real prec
+// ) const
+// {
+//   const typename internal::nested<Derived,2>::type nested(derived());
+//   const typename internal::nested<OtherDerived,2>::type otherNested(other.derived());
+//   return    (nested - otherNested).cwise().abs2().sum()
+//          <= prec * prec * (std::min)(nested.cwise().abs2().sum(), otherNested.cwise().abs2().sum());
+// }
+
+#endif // EIGEN_SPARSE_FUZZY_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseMatrix.h b/include/eigen3/Eigen/src/SparseCore/SparseMatrix.h
new file mode 100644
index 0000000..ba5e3a9
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseMatrix.h
@@ -0,0 +1,1259 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEMATRIX_H
+#define EIGEN_SPARSEMATRIX_H
+
+namespace Eigen { 
+
+/** \ingroup SparseCore_Module
+  *
+  * \class SparseMatrix
+  *
+  * \brief A versatible sparse matrix representation
+  *
+  * This class implements a more versatile variants of the common \em compressed row/column storage format.
+  * Each colmun's (resp. row) non zeros are stored as a pair of value with associated row (resp. colmiun) index.
+  * All the non zeros are stored in a single large buffer. Unlike the \em compressed format, there might be extra
+  * space inbetween the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero
+  * can be done with limited memory reallocation and copies.
+  *
+  * A call to the function makeCompressed() turns the matrix into the standard \em compressed format
+  * compatible with many library.
+  *
+  * More details on this storage sceheme are given in the \ref TutorialSparse "manual pages".
+  *
+  * \tparam _Scalar the scalar type, i.e. the type of the coefficients
+  * \tparam _Options Union of bit flags controlling the storage scheme. Currently the only possibility
+  *                 is ColMajor or RowMajor. The default is 0 which means column-major.
+  * \tparam _Index the type of the indices. It has to be a \b signed type (e.g., short, int, std::ptrdiff_t). Default is \c int.
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEMATRIX_PLUGIN.
+  */
+
+namespace internal {
+template<typename _Scalar, int _Options, typename _Index>
+struct traits<SparseMatrix<_Scalar, _Options, _Index> >
+{
+  typedef _Scalar Scalar;
+  typedef _Index Index;
+  typedef Sparse StorageKind;
+  typedef MatrixXpr XprKind;
+  enum {
+    RowsAtCompileTime = Dynamic,
+    ColsAtCompileTime = Dynamic,
+    MaxRowsAtCompileTime = Dynamic,
+    MaxColsAtCompileTime = Dynamic,
+    Flags = _Options | NestByRefBit | LvalueBit,
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    SupportedAccessPatterns = InnerRandomAccessPattern
+  };
+};
+
+template<typename _Scalar, int _Options, typename _Index, int DiagIndex>
+struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> >
+{
+  typedef SparseMatrix<_Scalar, _Options, _Index> MatrixType;
+  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
+
+  typedef _Scalar Scalar;
+  typedef Dense StorageKind;
+  typedef _Index Index;
+  typedef MatrixXpr XprKind;
+
+  enum {
+    RowsAtCompileTime = Dynamic,
+    ColsAtCompileTime = 1,
+    MaxRowsAtCompileTime = Dynamic,
+    MaxColsAtCompileTime = 1,
+    Flags = 0,
+    CoeffReadCost = _MatrixTypeNested::CoeffReadCost*10
+  };
+};
+
+} // end namespace internal
+
+template<typename _Scalar, int _Options, typename _Index>
+class SparseMatrix
+  : public SparseMatrixBase<SparseMatrix<_Scalar, _Options, _Index> >
+{
+  public:
+    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix)
+    EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=)
+    EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=)
+
+    typedef MappedSparseMatrix<Scalar,Flags> Map;
+    using Base::IsRowMajor;
+    typedef internal::CompressedStorage<Scalar,Index> Storage;
+    enum {
+      Options = _Options
+    };
+
+  protected:
+
+    typedef SparseMatrix<Scalar,(Flags&~RowMajorBit)|(IsRowMajor?RowMajorBit:0)> TransposedSparseMatrix;
+
+    Index m_outerSize;
+    Index m_innerSize;
+    Index* m_outerIndex;
+    Index* m_innerNonZeros;     // optional, if null then the data is compressed
+    Storage m_data;
+    
+    Eigen::Map<Matrix<Index,Dynamic,1> > innerNonZeros() { return Eigen::Map<Matrix<Index,Dynamic,1> >(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); }
+    const  Eigen::Map<const Matrix<Index,Dynamic,1> > innerNonZeros() const { return Eigen::Map<const Matrix<Index,Dynamic,1> >(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); }
+
+  public:
+    
+    /** \returns whether \c *this is in compressed form. */
+    inline bool isCompressed() const { return m_innerNonZeros==0; }
+
+    /** \returns the number of rows of the matrix */
+    inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
+    /** \returns the number of columns of the matrix */
+    inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
+
+    /** \returns the number of rows (resp. columns) of the matrix if the storage order column major (resp. row major) */
+    inline Index innerSize() const { return m_innerSize; }
+    /** \returns the number of columns (resp. rows) of the matrix if the storage order column major (resp. row major) */
+    inline Index outerSize() const { return m_outerSize; }
+    
+    /** \returns a const pointer to the array of values.
+      * This function is aimed at interoperability with other libraries.
+      * \sa innerIndexPtr(), outerIndexPtr() */
+    inline const Scalar* valuePtr() const { return &m_data.value(0); }
+    /** \returns a non-const pointer to the array of values.
+      * This function is aimed at interoperability with other libraries.
+      * \sa innerIndexPtr(), outerIndexPtr() */
+    inline Scalar* valuePtr() { return &m_data.value(0); }
+
+    /** \returns a const pointer to the array of inner indices.
+      * This function is aimed at interoperability with other libraries.
+      * \sa valuePtr(), outerIndexPtr() */
+    inline const Index* innerIndexPtr() const { return &m_data.index(0); }
+    /** \returns a non-const pointer to the array of inner indices.
+      * This function is aimed at interoperability with other libraries.
+      * \sa valuePtr(), outerIndexPtr() */
+    inline Index* innerIndexPtr() { return &m_data.index(0); }
+
+    /** \returns a const pointer to the array of the starting positions of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \sa valuePtr(), innerIndexPtr() */
+    inline const Index* outerIndexPtr() const { return m_outerIndex; }
+    /** \returns a non-const pointer to the array of the starting positions of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \sa valuePtr(), innerIndexPtr() */
+    inline Index* outerIndexPtr() { return m_outerIndex; }
+
+    /** \returns a const pointer to the array of the number of non zeros of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \warning it returns the null pointer 0 in compressed mode */
+    inline const Index* innerNonZeroPtr() const { return m_innerNonZeros; }
+    /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \warning it returns the null pointer 0 in compressed mode */
+    inline Index* innerNonZeroPtr() { return m_innerNonZeros; }
+
+    /** \internal */
+    inline Storage& data() { return m_data; }
+    /** \internal */
+    inline const Storage& data() const { return m_data; }
+
+    /** \returns the value of the matrix at position \a i, \a j
+      * This function returns Scalar(0) if the element is an explicit \em zero */
+    inline Scalar coeff(Index row, Index col) const
+    {
+      eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
+      
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+      Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer+1];
+      return m_data.atInRange(m_outerIndex[outer], end, inner);
+    }
+
+    /** \returns a non-const reference to the value of the matrix at position \a i, \a j
+      *
+      * If the element does not exist then it is inserted via the insert(Index,Index) function
+      * which itself turns the matrix into a non compressed form if that was not the case.
+      *
+      * This is a O(log(nnz_j)) operation (binary search) plus the cost of insert(Index,Index)
+      * function if the element does not already exist.
+      */
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
+      
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+
+      Index start = m_outerIndex[outer];
+      Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer+1];
+      eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
+      if(end<=start)
+        return insert(row,col);
+      const Index p = m_data.searchLowerIndex(start,end-1,inner);
+      if((p<end) && (m_data.index(p)==inner))
+        return m_data.value(p);
+      else
+        return insert(row,col);
+    }
+
+    /** \returns a reference to a novel non zero coefficient with coordinates \a row x \a col.
+      * The non zero coefficient must \b not already exist.
+      *
+      * If the matrix \c *this is in compressed mode, then \c *this is turned into uncompressed
+      * mode while reserving room for 2 non zeros per inner vector. It is strongly recommended to first
+      * call reserve(const SizesType &) to reserve a more appropriate number of elements per
+      * inner vector that better match your scenario.
+      *
+      * This function performs a sorted insertion in O(1) if the elements of each inner vector are
+      * inserted in increasing inner index order, and in O(nnz_j) for a random insertion.
+      *
+      */
+    Scalar& insert(Index row, Index col)
+    {
+      eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
+      
+      if(isCompressed())
+      {
+        reserve(Matrix<Index,Dynamic,1>::Constant(outerSize(), 2));
+      }
+      return insertUncompressed(row,col);
+    }
+
+  public:
+
+    class InnerIterator;
+    class ReverseInnerIterator;
+
+    /** Removes all non zeros but keep allocated memory */
+    inline void setZero()
+    {
+      m_data.clear();
+      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(Index));
+      if(m_innerNonZeros)
+        memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(Index));
+    }
+
+    /** \returns the number of non zero coefficients */
+    inline Index nonZeros() const
+    {
+      if(m_innerNonZeros)
+        return innerNonZeros().sum();
+      return static_cast<Index>(m_data.size());
+    }
+
+    /** Preallocates \a reserveSize non zeros.
+      *
+      * Precondition: the matrix must be in compressed mode. */
+    inline void reserve(Index reserveSize)
+    {
+      eigen_assert(isCompressed() && "This function does not make sense in non compressed mode.");
+      m_data.reserve(reserveSize);
+    }
+    
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    /** Preallocates \a reserveSize[\c j] non zeros for each column (resp. row) \c j.
+      *
+      * This function turns the matrix in non-compressed mode */
+    template<class SizesType>
+    inline void reserve(const SizesType& reserveSizes);
+    #else
+    template<class SizesType>
+    inline void reserve(const SizesType& reserveSizes, const typename SizesType::value_type& enableif = typename SizesType::value_type())
+    {
+      EIGEN_UNUSED_VARIABLE(enableif);
+      reserveInnerVectors(reserveSizes);
+    }
+    template<class SizesType>
+    inline void reserve(const SizesType& reserveSizes, const typename SizesType::Scalar& enableif =
+    #if (!defined(_MSC_VER)) || (_MSC_VER>=1500) // MSVC 2005 fails to compile with this typename
+        typename
+    #endif
+        SizesType::Scalar())
+    {
+      EIGEN_UNUSED_VARIABLE(enableif);
+      reserveInnerVectors(reserveSizes);
+    }
+    #endif // EIGEN_PARSED_BY_DOXYGEN
+  protected:
+    template<class SizesType>
+    inline void reserveInnerVectors(const SizesType& reserveSizes)
+    {
+      if(isCompressed())
+      {
+        std::size_t totalReserveSize = 0;
+        // turn the matrix into non-compressed mode
+        m_innerNonZeros = static_cast<Index*>(std::malloc(m_outerSize * sizeof(Index)));
+        if (!m_innerNonZeros) internal::throw_std_bad_alloc();
+        
+        // temporarily use m_innerSizes to hold the new starting points.
+        Index* newOuterIndex = m_innerNonZeros;
+        
+        Index count = 0;
+        for(Index j=0; j<m_outerSize; ++j)
+        {
+          newOuterIndex[j] = count;
+          count += reserveSizes[j] + (m_outerIndex[j+1]-m_outerIndex[j]);
+          totalReserveSize += reserveSizes[j];
+        }
+        m_data.reserve(totalReserveSize);
+        Index previousOuterIndex = m_outerIndex[m_outerSize];
+        for(Index j=m_outerSize-1; j>=0; --j)
+        {
+          Index innerNNZ = previousOuterIndex - m_outerIndex[j];
+          for(Index i=innerNNZ-1; i>=0; --i)
+          {
+            m_data.index(newOuterIndex[j]+i) = m_data.index(m_outerIndex[j]+i);
+            m_data.value(newOuterIndex[j]+i) = m_data.value(m_outerIndex[j]+i);
+          }
+          previousOuterIndex = m_outerIndex[j];
+          m_outerIndex[j] = newOuterIndex[j];
+          m_innerNonZeros[j] = innerNNZ;
+        }
+        m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1];
+        
+        m_data.resize(m_outerIndex[m_outerSize]);
+      }
+      else
+      {
+        Index* newOuterIndex = static_cast<Index*>(std::malloc((m_outerSize+1)*sizeof(Index)));
+        if (!newOuterIndex) internal::throw_std_bad_alloc();
+        
+        Index count = 0;
+        for(Index j=0; j<m_outerSize; ++j)
+        {
+          newOuterIndex[j] = count;
+          Index alreadyReserved = (m_outerIndex[j+1]-m_outerIndex[j]) - m_innerNonZeros[j];
+          Index toReserve = std::max<Index>(reserveSizes[j], alreadyReserved);
+          count += toReserve + m_innerNonZeros[j];
+        }
+        newOuterIndex[m_outerSize] = count;
+        
+        m_data.resize(count);
+        for(Index j=m_outerSize-1; j>=0; --j)
+        {
+          Index offset = newOuterIndex[j] - m_outerIndex[j];
+          if(offset>0)
+          {
+            Index innerNNZ = m_innerNonZeros[j];
+            for(Index i=innerNNZ-1; i>=0; --i)
+            {
+              m_data.index(newOuterIndex[j]+i) = m_data.index(m_outerIndex[j]+i);
+              m_data.value(newOuterIndex[j]+i) = m_data.value(m_outerIndex[j]+i);
+            }
+          }
+        }
+        
+        std::swap(m_outerIndex, newOuterIndex);
+        std::free(newOuterIndex);
+      }
+      
+    }
+  public:
+
+    //--- low level purely coherent filling ---
+
+    /** \internal
+      * \returns a reference to the non zero coefficient at position \a row, \a col assuming that:
+      * - the nonzero does not already exist
+      * - the new coefficient is the last one according to the storage order
+      *
+      * Before filling a given inner vector you must call the statVec(Index) function.
+      *
+      * After an insertion session, you should call the finalize() function.
+      *
+      * \sa insert, insertBackByOuterInner, startVec */
+    inline Scalar& insertBack(Index row, Index col)
+    {
+      return insertBackByOuterInner(IsRowMajor?row:col, IsRowMajor?col:row);
+    }
+
+    /** \internal
+      * \sa insertBack, startVec */
+    inline Scalar& insertBackByOuterInner(Index outer, Index inner)
+    {
+      eigen_assert(size_t(m_outerIndex[outer+1]) == m_data.size() && "Invalid ordered insertion (invalid outer index)");
+      eigen_assert( (m_outerIndex[outer+1]-m_outerIndex[outer]==0 || m_data.index(m_data.size()-1)<inner) && "Invalid ordered insertion (invalid inner index)");
+      Index p = m_outerIndex[outer+1];
+      ++m_outerIndex[outer+1];
+      m_data.append(0, inner);
+      return m_data.value(p);
+    }
+
+    /** \internal
+      * \warning use it only if you know what you are doing */
+    inline Scalar& insertBackByOuterInnerUnordered(Index outer, Index inner)
+    {
+      Index p = m_outerIndex[outer+1];
+      ++m_outerIndex[outer+1];
+      m_data.append(0, inner);
+      return m_data.value(p);
+    }
+
+    /** \internal
+      * \sa insertBack, insertBackByOuterInner */
+    inline void startVec(Index outer)
+    {
+      eigen_assert(m_outerIndex[outer]==Index(m_data.size()) && "You must call startVec for each inner vector sequentially");
+      eigen_assert(m_outerIndex[outer+1]==0 && "You must call startVec for each inner vector sequentially");
+      m_outerIndex[outer+1] = m_outerIndex[outer];
+    }
+
+    /** \internal
+      * Must be called after inserting a set of non zero entries using the low level compressed API.
+      */
+    inline void finalize()
+    {
+      if(isCompressed())
+      {
+        Index size = static_cast<Index>(m_data.size());
+        Index i = m_outerSize;
+        // find the last filled column
+        while (i>=0 && m_outerIndex[i]==0)
+          --i;
+        ++i;
+        while (i<=m_outerSize)
+        {
+          m_outerIndex[i] = size;
+          ++i;
+        }
+      }
+    }
+
+    //---
+
+    template<typename InputIterators>
+    void setFromTriplets(const InputIterators& begin, const InputIterators& end);
+
+    void sumupDuplicates();
+
+    //---
+    
+    /** \internal
+      * same as insert(Index,Index) except that the indices are given relative to the storage order */
+    Scalar& insertByOuterInner(Index j, Index i)
+    {
+      return insert(IsRowMajor ? j : i, IsRowMajor ? i : j);
+    }
+
+    /** Turns the matrix into the \em compressed format.
+      */
+    void makeCompressed()
+    {
+      if(isCompressed())
+        return;
+      
+      Index oldStart = m_outerIndex[1];
+      m_outerIndex[1] = m_innerNonZeros[0];
+      for(Index j=1; j<m_outerSize; ++j)
+      {
+        Index nextOldStart = m_outerIndex[j+1];
+        Index offset = oldStart - m_outerIndex[j];
+        if(offset>0)
+        {
+          for(Index k=0; k<m_innerNonZeros[j]; ++k)
+          {
+            m_data.index(m_outerIndex[j]+k) = m_data.index(oldStart+k);
+            m_data.value(m_outerIndex[j]+k) = m_data.value(oldStart+k);
+          }
+        }
+        m_outerIndex[j+1] = m_outerIndex[j] + m_innerNonZeros[j];
+        oldStart = nextOldStart;
+      }
+      std::free(m_innerNonZeros);
+      m_innerNonZeros = 0;
+      m_data.resize(m_outerIndex[m_outerSize]);
+      m_data.squeeze();
+    }
+
+    /** Turns the matrix into the uncompressed mode */
+    void uncompress()
+    {
+      if(m_innerNonZeros != 0)
+        return; 
+      m_innerNonZeros = static_cast<Index*>(std::malloc(m_outerSize * sizeof(Index)));
+      for (Index i = 0; i < m_outerSize; i++)
+      {
+        m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; 
+      }
+    }
+    
+    /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerence \a epsilon */
+    void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
+    {
+      prune(default_prunning_func(reference,epsilon));
+    }
+    
+    /** Turns the matrix into compressed format, and suppresses all nonzeros which do not satisfy the predicate \a keep.
+      * The functor type \a KeepFunc must implement the following function:
+      * \code
+      * bool operator() (const Index& row, const Index& col, const Scalar& value) const;
+      * \endcode
+      * \sa prune(Scalar,RealScalar)
+      */
+    template<typename KeepFunc>
+    void prune(const KeepFunc& keep = KeepFunc())
+    {
+      // TODO optimize the uncompressed mode to avoid moving and allocating the data twice
+      // TODO also implement a unit test
+      makeCompressed();
+
+      Index k = 0;
+      for(Index j=0; j<m_outerSize; ++j)
+      {
+        Index previousStart = m_outerIndex[j];
+        m_outerIndex[j] = k;
+        Index end = m_outerIndex[j+1];
+        for(Index i=previousStart; i<end; ++i)
+        {
+          if(keep(IsRowMajor?j:m_data.index(i), IsRowMajor?m_data.index(i):j, m_data.value(i)))
+          {
+            m_data.value(k) = m_data.value(i);
+            m_data.index(k) = m_data.index(i);
+            ++k;
+          }
+        }
+      }
+      m_outerIndex[m_outerSize] = k;
+      m_data.resize(k,0);
+    }
+
+    /** Resizes the matrix to a \a rows x \a cols matrix leaving old values untouched.
+      * \sa resizeNonZeros(Index), reserve(), setZero()
+      */
+    void conservativeResize(Index rows, Index cols) 
+    {
+      // No change
+      if (this->rows() == rows && this->cols() == cols) return;
+      
+      // If one dimension is null, then there is nothing to be preserved
+      if(rows==0 || cols==0) return resize(rows,cols);
+
+      Index innerChange = IsRowMajor ? cols - this->cols() : rows - this->rows();
+      Index outerChange = IsRowMajor ? rows - this->rows() : cols - this->cols();
+      Index newInnerSize = IsRowMajor ? cols : rows;
+
+      // Deals with inner non zeros
+      if (m_innerNonZeros)
+      {
+        // Resize m_innerNonZeros
+        Index *newInnerNonZeros = static_cast<Index*>(std::realloc(m_innerNonZeros, (m_outerSize + outerChange) * sizeof(Index)));
+        if (!newInnerNonZeros) internal::throw_std_bad_alloc();
+        m_innerNonZeros = newInnerNonZeros;
+        
+        for(Index i=m_outerSize; i<m_outerSize+outerChange; i++)          
+          m_innerNonZeros[i] = 0;
+      } 
+      else if (innerChange < 0) 
+      {
+        // Inner size decreased: allocate a new m_innerNonZeros
+        m_innerNonZeros = static_cast<Index*>(std::malloc((m_outerSize+outerChange+1) * sizeof(Index)));
+        if (!m_innerNonZeros) internal::throw_std_bad_alloc();
+        for(Index i = 0; i < m_outerSize; i++)
+          m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i];
+      }
+      
+      // Change the m_innerNonZeros in case of a decrease of inner size
+      if (m_innerNonZeros && innerChange < 0)
+      {
+        for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++)
+        {
+          Index &n = m_innerNonZeros[i];
+          Index start = m_outerIndex[i];
+          while (n > 0 && m_data.index(start+n-1) >= newInnerSize) --n; 
+        }
+      }
+      
+      m_innerSize = newInnerSize;
+
+      // Re-allocate outer index structure if necessary
+      if (outerChange == 0)
+        return;
+          
+      Index *newOuterIndex = static_cast<Index*>(std::realloc(m_outerIndex, (m_outerSize + outerChange + 1) * sizeof(Index)));
+      if (!newOuterIndex) internal::throw_std_bad_alloc();
+      m_outerIndex = newOuterIndex;
+      if (outerChange > 0)
+      {
+        Index last = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
+        for(Index i=m_outerSize; i<m_outerSize+outerChange+1; i++)          
+          m_outerIndex[i] = last; 
+      }
+      m_outerSize += outerChange;
+    }
+    
+    /** Resizes the matrix to a \a rows x \a cols matrix and initializes it to zero.
+      * \sa resizeNonZeros(Index), reserve(), setZero()
+      */
+    void resize(Index rows, Index cols)
+    {
+      const Index outerSize = IsRowMajor ? rows : cols;
+      m_innerSize = IsRowMajor ? cols : rows;
+      m_data.clear();
+      if (m_outerSize != outerSize || m_outerSize==0)
+      {
+        std::free(m_outerIndex);
+        m_outerIndex = static_cast<Index*>(std::malloc((outerSize + 1) * sizeof(Index)));
+        if (!m_outerIndex) internal::throw_std_bad_alloc();
+        
+        m_outerSize = outerSize;
+      }
+      if(m_innerNonZeros)
+      {
+        std::free(m_innerNonZeros);
+        m_innerNonZeros = 0;
+      }
+      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(Index));
+    }
+
+    /** \internal
+      * Resize the nonzero vector to \a size */
+    void resizeNonZeros(Index size)
+    {
+      // TODO remove this function
+      m_data.resize(size);
+    }
+
+    /** \returns a const expression of the diagonal coefficients */
+    const Diagonal<const SparseMatrix> diagonal() const { return *this; }
+
+    /** Default constructor yielding an empty \c 0 \c x \c 0 matrix */
+    inline SparseMatrix()
+      : m_outerSize(-1), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
+    {
+      check_template_parameters();
+      resize(0, 0);
+    }
+
+    /** Constructs a \a rows \c x \a cols empty matrix */
+    inline SparseMatrix(Index rows, Index cols)
+      : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
+    {
+      check_template_parameters();
+      resize(rows, cols);
+    }
+
+    /** Constructs a sparse matrix from the sparse expression \a other */
+    template<typename OtherDerived>
+    inline SparseMatrix(const SparseMatrixBase<OtherDerived>& other)
+      : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
+    {
+      EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+      check_template_parameters();
+      *this = other.derived();
+    }
+    
+    /** Constructs a sparse matrix from the sparse selfadjoint view \a other */
+    template<typename OtherDerived, unsigned int UpLo>
+    inline SparseMatrix(const SparseSelfAdjointView<OtherDerived, UpLo>& other)
+      : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
+    {
+      check_template_parameters();
+      *this = other;
+    }
+
+    /** Copy constructor (it performs a deep copy) */
+    inline SparseMatrix(const SparseMatrix& other)
+      : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
+    {
+      check_template_parameters();
+      *this = other.derived();
+    }
+
+    /** \brief Copy constructor with in-place evaluation */
+    template<typename OtherDerived>
+    SparseMatrix(const ReturnByValue<OtherDerived>& other)
+      : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
+    {
+      check_template_parameters();
+      initAssignment(other);
+      other.evalTo(*this);
+    }
+
+    /** Swaps the content of two sparse matrices of the same type.
+      * This is a fast operation that simply swaps the underlying pointers and parameters. */
+    inline void swap(SparseMatrix& other)
+    {
+      //EIGEN_DBG_SPARSE(std::cout << "SparseMatrix:: swap\n");
+      std::swap(m_outerIndex, other.m_outerIndex);
+      std::swap(m_innerSize, other.m_innerSize);
+      std::swap(m_outerSize, other.m_outerSize);
+      std::swap(m_innerNonZeros, other.m_innerNonZeros);
+      m_data.swap(other.m_data);
+    }
+
+    /** Sets *this to the identity matrix */
+    inline void setIdentity()
+    {
+      eigen_assert(rows() == cols() && "ONLY FOR SQUARED MATRICES");
+      this->m_data.resize(rows());
+      Eigen::Map<Matrix<Index, Dynamic, 1> >(&this->m_data.index(0), rows()).setLinSpaced(0, rows()-1);
+      Eigen::Map<Matrix<Scalar, Dynamic, 1> >(&this->m_data.value(0), rows()).setOnes();
+      Eigen::Map<Matrix<Index, Dynamic, 1> >(this->m_outerIndex, rows()+1).setLinSpaced(0, rows());
+    }
+    inline SparseMatrix& operator=(const SparseMatrix& other)
+    {
+      if (other.isRValue())
+      {
+        swap(other.const_cast_derived());
+      }
+      else if(this!=&other)
+      {
+        initAssignment(other);
+        if(other.isCompressed())
+        {
+          memcpy(m_outerIndex, other.m_outerIndex, (m_outerSize+1)*sizeof(Index));
+          m_data = other.m_data;
+        }
+        else
+        {
+          Base::operator=(other);
+        }
+      }
+      return *this;
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename Lhs, typename Rhs>
+    inline SparseMatrix& operator=(const SparseSparseProduct<Lhs,Rhs>& product)
+    { return Base::operator=(product); }
+    
+    template<typename OtherDerived>
+    inline SparseMatrix& operator=(const ReturnByValue<OtherDerived>& other)
+    {
+      initAssignment(other);
+      return Base::operator=(other.derived());
+    }
+    
+    template<typename OtherDerived>
+    inline SparseMatrix& operator=(const EigenBase<OtherDerived>& other)
+    { return Base::operator=(other.derived()); }
+    #endif
+
+    template<typename OtherDerived>
+    EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase<OtherDerived>& other);
+
+    friend std::ostream & operator << (std::ostream & s, const SparseMatrix& m)
+    {
+      EIGEN_DBG_SPARSE(
+        s << "Nonzero entries:\n";
+        if(m.isCompressed())
+          for (Index i=0; i<m.nonZeros(); ++i)
+            s << "(" << m.m_data.value(i) << "," << m.m_data.index(i) << ") ";
+        else
+          for (Index i=0; i<m.outerSize(); ++i)
+          {
+            Index p = m.m_outerIndex[i];
+            Index pe = m.m_outerIndex[i]+m.m_innerNonZeros[i];
+            Index k=p;
+            for (; k<pe; ++k)
+              s << "(" << m.m_data.value(k) << "," << m.m_data.index(k) << ") ";
+            for (; k<m.m_outerIndex[i+1]; ++k)
+              s << "(_,_) ";
+          }
+        s << std::endl;
+        s << std::endl;
+        s << "Outer pointers:\n";
+        for (Index i=0; i<m.outerSize(); ++i)
+          s << m.m_outerIndex[i] << " ";
+        s << " $" << std::endl;
+        if(!m.isCompressed())
+        {
+          s << "Inner non zeros:\n";
+          for (Index i=0; i<m.outerSize(); ++i)
+            s << m.m_innerNonZeros[i] << " ";
+          s << " $" << std::endl;
+        }
+        s << std::endl;
+      );
+      s << static_cast<const SparseMatrixBase<SparseMatrix>&>(m);
+      return s;
+    }
+
+    /** Destructor */
+    inline ~SparseMatrix()
+    {
+      std::free(m_outerIndex);
+      std::free(m_innerNonZeros);
+    }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** Overloaded for performance */
+    Scalar sum() const;
+#endif
+    
+#   ifdef EIGEN_SPARSEMATRIX_PLUGIN
+#     include EIGEN_SPARSEMATRIX_PLUGIN
+#   endif
+
+protected:
+
+    template<typename Other>
+    void initAssignment(const Other& other)
+    {
+      resize(other.rows(), other.cols());
+      if(m_innerNonZeros)
+      {
+        std::free(m_innerNonZeros);
+        m_innerNonZeros = 0;
+      }
+    }
+
+    /** \internal
+      * \sa insert(Index,Index) */
+    EIGEN_DONT_INLINE Scalar& insertCompressed(Index row, Index col);
+
+    /** \internal
+      * A vector object that is equal to 0 everywhere but v at the position i */
+    class SingletonVector
+    {
+        Index m_index;
+        Index m_value;
+      public:
+        typedef Index value_type;
+        SingletonVector(Index i, Index v)
+          : m_index(i), m_value(v)
+        {}
+
+        Index operator[](Index i) const { return i==m_index ? m_value : 0; }
+    };
+
+    /** \internal
+      * \sa insert(Index,Index) */
+    EIGEN_DONT_INLINE Scalar& insertUncompressed(Index row, Index col);
+
+public:
+    /** \internal
+      * \sa insert(Index,Index) */
+    EIGEN_STRONG_INLINE Scalar& insertBackUncompressed(Index row, Index col)
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+
+      eigen_assert(!isCompressed());
+      eigen_assert(m_innerNonZeros[outer]<=(m_outerIndex[outer+1] - m_outerIndex[outer]));
+
+      Index p = m_outerIndex[outer] + m_innerNonZeros[outer]++;
+      m_data.index(p) = inner;
+      return (m_data.value(p) = 0);
+    }
+
+private:
+  static void check_template_parameters()
+  {
+    EIGEN_STATIC_ASSERT(NumTraits<Index>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
+    EIGEN_STATIC_ASSERT((Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS);
+  }
+
+  struct default_prunning_func {
+    default_prunning_func(const Scalar& ref, const RealScalar& eps) : reference(ref), epsilon(eps) {}
+    inline bool operator() (const Index&, const Index&, const Scalar& value) const
+    {
+      return !internal::isMuchSmallerThan(value, reference, epsilon);
+    }
+    Scalar reference;
+    RealScalar epsilon;
+  };
+};
+
+template<typename Scalar, int _Options, typename _Index>
+class SparseMatrix<Scalar,_Options,_Index>::InnerIterator
+{
+  public:
+    InnerIterator(const SparseMatrix& mat, Index outer)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.m_outerIndex[outer])
+    {
+      if(mat.isCompressed())
+        m_end = mat.m_outerIndex[outer+1];
+      else
+        m_end = m_id + mat.m_innerNonZeros[outer];
+    }
+
+    inline InnerIterator& operator++() { m_id++; return *this; }
+
+    inline const Scalar& value() const { return m_values[m_id]; }
+    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); }
+
+    inline Index index() const { return m_indices[m_id]; }
+    inline Index outer() const { return m_outer; }
+    inline Index row() const { return IsRowMajor ? m_outer : index(); }
+    inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+    inline operator bool() const { return (m_id < m_end); }
+
+  protected:
+    const Scalar* m_values;
+    const Index* m_indices;
+    const Index m_outer;
+    Index m_id;
+    Index m_end;
+};
+
+template<typename Scalar, int _Options, typename _Index>
+class SparseMatrix<Scalar,_Options,_Index>::ReverseInnerIterator
+{
+  public:
+    ReverseInnerIterator(const SparseMatrix& mat, Index outer)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.m_outerIndex[outer])
+    {
+      if(mat.isCompressed())
+        m_id = mat.m_outerIndex[outer+1];
+      else
+        m_id = m_start + mat.m_innerNonZeros[outer];
+    }
+
+    inline ReverseInnerIterator& operator--() { --m_id; return *this; }
+
+    inline const Scalar& value() const { return m_values[m_id-1]; }
+    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); }
+
+    inline Index index() const { return m_indices[m_id-1]; }
+    inline Index outer() const { return m_outer; }
+    inline Index row() const { return IsRowMajor ? m_outer : index(); }
+    inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+    inline operator bool() const { return (m_id > m_start); }
+
+  protected:
+    const Scalar* m_values;
+    const Index* m_indices;
+    const Index m_outer;
+    Index m_id;
+    const Index m_start;
+};
+
+namespace internal {
+
+template<typename InputIterator, typename SparseMatrixType>
+void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat, int Options = 0)
+{
+  EIGEN_UNUSED_VARIABLE(Options);
+  enum { IsRowMajor = SparseMatrixType::IsRowMajor };
+  typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::Index Index;
+  SparseMatrix<Scalar,IsRowMajor?ColMajor:RowMajor,Index> trMat(mat.rows(),mat.cols());
+
+  if(begin!=end)
+  {
+    // pass 1: count the nnz per inner-vector
+    Matrix<Index,Dynamic,1> wi(trMat.outerSize());
+    wi.setZero();
+    for(InputIterator it(begin); it!=end; ++it)
+    {
+      eigen_assert(it->row()>=0 && it->row()<mat.rows() && it->col()>=0 && it->col()<mat.cols());
+      wi(IsRowMajor ? it->col() : it->row())++;
+    }
+
+    // pass 2: insert all the elements into trMat
+    trMat.reserve(wi);
+    for(InputIterator it(begin); it!=end; ++it)
+      trMat.insertBackUncompressed(it->row(),it->col()) = it->value();
+
+    // pass 3:
+    trMat.sumupDuplicates();
+  }
+
+  // pass 4: transposed copy -> implicit sorting
+  mat = trMat;
+}
+
+}
+
+
+/** Fill the matrix \c *this with the list of \em triplets defined by the iterator range \a begin - \a end.
+  *
+  * A \em triplet is a tuple (i,j,value) defining a non-zero element.
+  * The input list of triplets does not have to be sorted, and can contains duplicated elements.
+  * In any case, the result is a \b sorted and \b compressed sparse matrix where the duplicates have been summed up.
+  * This is a \em O(n) operation, with \em n the number of triplet elements.
+  * The initial contents of \c *this is destroyed.
+  * The matrix \c *this must be properly resized beforehand using the SparseMatrix(Index,Index) constructor,
+  * or the resize(Index,Index) method. The sizes are not extracted from the triplet list.
+  *
+  * The \a InputIterators value_type must provide the following interface:
+  * \code
+  * Scalar value() const; // the value
+  * Scalar row() const;   // the row index i
+  * Scalar col() const;   // the column index j
+  * \endcode
+  * See for instance the Eigen::Triplet template class.
+  *
+  * Here is a typical usage example:
+  * \code
+    typedef Triplet<double> T;
+    std::vector<T> tripletList;
+    triplets.reserve(estimation_of_entries);
+    for(...)
+    {
+      // ...
+      tripletList.push_back(T(i,j,v_ij));
+    }
+    SparseMatrixType m(rows,cols);
+    m.setFromTriplets(tripletList.begin(), tripletList.end());
+    // m is ready to go!
+  * \endcode
+  *
+  * \warning The list of triplets is read multiple times (at least twice). Therefore, it is not recommended to define
+  * an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather
+  * be explicitely stored into a std::vector for instance.
+  */
+template<typename Scalar, int _Options, typename _Index>
+template<typename InputIterators>
+void SparseMatrix<Scalar,_Options,_Index>::setFromTriplets(const InputIterators& begin, const InputIterators& end)
+{
+  internal::set_from_triplets(begin, end, *this);
+}
+
+/** \internal */
+template<typename Scalar, int _Options, typename _Index>
+void SparseMatrix<Scalar,_Options,_Index>::sumupDuplicates()
+{
+  eigen_assert(!isCompressed());
+  // TODO, in practice we should be able to use m_innerNonZeros for that task
+  Matrix<Index,Dynamic,1> wi(innerSize());
+  wi.fill(-1);
+  Index count = 0;
+  // for each inner-vector, wi[inner_index] will hold the position of first element into the index/value buffers
+  for(Index j=0; j<outerSize(); ++j)
+  {
+    Index start   = count;
+    Index oldEnd  = m_outerIndex[j]+m_innerNonZeros[j];
+    for(Index k=m_outerIndex[j]; k<oldEnd; ++k)
+    {
+      Index i = m_data.index(k);
+      if(wi(i)>=start)
+      {
+        // we already meet this entry => accumulate it
+        m_data.value(wi(i)) += m_data.value(k);
+      }
+      else
+      {
+        m_data.value(count) = m_data.value(k);
+        m_data.index(count) = m_data.index(k);
+        wi(i) = count;
+        ++count;
+      }
+    }
+    m_outerIndex[j] = start;
+  }
+  m_outerIndex[m_outerSize] = count;
+
+  // turn the matrix into compressed form
+  std::free(m_innerNonZeros);
+  m_innerNonZeros = 0;
+  m_data.resize(m_outerIndex[m_outerSize]);
+}
+
+template<typename Scalar, int _Options, typename _Index>
+template<typename OtherDerived>
+EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Options,_Index>::operator=(const SparseMatrixBase<OtherDerived>& other)
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+  
+  const bool needToTranspose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
+  if (needToTranspose)
+  {
+    // two passes algorithm:
+    //  1 - compute the number of coeffs per dest inner vector
+    //  2 - do the actual copy/eval
+    // Since each coeff of the rhs has to be evaluated twice, let's evaluate it if needed
+    typedef typename internal::nested<OtherDerived,2>::type OtherCopy;
+    typedef typename internal::remove_all<OtherCopy>::type _OtherCopy;
+    OtherCopy otherCopy(other.derived());
+
+    SparseMatrix dest(other.rows(),other.cols());
+    Eigen::Map<Matrix<Index, Dynamic, 1> > (dest.m_outerIndex,dest.outerSize()).setZero();
+
+    // pass 1
+    // FIXME the above copy could be merged with that pass
+    for (Index j=0; j<otherCopy.outerSize(); ++j)
+      for (typename _OtherCopy::InnerIterator it(otherCopy, j); it; ++it)
+        ++dest.m_outerIndex[it.index()];
+
+    // prefix sum
+    Index count = 0;
+    Matrix<Index,Dynamic,1> positions(dest.outerSize());
+    for (Index j=0; j<dest.outerSize(); ++j)
+    {
+      Index tmp = dest.m_outerIndex[j];
+      dest.m_outerIndex[j] = count;
+      positions[j] = count;
+      count += tmp;
+    }
+    dest.m_outerIndex[dest.outerSize()] = count;
+    // alloc
+    dest.m_data.resize(count);
+    // pass 2
+    for (Index j=0; j<otherCopy.outerSize(); ++j)
+    {
+      for (typename _OtherCopy::InnerIterator it(otherCopy, j); it; ++it)
+      {
+        Index pos = positions[it.index()]++;
+        dest.m_data.index(pos) = j;
+        dest.m_data.value(pos) = it.value();
+      }
+    }
+    this->swap(dest);
+    return *this;
+  }
+  else
+  {
+    if(other.isRValue())
+      initAssignment(other.derived());
+    // there is no special optimization
+    return Base::operator=(other.derived());
+  }
+}
+
+template<typename _Scalar, int _Options, typename _Index>
+EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertUncompressed(Index row, Index col)
+{
+  eigen_assert(!isCompressed());
+
+  const Index outer = IsRowMajor ? row : col;
+  const Index inner = IsRowMajor ? col : row;
+
+  Index room = m_outerIndex[outer+1] - m_outerIndex[outer];
+  Index innerNNZ = m_innerNonZeros[outer];
+  if(innerNNZ>=room)
+  {
+    // this inner vector is full, we need to reallocate the whole buffer :(
+    reserve(SingletonVector(outer,std::max<Index>(2,innerNNZ)));
+  }
+
+  Index startId = m_outerIndex[outer];
+  Index p = startId + m_innerNonZeros[outer];
+  while ( (p > startId) && (m_data.index(p-1) > inner) )
+  {
+    m_data.index(p) = m_data.index(p-1);
+    m_data.value(p) = m_data.value(p-1);
+    --p;
+  }
+  eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exist, you must call coeffRef to this end");
+
+  m_innerNonZeros[outer]++;
+
+  m_data.index(p) = inner;
+  return (m_data.value(p) = 0);
+}
+
+template<typename _Scalar, int _Options, typename _Index>
+EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertCompressed(Index row, Index col)
+{
+  eigen_assert(isCompressed());
+
+  const Index outer = IsRowMajor ? row : col;
+  const Index inner = IsRowMajor ? col : row;
+
+  Index previousOuter = outer;
+  if (m_outerIndex[outer+1]==0)
+  {
+    // we start a new inner vector
+    while (previousOuter>=0 && m_outerIndex[previousOuter]==0)
+    {
+      m_outerIndex[previousOuter] = static_cast<Index>(m_data.size());
+      --previousOuter;
+    }
+    m_outerIndex[outer+1] = m_outerIndex[outer];
+  }
+
+  // here we have to handle the tricky case where the outerIndex array
+  // starts with: [ 0 0 0 0 0 1 ...] and we are inserted in, e.g.,
+  // the 2nd inner vector...
+  bool isLastVec = (!(previousOuter==-1 && m_data.size()!=0))
+                && (size_t(m_outerIndex[outer+1]) == m_data.size());
+
+  size_t startId = m_outerIndex[outer];
+  // FIXME let's make sure sizeof(long int) == sizeof(size_t)
+  size_t p = m_outerIndex[outer+1];
+  ++m_outerIndex[outer+1];
+
+  double reallocRatio = 1;
+  if (m_data.allocatedSize()<=m_data.size())
+  {
+    // if there is no preallocated memory, let's reserve a minimum of 32 elements
+    if (m_data.size()==0)
+    {
+      m_data.reserve(32);
+    }
+    else
+    {
+      // we need to reallocate the data, to reduce multiple reallocations
+      // we use a smart resize algorithm based on the current filling ratio
+      // in addition, we use double to avoid integers overflows
+      double nnzEstimate = double(m_outerIndex[outer])*double(m_outerSize)/double(outer+1);
+      reallocRatio = (nnzEstimate-double(m_data.size()))/double(m_data.size());
+      // furthermore we bound the realloc ratio to:
+      //   1) reduce multiple minor realloc when the matrix is almost filled
+      //   2) avoid to allocate too much memory when the matrix is almost empty
+      reallocRatio = (std::min)((std::max)(reallocRatio,1.5),8.);
+    }
+  }
+  m_data.resize(m_data.size()+1,reallocRatio);
+
+  if (!isLastVec)
+  {
+    if (previousOuter==-1)
+    {
+      // oops wrong guess.
+      // let's correct the outer offsets
+      for (Index k=0; k<=(outer+1); ++k)
+        m_outerIndex[k] = 0;
+      Index k=outer+1;
+      while(m_outerIndex[k]==0)
+        m_outerIndex[k++] = 1;
+      while (k<=m_outerSize && m_outerIndex[k]!=0)
+        m_outerIndex[k++]++;
+      p = 0;
+      --k;
+      k = m_outerIndex[k]-1;
+      while (k>0)
+      {
+        m_data.index(k) = m_data.index(k-1);
+        m_data.value(k) = m_data.value(k-1);
+        k--;
+      }
+    }
+    else
+    {
+      // we are not inserting into the last inner vec
+      // update outer indices:
+      Index j = outer+2;
+      while (j<=m_outerSize && m_outerIndex[j]!=0)
+        m_outerIndex[j++]++;
+      --j;
+      // shift data of last vecs:
+      Index k = m_outerIndex[j]-1;
+      while (k>=Index(p))
+      {
+        m_data.index(k) = m_data.index(k-1);
+        m_data.value(k) = m_data.value(k-1);
+        k--;
+      }
+    }
+  }
+
+  while ( (p > startId) && (m_data.index(p-1) > inner) )
+  {
+    m_data.index(p) = m_data.index(p-1);
+    m_data.value(p) = m_data.value(p-1);
+    --p;
+  }
+
+  m_data.index(p) = inner;
+  return (m_data.value(p) = 0);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSEMATRIX_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseMatrixBase.h b/include/eigen3/Eigen/src/SparseCore/SparseMatrixBase.h
new file mode 100644
index 0000000..485e85b
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -0,0 +1,452 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEMATRIXBASE_H
+#define EIGEN_SPARSEMATRIXBASE_H
+
+namespace Eigen { 
+
+/** \ingroup SparseCore_Module
+  *
+  * \class SparseMatrixBase
+  *
+  * \brief Base class of any sparse matrices or sparse expressions
+  *
+  * \tparam Derived
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEMATRIXBASE_PLUGIN.
+  */
+template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
+{
+  public:
+
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::add_const_on_value_type_if_arithmetic<
+                         typename internal::packet_traits<Scalar>::type
+                     >::type PacketReturnType;
+
+    typedef SparseMatrixBase StorageBaseType;
+    typedef EigenBase<Derived> Base;
+    
+    template<typename OtherDerived>
+    Derived& operator=(const EigenBase<OtherDerived> &other)
+    {
+      other.derived().evalTo(derived());
+      return derived();
+    }
+
+    enum {
+
+      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+        /**< The number of rows at compile-time. This is just a copy of the value provided
+          * by the \a Derived type. If a value is not known at compile-time,
+          * it is set to the \a Dynamic constant.
+          * \sa MatrixBase::rows(), MatrixBase::cols(), ColsAtCompileTime, SizeAtCompileTime */
+
+      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+        /**< The number of columns at compile-time. This is just a copy of the value provided
+          * by the \a Derived type. If a value is not known at compile-time,
+          * it is set to the \a Dynamic constant.
+          * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */
+
+
+      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+                                                   internal::traits<Derived>::ColsAtCompileTime>::ret),
+        /**< This is equal to the number of coefficients, i.e. the number of
+          * rows times the number of columns, or to \a Dynamic if this is not
+          * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
+
+      MaxRowsAtCompileTime = RowsAtCompileTime,
+      MaxColsAtCompileTime = ColsAtCompileTime,
+
+      MaxSizeAtCompileTime = (internal::size_at_compile_time<MaxRowsAtCompileTime,
+                                                      MaxColsAtCompileTime>::ret),
+
+      IsVectorAtCompileTime = RowsAtCompileTime == 1 || ColsAtCompileTime == 1,
+        /**< This is set to true if either the number of rows or the number of
+          * columns is known at compile-time to be equal to 1. Indeed, in that case,
+          * we are dealing with a column-vector (if there is only one column) or with
+          * a row-vector (if there is only one row). */
+
+      Flags = internal::traits<Derived>::Flags,
+        /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
+          * constructed from this one. See the \ref flags "list of flags".
+          */
+
+      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
+        /**< This is a rough measure of how expensive it is to read one coefficient from
+          * this expression.
+          */
+
+      IsRowMajor = Flags&RowMajorBit ? 1 : 0,
+      
+      InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
+                             : int(IsRowMajor) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+
+      #ifndef EIGEN_PARSED_BY_DOXYGEN
+      _HasDirectAccess = (int(Flags)&DirectAccessBit) ? 1 : 0 // workaround sunCC
+      #endif
+    };
+
+    /** \internal the return type of MatrixBase::adjoint() */
+    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, Eigen::Transpose<const Derived> >,
+                        Transpose<const Derived>
+                     >::type AdjointReturnType;
+
+
+    typedef SparseMatrix<Scalar, Flags&RowMajorBit ? RowMajor : ColMajor, Index> PlainObject;
+
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is the "real scalar" type; if the \a Scalar type is already real numbers
+      * (e.g. int, float or double) then \a RealScalar is just the same as \a Scalar. If
+      * \a Scalar is \a std::complex<T> then RealScalar is \a T.
+      *
+      * \sa class NumTraits
+      */
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    /** \internal the return type of coeff()
+      */
+    typedef typename internal::conditional<_HasDirectAccess, const Scalar&, Scalar>::type CoeffReturnType;
+
+    /** \internal Represents a matrix with all coefficients equal to one another*/
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Matrix<Scalar,Dynamic,Dynamic> > ConstantReturnType;
+
+    /** type of the equivalent square matrix */
+    typedef Matrix<Scalar,EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime),
+                          EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime)> SquareMatrixType;
+
+    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    inline Derived& derived() { return *static_cast<Derived*>(this); }
+    inline Derived& const_cast_derived() const
+    { return *static_cast<Derived*>(const_cast<SparseMatrixBase*>(this)); }
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+#define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::SparseMatrixBase
+#   include "../plugins/CommonCwiseUnaryOps.h"
+#   include "../plugins/CommonCwiseBinaryOps.h"
+#   include "../plugins/MatrixCwiseUnaryOps.h"
+#   include "../plugins/MatrixCwiseBinaryOps.h"
+#   include "../plugins/BlockMethods.h"
+#   ifdef EIGEN_SPARSEMATRIXBASE_PLUGIN
+#     include EIGEN_SPARSEMATRIXBASE_PLUGIN
+#   endif
+#   undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+
+    /** \returns the number of rows. \sa cols() */
+    inline Index rows() const { return derived().rows(); }
+    /** \returns the number of columns. \sa rows() */
+    inline Index cols() const { return derived().cols(); }
+    /** \returns the number of coefficients, which is \a rows()*cols().
+      * \sa rows(), cols(). */
+    inline Index size() const { return rows() * cols(); }
+    /** \returns the number of nonzero coefficients which is in practice the number
+      * of stored coefficients. */
+    inline Index nonZeros() const { return derived().nonZeros(); }
+    /** \returns true if either the number of rows or the number of columns is equal to 1.
+      * In other words, this function returns
+      * \code rows()==1 || cols()==1 \endcode
+      * \sa rows(), cols(), IsVectorAtCompileTime. */
+    inline bool isVector() const { return rows()==1 || cols()==1; }
+    /** \returns the size of the storage major dimension,
+      * i.e., the number of columns for a columns major matrix, and the number of rows otherwise */
+    Index outerSize() const { return (int(Flags)&RowMajorBit) ? this->rows() : this->cols(); }
+    /** \returns the size of the inner dimension according to the storage order,
+      * i.e., the number of rows for a columns major matrix, and the number of cols otherwise */
+    Index innerSize() const { return (int(Flags)&RowMajorBit) ? this->cols() : this->rows(); }
+
+    bool isRValue() const { return m_isRValue; }
+    Derived& markAsRValue() { m_isRValue = true; return derived(); }
+
+    SparseMatrixBase() : m_isRValue(false) { /* TODO check flags */ }
+
+    
+    template<typename OtherDerived>
+    Derived& operator=(const ReturnByValue<OtherDerived>& other)
+    {
+      other.evalTo(derived());
+      return derived();
+    }
+
+
+    template<typename OtherDerived>
+    inline Derived& operator=(const SparseMatrixBase<OtherDerived>& other)
+    {
+      return assign(other.derived());
+    }
+
+    inline Derived& operator=(const Derived& other)
+    {
+//       if (other.isRValue())
+//         derived().swap(other.const_cast_derived());
+//       else
+      return assign(other.derived());
+    }
+
+  protected:
+
+    template<typename OtherDerived>
+    inline Derived& assign(const OtherDerived& other)
+    {
+      const bool transpose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
+      const Index outerSize = (int(OtherDerived::Flags) & RowMajorBit) ? other.rows() : other.cols();
+      if ((!transpose) && other.isRValue())
+      {
+        // eval without temporary
+        derived().resize(other.rows(), other.cols());
+        derived().setZero();
+        derived().reserve((std::max)(this->rows(),this->cols())*2);
+        for (Index j=0; j<outerSize; ++j)
+        {
+          derived().startVec(j);
+          for (typename OtherDerived::InnerIterator it(other, j); it; ++it)
+          {
+            Scalar v = it.value();
+            derived().insertBackByOuterInner(j,it.index()) = v;
+          }
+        }
+        derived().finalize();
+      }
+      else
+      {
+        assignGeneric(other);
+      }
+      return derived();
+    }
+
+    template<typename OtherDerived>
+    inline void assignGeneric(const OtherDerived& other)
+    {
+      //const bool transpose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
+      eigen_assert(( ((internal::traits<Derived>::SupportedAccessPatterns&OuterRandomAccessPattern)==OuterRandomAccessPattern) ||
+                  (!((Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit)))) &&
+                  "the transpose operation is supposed to be handled in SparseMatrix::operator=");
+
+      enum { Flip = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit) };
+
+      const Index outerSize = other.outerSize();
+      //typedef typename internal::conditional<transpose, LinkedVectorMatrix<Scalar,Flags&RowMajorBit>, Derived>::type TempType;
+      // thanks to shallow copies, we always eval to a tempary
+      Derived temp(other.rows(), other.cols());
+
+      temp.reserve((std::max)(this->rows(),this->cols())*2);
+      for (Index j=0; j<outerSize; ++j)
+      {
+        temp.startVec(j);
+        for (typename OtherDerived::InnerIterator it(other.derived(), j); it; ++it)
+        {
+          Scalar v = it.value();
+          temp.insertBackByOuterInner(Flip?it.index():j,Flip?j:it.index()) = v;
+        }
+      }
+      temp.finalize();
+
+      derived() = temp.markAsRValue();
+    }
+
+  public:
+
+    template<typename Lhs, typename Rhs>
+    inline Derived& operator=(const SparseSparseProduct<Lhs,Rhs>& product);
+
+    friend std::ostream & operator << (std::ostream & s, const SparseMatrixBase& m)
+    {
+      typedef typename Derived::Nested Nested;
+      typedef typename internal::remove_all<Nested>::type NestedCleaned;
+
+      if (Flags&RowMajorBit)
+      {
+        const Nested nm(m.derived());
+        for (Index row=0; row<nm.outerSize(); ++row)
+        {
+          Index col = 0;
+          for (typename NestedCleaned::InnerIterator it(nm.derived(), row); it; ++it)
+          {
+            for ( ; col<it.index(); ++col)
+              s << "0 ";
+            s << it.value() << " ";
+            ++col;
+          }
+          for ( ; col<m.cols(); ++col)
+            s << "0 ";
+          s << std::endl;
+        }
+      }
+      else
+      {
+        const Nested nm(m.derived());
+        if (m.cols() == 1) {
+          Index row = 0;
+          for (typename NestedCleaned::InnerIterator it(nm.derived(), 0); it; ++it)
+          {
+            for ( ; row<it.index(); ++row)
+              s << "0" << std::endl;
+            s << it.value() << std::endl;
+            ++row;
+          }
+          for ( ; row<m.rows(); ++row)
+            s << "0" << std::endl;
+        }
+        else
+        {
+          SparseMatrix<Scalar, RowMajorBit, Index> trans = m;
+          s << static_cast<const SparseMatrixBase<SparseMatrix<Scalar, RowMajorBit, Index> >&>(trans);
+        }
+      }
+      return s;
+    }
+
+    template<typename OtherDerived>
+    Derived& operator+=(const SparseMatrixBase<OtherDerived>& other);
+    template<typename OtherDerived>
+    Derived& operator-=(const SparseMatrixBase<OtherDerived>& other);
+
+    Derived& operator*=(const Scalar& other);
+    Derived& operator/=(const Scalar& other);
+
+    #define EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE \
+      CwiseBinaryOp< \
+        internal::scalar_product_op< \
+          typename internal::scalar_product_traits< \
+            typename internal::traits<Derived>::Scalar, \
+            typename internal::traits<OtherDerived>::Scalar \
+          >::ReturnType \
+        >, \
+        const Derived, \
+        const OtherDerived \
+      >
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE const EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE
+    cwiseProduct(const MatrixBase<OtherDerived> &other) const;
+
+    // sparse * sparse
+    template<typename OtherDerived>
+    const typename SparseSparseProductReturnType<Derived,OtherDerived>::Type
+    operator*(const SparseMatrixBase<OtherDerived> &other) const;
+
+    // sparse * diagonal
+    template<typename OtherDerived>
+    const SparseDiagonalProduct<Derived,OtherDerived>
+    operator*(const DiagonalBase<OtherDerived> &other) const;
+
+    // diagonal * sparse
+    template<typename OtherDerived> friend
+    const SparseDiagonalProduct<OtherDerived,Derived>
+    operator*(const DiagonalBase<OtherDerived> &lhs, const SparseMatrixBase& rhs)
+    { return SparseDiagonalProduct<OtherDerived,Derived>(lhs.derived(), rhs.derived()); }
+
+    /** dense * sparse (return a dense object unless it is an outer product) */
+    template<typename OtherDerived> friend
+    const typename DenseSparseProductReturnType<OtherDerived,Derived>::Type
+    operator*(const MatrixBase<OtherDerived>& lhs, const Derived& rhs)
+    { return typename DenseSparseProductReturnType<OtherDerived,Derived>::Type(lhs.derived(),rhs); }
+
+    /** sparse * dense (returns a dense object unless it is an outer product) */
+    template<typename OtherDerived>
+    const typename SparseDenseProductReturnType<Derived,OtherDerived>::Type
+    operator*(const MatrixBase<OtherDerived> &other) const
+    { return typename SparseDenseProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived()); }
+    
+     /** \returns an expression of P H P^-1 where H is the matrix represented by \c *this */
+    SparseSymmetricPermutationProduct<Derived,Upper|Lower> twistedBy(const PermutationMatrix<Dynamic,Dynamic,Index>& perm) const
+    {
+      return SparseSymmetricPermutationProduct<Derived,Upper|Lower>(derived(), perm);
+    }
+
+    template<typename OtherDerived>
+    Derived& operator*=(const SparseMatrixBase<OtherDerived>& other);
+
+    #ifdef EIGEN2_SUPPORT
+    // deprecated
+    template<typename OtherDerived>
+    typename internal::plain_matrix_type_column_major<OtherDerived>::type
+    solveTriangular(const MatrixBase<OtherDerived>& other) const;
+
+    // deprecated
+    template<typename OtherDerived>
+    void solveTriangularInPlace(MatrixBase<OtherDerived>& other) const;
+    #endif // EIGEN2_SUPPORT
+
+    template<int Mode>
+    inline const SparseTriangularView<Derived, Mode> triangularView() const;
+
+    template<unsigned int UpLo> inline const SparseSelfAdjointView<Derived, UpLo> selfadjointView() const;
+    template<unsigned int UpLo> inline SparseSelfAdjointView<Derived, UpLo> selfadjointView();
+
+    template<typename OtherDerived> Scalar dot(const MatrixBase<OtherDerived>& other) const;
+    template<typename OtherDerived> Scalar dot(const SparseMatrixBase<OtherDerived>& other) const;
+    RealScalar squaredNorm() const;
+    RealScalar norm()  const;
+    RealScalar blueNorm() const;
+
+    Transpose<Derived> transpose() { return derived(); }
+    const Transpose<const Derived> transpose() const { return derived(); }
+    const AdjointReturnType adjoint() const { return transpose(); }
+
+    // inner-vector
+    typedef Block<Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true>       InnerVectorReturnType;
+    typedef Block<const Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> ConstInnerVectorReturnType;
+    InnerVectorReturnType innerVector(Index outer);
+    const ConstInnerVectorReturnType innerVector(Index outer) const;
+
+    // set of inner-vectors
+    Block<Derived,Dynamic,Dynamic,true> innerVectors(Index outerStart, Index outerSize);
+    const Block<const Derived,Dynamic,Dynamic,true> innerVectors(Index outerStart, Index outerSize) const;
+
+    /** \internal use operator= */
+    template<typename DenseDerived>
+    void evalTo(MatrixBase<DenseDerived>& dst) const
+    {
+      dst.setZero();
+      for (Index j=0; j<outerSize(); ++j)
+        for (typename Derived::InnerIterator i(derived(),j); i; ++i)
+          dst.coeffRef(i.row(),i.col()) = i.value();
+    }
+
+    Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> toDense() const
+    {
+      return derived();
+    }
+
+    template<typename OtherDerived>
+    bool isApprox(const SparseMatrixBase<OtherDerived>& other,
+                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
+    { return toDense().isApprox(other.toDense(),prec); }
+
+    template<typename OtherDerived>
+    bool isApprox(const MatrixBase<OtherDerived>& other,
+                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
+    { return toDense().isApprox(other,prec); }
+
+    /** \returns the matrix or vector obtained by evaluating this expression.
+      *
+      * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
+      * a const reference, in order to avoid a useless copy.
+      */
+    inline const typename internal::eval<Derived>::type eval() const
+    { return typename internal::eval<Derived>::type(derived()); }
+
+    Scalar sum() const;
+
+  protected:
+
+    bool m_isRValue;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSEMATRIXBASE_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparsePermutation.h b/include/eigen3/Eigen/src/SparseCore/SparsePermutation.h
new file mode 100644
index 0000000..75e2100
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparsePermutation.h
@@ -0,0 +1,148 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_PERMUTATION_H
+#define EIGEN_SPARSE_PERMUTATION_H
+
+// This file implements sparse * permutation products
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
+struct traits<permut_sparsematrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
+{
+  typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
+  typedef typename MatrixTypeNestedCleaned::Scalar Scalar;
+  typedef typename MatrixTypeNestedCleaned::Index Index;
+  enum {
+    SrcStorageOrder = MatrixTypeNestedCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
+    MoveOuter = SrcStorageOrder==RowMajor ? Side==OnTheLeft : Side==OnTheRight
+  };
+
+  typedef typename internal::conditional<MoveOuter,
+        SparseMatrix<Scalar,SrcStorageOrder,Index>,
+        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,Index> >::type ReturnType;
+};
+
+template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
+struct permut_sparsematrix_product_retval
+ : public ReturnByValue<permut_sparsematrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
+{
+    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
+    typedef typename MatrixTypeNestedCleaned::Scalar Scalar;
+    typedef typename MatrixTypeNestedCleaned::Index Index;
+
+    enum {
+      SrcStorageOrder = MatrixTypeNestedCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
+      MoveOuter = SrcStorageOrder==RowMajor ? Side==OnTheLeft : Side==OnTheRight
+    };
+
+    permut_sparsematrix_product_retval(const PermutationType& perm, const MatrixType& matrix)
+      : m_permutation(perm), m_matrix(matrix)
+    {}
+
+    inline int rows() const { return m_matrix.rows(); }
+    inline int cols() const { return m_matrix.cols(); }
+
+    template<typename Dest> inline void evalTo(Dest& dst) const
+    {
+      if(MoveOuter)
+      {
+        SparseMatrix<Scalar,SrcStorageOrder,Index> tmp(m_matrix.rows(), m_matrix.cols());
+        Matrix<Index,Dynamic,1> sizes(m_matrix.outerSize());
+        for(Index j=0; j<m_matrix.outerSize(); ++j)
+        {
+          Index jp = m_permutation.indices().coeff(j);
+          sizes[((Side==OnTheLeft) ^ Transposed) ? jp : j] = m_matrix.innerVector(((Side==OnTheRight) ^ Transposed) ? jp : j).nonZeros();
+        }
+        tmp.reserve(sizes);
+        for(Index j=0; j<m_matrix.outerSize(); ++j)
+        {
+          Index jp = m_permutation.indices().coeff(j);
+          Index jsrc = ((Side==OnTheRight) ^ Transposed) ? jp : j;
+          Index jdst = ((Side==OnTheLeft) ^ Transposed) ? jp : j;
+          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,jsrc); it; ++it)
+            tmp.insertByOuterInner(jdst,it.index()) = it.value();
+        }
+        dst = tmp;
+      }
+      else
+      {
+        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,Index> tmp(m_matrix.rows(), m_matrix.cols());
+        Matrix<Index,Dynamic,1> sizes(tmp.outerSize());
+        sizes.setZero();
+        PermutationMatrix<Dynamic,Dynamic,Index> perm;
+        if((Side==OnTheLeft) ^ Transposed)
+          perm = m_permutation;
+        else
+          perm = m_permutation.transpose();
+
+        for(Index j=0; j<m_matrix.outerSize(); ++j)
+          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,j); it; ++it)
+            sizes[perm.indices().coeff(it.index())]++;
+        tmp.reserve(sizes);
+        for(Index j=0; j<m_matrix.outerSize(); ++j)
+          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,j); it; ++it)
+            tmp.insertByOuterInner(perm.indices().coeff(it.index()),j) = it.value();
+        dst = tmp;
+      }
+    }
+
+  protected:
+    const PermutationType& m_permutation;
+    typename MatrixType::Nested m_matrix;
+};
+
+}
+
+
+
+/** \returns the matrix with the permutation applied to the columns
+  */
+template<typename SparseDerived, typename PermDerived>
+inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, false>
+operator*(const SparseMatrixBase<SparseDerived>& matrix, const PermutationBase<PermDerived>& perm)
+{
+  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, false>(perm, matrix.derived());
+}
+
+/** \returns the matrix with the permutation applied to the rows
+  */
+template<typename SparseDerived, typename PermDerived>
+inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, false>
+operator*( const PermutationBase<PermDerived>& perm, const SparseMatrixBase<SparseDerived>& matrix)
+{
+  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, false>(perm, matrix.derived());
+}
+
+
+
+/** \returns the matrix with the inverse permutation applied to the columns.
+  */
+template<typename SparseDerived, typename PermDerived>
+inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, true>
+operator*(const SparseMatrixBase<SparseDerived>& matrix, const Transpose<PermutationBase<PermDerived> >& tperm)
+{
+  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, true>(tperm.nestedPermutation(), matrix.derived());
+}
+
+/** \returns the matrix with the inverse permutation applied to the rows.
+  */
+template<typename SparseDerived, typename PermDerived>
+inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, true>
+operator*(const Transpose<PermutationBase<PermDerived> >& tperm, const SparseMatrixBase<SparseDerived>& matrix)
+{
+  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, true>(tperm.nestedPermutation(), matrix.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_SELFADJOINTVIEW_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseProduct.h b/include/eigen3/Eigen/src/SparseCore/SparseProduct.h
new file mode 100644
index 0000000..cf76630
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseProduct.h
@@ -0,0 +1,188 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEPRODUCT_H
+#define EIGEN_SPARSEPRODUCT_H
+
+namespace Eigen { 
+
+template<typename Lhs, typename Rhs>
+struct SparseSparseProductReturnType
+{
+  typedef typename internal::traits<Lhs>::Scalar Scalar;
+  typedef typename internal::traits<Lhs>::Index Index;
+  enum {
+    LhsRowMajor = internal::traits<Lhs>::Flags & RowMajorBit,
+    RhsRowMajor = internal::traits<Rhs>::Flags & RowMajorBit,
+    TransposeRhs = (!LhsRowMajor) && RhsRowMajor,
+    TransposeLhs = LhsRowMajor && (!RhsRowMajor)
+  };
+
+  typedef typename internal::conditional<TransposeLhs,
+    SparseMatrix<Scalar,0,Index>,
+    typename internal::nested<Lhs,Rhs::RowsAtCompileTime>::type>::type LhsNested;
+
+  typedef typename internal::conditional<TransposeRhs,
+    SparseMatrix<Scalar,0,Index>,
+    typename internal::nested<Rhs,Lhs::RowsAtCompileTime>::type>::type RhsNested;
+
+  typedef SparseSparseProduct<LhsNested, RhsNested> Type;
+};
+
+namespace internal {
+template<typename LhsNested, typename RhsNested>
+struct traits<SparseSparseProduct<LhsNested, RhsNested> >
+{
+  typedef MatrixXpr XprKind;
+  // clean the nested types:
+  typedef typename remove_all<LhsNested>::type _LhsNested;
+  typedef typename remove_all<RhsNested>::type _RhsNested;
+  typedef typename _LhsNested::Scalar Scalar;
+  typedef typename promote_index_type<typename traits<_LhsNested>::Index,
+                                         typename traits<_RhsNested>::Index>::type Index;
+
+  enum {
+    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
+    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
+    LhsFlags = _LhsNested::Flags,
+    RhsFlags = _RhsNested::Flags,
+
+    RowsAtCompileTime    = _LhsNested::RowsAtCompileTime,
+    ColsAtCompileTime    = _RhsNested::ColsAtCompileTime,
+    MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
+
+    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
+
+    EvalToRowMajor = (RhsFlags & LhsFlags & RowMajorBit),
+
+    RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),
+
+    Flags = (int(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
+          | EvalBeforeAssigningBit
+          | EvalBeforeNestingBit,
+
+    CoeffReadCost = Dynamic
+  };
+
+  typedef Sparse StorageKind;
+};
+
+} // end namespace internal
+
+template<typename LhsNested, typename RhsNested>
+class SparseSparseProduct : internal::no_assignment_operator,
+  public SparseMatrixBase<SparseSparseProduct<LhsNested, RhsNested> >
+{
+  public:
+
+    typedef SparseMatrixBase<SparseSparseProduct> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(SparseSparseProduct)
+
+  private:
+
+    typedef typename internal::traits<SparseSparseProduct>::_LhsNested _LhsNested;
+    typedef typename internal::traits<SparseSparseProduct>::_RhsNested _RhsNested;
+
+  public:
+
+    template<typename Lhs, typename Rhs>
+    EIGEN_STRONG_INLINE SparseSparseProduct(const Lhs& lhs, const Rhs& rhs)
+      : m_lhs(lhs), m_rhs(rhs), m_tolerance(0), m_conservative(true)
+    {
+      init();
+    }
+
+    template<typename Lhs, typename Rhs>
+    EIGEN_STRONG_INLINE SparseSparseProduct(const Lhs& lhs, const Rhs& rhs, const RealScalar& tolerance)
+      : m_lhs(lhs), m_rhs(rhs), m_tolerance(tolerance), m_conservative(false)
+    {
+      init();
+    }
+
+    SparseSparseProduct pruned(const Scalar& reference = 0, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision()) const
+    {
+      using std::abs;
+      return SparseSparseProduct(m_lhs,m_rhs,abs(reference)*epsilon);
+    }
+
+    template<typename Dest>
+    void evalTo(Dest& result) const
+    {
+      if(m_conservative)
+        internal::conservative_sparse_sparse_product_selector<_LhsNested, _RhsNested, Dest>::run(lhs(),rhs(),result);
+      else
+        internal::sparse_sparse_product_with_pruning_selector<_LhsNested, _RhsNested, Dest>::run(lhs(),rhs(),result,m_tolerance);
+    }
+
+    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
+
+    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
+    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
+
+  protected:
+    void init()
+    {
+      eigen_assert(m_lhs.cols() == m_rhs.rows());
+
+      enum {
+        ProductIsValid = _LhsNested::ColsAtCompileTime==Dynamic
+                      || _RhsNested::RowsAtCompileTime==Dynamic
+                      || int(_LhsNested::ColsAtCompileTime)==int(_RhsNested::RowsAtCompileTime),
+        AreVectors = _LhsNested::IsVectorAtCompileTime && _RhsNested::IsVectorAtCompileTime,
+        SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(_LhsNested,_RhsNested)
+      };
+      // note to the lost user:
+      //    * for a dot product use: v1.dot(v2)
+      //    * for a coeff-wise product use: v1.cwise()*v2
+      EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
+        INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
+      EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
+        INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
+      EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
+    }
+
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+    RealScalar m_tolerance;
+    bool m_conservative;
+};
+
+// sparse = sparse * sparse
+template<typename Derived>
+template<typename Lhs, typename Rhs>
+inline Derived& SparseMatrixBase<Derived>::operator=(const SparseSparseProduct<Lhs,Rhs>& product)
+{
+  product.evalTo(derived());
+  return derived();
+}
+
+/** \returns an expression of the product of two sparse matrices.
+  * By default a conservative product preserving the symbolic non zeros is performed.
+  * The automatic pruning of the small values can be achieved by calling the pruned() function
+  * in which case a totally different product algorithm is employed:
+  * \code
+  * C = (A*B).pruned();             // supress numerical zeros (exact)
+  * C = (A*B).pruned(ref);
+  * C = (A*B).pruned(ref,epsilon);
+  * \endcode
+  * where \c ref is a meaningful non zero reference value.
+  * */
+template<typename Derived>
+template<typename OtherDerived>
+inline const typename SparseSparseProductReturnType<Derived,OtherDerived>::Type
+SparseMatrixBase<Derived>::operator*(const SparseMatrixBase<OtherDerived> &other) const
+{
+  return typename SparseSparseProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSEPRODUCT_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseRedux.h b/include/eigen3/Eigen/src/SparseCore/SparseRedux.h
new file mode 100644
index 0000000..f3da93a
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseRedux.h
@@ -0,0 +1,45 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEREDUX_H
+#define EIGEN_SPARSEREDUX_H
+
+namespace Eigen { 
+
+template<typename Derived>
+typename internal::traits<Derived>::Scalar
+SparseMatrixBase<Derived>::sum() const
+{
+  eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix");
+  Scalar res(0);
+  for (Index j=0; j<outerSize(); ++j)
+    for (typename Derived::InnerIterator iter(derived(),j); iter; ++iter)
+      res += iter.value();
+  return res;
+}
+
+template<typename _Scalar, int _Options, typename _Index>
+typename internal::traits<SparseMatrix<_Scalar,_Options,_Index> >::Scalar
+SparseMatrix<_Scalar,_Options,_Index>::sum() const
+{
+  eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix");
+  return Matrix<Scalar,1,Dynamic>::Map(&m_data.value(0), m_data.size()).sum();
+}
+
+template<typename _Scalar, int _Options, typename _Index>
+typename internal::traits<SparseVector<_Scalar,_Options, _Index> >::Scalar
+SparseVector<_Scalar,_Options,_Index>::sum() const
+{
+  eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix");
+  return Matrix<Scalar,1,Dynamic>::Map(&m_data.value(0), m_data.size()).sum();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSEREDUX_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseSelfAdjointView.h b/include/eigen3/Eigen/src/SparseCore/SparseSelfAdjointView.h
new file mode 100644
index 0000000..0eda96b
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -0,0 +1,507 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_SELFADJOINTVIEW_H
+#define EIGEN_SPARSE_SELFADJOINTVIEW_H
+
+namespace Eigen { 
+
+/** \ingroup SparseCore_Module
+  * \class SparseSelfAdjointView
+  *
+  * \brief Pseudo expression to manipulate a triangular sparse matrix as a selfadjoint matrix.
+  *
+  * \param MatrixType the type of the dense matrix storing the coefficients
+  * \param UpLo can be either \c #Lower or \c #Upper
+  *
+  * This class is an expression of a sefladjoint matrix from a triangular part of a matrix
+  * with given dense storage of the coefficients. It is the return type of MatrixBase::selfadjointView()
+  * and most of the time this is the only way that it is used.
+  *
+  * \sa SparseMatrixBase::selfadjointView()
+  */
+template<typename Lhs, typename Rhs, int UpLo>
+class SparseSelfAdjointTimeDenseProduct;
+
+template<typename Lhs, typename Rhs, int UpLo>
+class DenseTimeSparseSelfAdjointProduct;
+
+namespace internal {
+  
+template<typename MatrixType, unsigned int UpLo>
+struct traits<SparseSelfAdjointView<MatrixType,UpLo> > : traits<MatrixType> {
+};
+
+template<int SrcUpLo,int DstUpLo,typename MatrixType,int DestOrder>
+void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm = 0);
+
+template<int UpLo,typename MatrixType,int DestOrder>
+void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm = 0);
+
+}
+
+template<typename MatrixType, unsigned int UpLo> class SparseSelfAdjointView
+  : public EigenBase<SparseSelfAdjointView<MatrixType,UpLo> >
+{
+  public:
+
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::Index Index;
+    typedef Matrix<Index,Dynamic,1> VectorI;
+    typedef typename MatrixType::Nested MatrixTypeNested;
+    typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+
+    inline SparseSelfAdjointView(const MatrixType& matrix) : m_matrix(matrix)
+    {
+      eigen_assert(rows()==cols() && "SelfAdjointView is only for squared matrices");
+    }
+
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }
+
+    /** \internal \returns a reference to the nested matrix */
+    const _MatrixTypeNested& matrix() const { return m_matrix; }
+    _MatrixTypeNested& matrix() { return m_matrix.const_cast_derived(); }
+
+    /** \returns an expression of the matrix product between a sparse self-adjoint matrix \c *this and a sparse matrix \a rhs.
+      *
+      * Note that there is no algorithmic advantage of performing such a product compared to a general sparse-sparse matrix product.
+      * Indeed, the SparseSelfadjointView operand is first copied into a temporary SparseMatrix before computing the product.
+      */
+    template<typename OtherDerived>
+    SparseSparseProduct<typename OtherDerived::PlainObject, OtherDerived>
+    operator*(const SparseMatrixBase<OtherDerived>& rhs) const
+    {
+      return SparseSparseProduct<typename OtherDerived::PlainObject, OtherDerived>(*this, rhs.derived());
+    }
+
+    /** \returns an expression of the matrix product between a sparse matrix \a lhs and a sparse self-adjoint matrix \a rhs.
+      *
+      * Note that there is no algorithmic advantage of performing such a product compared to a general sparse-sparse matrix product.
+      * Indeed, the SparseSelfadjointView operand is first copied into a temporary SparseMatrix before computing the product.
+      */
+    template<typename OtherDerived> friend
+    SparseSparseProduct<OtherDerived, typename OtherDerived::PlainObject >
+    operator*(const SparseMatrixBase<OtherDerived>& lhs, const SparseSelfAdjointView& rhs)
+    {
+      return SparseSparseProduct<OtherDerived, typename OtherDerived::PlainObject>(lhs.derived(), rhs);
+    }
+    
+    /** Efficient sparse self-adjoint matrix times dense vector/matrix product */
+    template<typename OtherDerived>
+    SparseSelfAdjointTimeDenseProduct<MatrixType,OtherDerived,UpLo>
+    operator*(const MatrixBase<OtherDerived>& rhs) const
+    {
+      return SparseSelfAdjointTimeDenseProduct<MatrixType,OtherDerived,UpLo>(m_matrix, rhs.derived());
+    }
+
+    /** Efficient dense vector/matrix times sparse self-adjoint matrix product */
+    template<typename OtherDerived> friend
+    DenseTimeSparseSelfAdjointProduct<OtherDerived,MatrixType,UpLo>
+    operator*(const MatrixBase<OtherDerived>& lhs, const SparseSelfAdjointView& rhs)
+    {
+      return DenseTimeSparseSelfAdjointProduct<OtherDerived,_MatrixTypeNested,UpLo>(lhs.derived(), rhs.m_matrix);
+    }
+
+    /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
+      * \f$ this = this + \alpha ( u u^* ) \f$ where \a u is a vector or matrix.
+      *
+      * \returns a reference to \c *this
+      *
+      * To perform \f$ this = this + \alpha ( u^* u ) \f$ you can simply
+      * call this function with u.adjoint().
+      */
+    template<typename DerivedU>
+    SparseSelfAdjointView& rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
+    
+    /** \internal triggered by sparse_matrix = SparseSelfadjointView; */
+    template<typename DestScalar,int StorageOrder> void evalTo(SparseMatrix<DestScalar,StorageOrder,Index>& _dest) const
+    {
+      internal::permute_symm_to_fullsymm<UpLo>(m_matrix, _dest);
+    }
+    
+    template<typename DestScalar> void evalTo(DynamicSparseMatrix<DestScalar,ColMajor,Index>& _dest) const
+    {
+      // TODO directly evaluate into _dest;
+      SparseMatrix<DestScalar,ColMajor,Index> tmp(_dest.rows(),_dest.cols());
+      internal::permute_symm_to_fullsymm<UpLo>(m_matrix, tmp);
+      _dest = tmp;
+    }
+    
+    /** \returns an expression of P H P^-1 */
+    SparseSymmetricPermutationProduct<_MatrixTypeNested,UpLo> twistedBy(const PermutationMatrix<Dynamic,Dynamic,Index>& perm) const
+    {
+      return SparseSymmetricPermutationProduct<_MatrixTypeNested,UpLo>(m_matrix, perm);
+    }
+    
+    template<typename SrcMatrixType,int SrcUpLo>
+    SparseSelfAdjointView& operator=(const SparseSymmetricPermutationProduct<SrcMatrixType,SrcUpLo>& permutedMatrix)
+    {
+      permutedMatrix.evalTo(*this);
+      return *this;
+    }
+
+
+    SparseSelfAdjointView& operator=(const SparseSelfAdjointView& src)
+    {
+      PermutationMatrix<Dynamic> pnull;
+      return *this = src.twistedBy(pnull);
+    }
+
+    template<typename SrcMatrixType,unsigned int SrcUpLo>
+    SparseSelfAdjointView& operator=(const SparseSelfAdjointView<SrcMatrixType,SrcUpLo>& src)
+    {
+      PermutationMatrix<Dynamic> pnull;
+      return *this = src.twistedBy(pnull);
+    }
+    
+
+    // const SparseLLT<PlainObject, UpLo> llt() const;
+    // const SparseLDLT<PlainObject, UpLo> ldlt() const;
+
+  protected:
+
+    typename MatrixType::Nested m_matrix;
+    mutable VectorI m_countPerRow;
+    mutable VectorI m_countPerCol;
+};
+
+/***************************************************************************
+* Implementation of SparseMatrixBase methods
+***************************************************************************/
+
+template<typename Derived>
+template<unsigned int UpLo>
+const SparseSelfAdjointView<Derived, UpLo> SparseMatrixBase<Derived>::selfadjointView() const
+{
+  return derived();
+}
+
+template<typename Derived>
+template<unsigned int UpLo>
+SparseSelfAdjointView<Derived, UpLo> SparseMatrixBase<Derived>::selfadjointView()
+{
+  return derived();
+}
+
+/***************************************************************************
+* Implementation of SparseSelfAdjointView methods
+***************************************************************************/
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename DerivedU>
+SparseSelfAdjointView<MatrixType,UpLo>&
+SparseSelfAdjointView<MatrixType,UpLo>::rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha)
+{
+  SparseMatrix<Scalar,MatrixType::Flags&RowMajorBit?RowMajor:ColMajor> tmp = u * u.adjoint();
+  if(alpha==Scalar(0))
+    m_matrix.const_cast_derived() = tmp.template triangularView<UpLo>();
+  else
+    m_matrix.const_cast_derived() += alpha * tmp.template triangularView<UpLo>();
+
+  return *this;
+}
+
+/***************************************************************************
+* Implementation of sparse self-adjoint time dense matrix
+***************************************************************************/
+
+namespace internal {
+template<typename Lhs, typename Rhs, int UpLo>
+struct traits<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo> >
+ : traits<ProductBase<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo>, Lhs, Rhs> >
+{
+  typedef Dense StorageKind;
+};
+}
+
+template<typename Lhs, typename Rhs, int UpLo>
+class SparseSelfAdjointTimeDenseProduct
+  : public ProductBase<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo>, Lhs, Rhs>
+{
+  public:
+    EIGEN_PRODUCT_PUBLIC_INTERFACE(SparseSelfAdjointTimeDenseProduct)
+
+    SparseSelfAdjointTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+    {}
+
+    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+    {
+      EIGEN_ONLY_USED_FOR_DEBUG(alpha);
+      // TODO use alpha
+      eigen_assert(alpha==Scalar(1) && "alpha != 1 is not implemented yet, sorry");
+      typedef typename internal::remove_all<Lhs>::type _Lhs;
+      typedef typename _Lhs::InnerIterator LhsInnerIterator;
+      enum {
+        LhsIsRowMajor = (_Lhs::Flags&RowMajorBit)==RowMajorBit,
+        ProcessFirstHalf =
+                 ((UpLo&(Upper|Lower))==(Upper|Lower))
+              || ( (UpLo&Upper) && !LhsIsRowMajor)
+              || ( (UpLo&Lower) && LhsIsRowMajor),
+        ProcessSecondHalf = !ProcessFirstHalf
+      };
+      for (Index j=0; j<m_lhs.outerSize(); ++j)
+      {
+        LhsInnerIterator i(m_lhs,j);
+        if (ProcessSecondHalf)
+        {
+          while (i && i.index()<j) ++i;
+          if(i && i.index()==j)
+          {
+            dest.row(j) += i.value() * m_rhs.row(j);
+            ++i;
+          }
+        }
+        for(; (ProcessFirstHalf ? i && i.index() < j : i) ; ++i)
+        {
+          Index a = LhsIsRowMajor ? j : i.index();
+          Index b = LhsIsRowMajor ? i.index() : j;
+          typename Lhs::Scalar v = i.value();
+          dest.row(a) += (v) * m_rhs.row(b);
+          dest.row(b) += numext::conj(v) * m_rhs.row(a);
+        }
+        if (ProcessFirstHalf && i && (i.index()==j))
+          dest.row(j) += i.value() * m_rhs.row(j);
+      }
+    }
+
+  private:
+    SparseSelfAdjointTimeDenseProduct& operator=(const SparseSelfAdjointTimeDenseProduct&);
+};
+
+namespace internal {
+template<typename Lhs, typename Rhs, int UpLo>
+struct traits<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo> >
+ : traits<ProductBase<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo>, Lhs, Rhs> >
+{};
+}
+
+template<typename Lhs, typename Rhs, int UpLo>
+class DenseTimeSparseSelfAdjointProduct
+  : public ProductBase<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo>, Lhs, Rhs>
+{
+  public:
+    EIGEN_PRODUCT_PUBLIC_INTERFACE(DenseTimeSparseSelfAdjointProduct)
+
+    DenseTimeSparseSelfAdjointProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+    {}
+
+    template<typename Dest> void scaleAndAddTo(Dest& /*dest*/, const Scalar& /*alpha*/) const
+    {
+      // TODO
+    }
+
+  private:
+    DenseTimeSparseSelfAdjointProduct& operator=(const DenseTimeSparseSelfAdjointProduct&);
+};
+
+/***************************************************************************
+* Implementation of symmetric copies and permutations
+***************************************************************************/
+namespace internal {
+  
+template<typename MatrixType, int UpLo>
+struct traits<SparseSymmetricPermutationProduct<MatrixType,UpLo> > : traits<MatrixType> {
+};
+
+template<int UpLo,typename MatrixType,int DestOrder>
+void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm)
+{
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef SparseMatrix<Scalar,DestOrder,Index> Dest;
+  typedef Matrix<Index,Dynamic,1> VectorI;
+  
+  Dest& dest(_dest.derived());
+  enum {
+    StorageOrderMatch = int(Dest::IsRowMajor) == int(MatrixType::IsRowMajor)
+  };
+  
+  Index size = mat.rows();
+  VectorI count;
+  count.resize(size);
+  count.setZero();
+  dest.resize(size,size);
+  for(Index j = 0; j<size; ++j)
+  {
+    Index jp = perm ? perm[j] : j;
+    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
+    {
+      Index i = it.index();
+      Index r = it.row();
+      Index c = it.col();
+      Index ip = perm ? perm[i] : i;
+      if(UpLo==(Upper|Lower))
+        count[StorageOrderMatch ? jp : ip]++;
+      else if(r==c)
+        count[ip]++;
+      else if(( UpLo==Lower && r>c) || ( UpLo==Upper && r<c))
+      {
+        count[ip]++;
+        count[jp]++;
+      }
+    }
+  }
+  Index nnz = count.sum();
+  
+  // reserve space
+  dest.resizeNonZeros(nnz);
+  dest.outerIndexPtr()[0] = 0;
+  for(Index j=0; j<size; ++j)
+    dest.outerIndexPtr()[j+1] = dest.outerIndexPtr()[j] + count[j];
+  for(Index j=0; j<size; ++j)
+    count[j] = dest.outerIndexPtr()[j];
+  
+  // copy data
+  for(Index j = 0; j<size; ++j)
+  {
+    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
+    {
+      Index i = it.index();
+      Index r = it.row();
+      Index c = it.col();
+      
+      Index jp = perm ? perm[j] : j;
+      Index ip = perm ? perm[i] : i;
+      
+      if(UpLo==(Upper|Lower))
+      {
+        Index k = count[StorageOrderMatch ? jp : ip]++;
+        dest.innerIndexPtr()[k] = StorageOrderMatch ? ip : jp;
+        dest.valuePtr()[k] = it.value();
+      }
+      else if(r==c)
+      {
+        Index k = count[ip]++;
+        dest.innerIndexPtr()[k] = ip;
+        dest.valuePtr()[k] = it.value();
+      }
+      else if(( (UpLo&Lower)==Lower && r>c) || ( (UpLo&Upper)==Upper && r<c))
+      {
+        if(!StorageOrderMatch)
+          std::swap(ip,jp);
+        Index k = count[jp]++;
+        dest.innerIndexPtr()[k] = ip;
+        dest.valuePtr()[k] = it.value();
+        k = count[ip]++;
+        dest.innerIndexPtr()[k] = jp;
+        dest.valuePtr()[k] = numext::conj(it.value());
+      }
+    }
+  }
+}
+
+template<int _SrcUpLo,int _DstUpLo,typename MatrixType,int DstOrder>
+void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DstOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm)
+{
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::Scalar Scalar;
+  SparseMatrix<Scalar,DstOrder,Index>& dest(_dest.derived());
+  typedef Matrix<Index,Dynamic,1> VectorI;
+  enum {
+    SrcOrder = MatrixType::IsRowMajor ? RowMajor : ColMajor,
+    StorageOrderMatch = int(SrcOrder) == int(DstOrder),
+    DstUpLo = DstOrder==RowMajor ? (_DstUpLo==Upper ? Lower : Upper) : _DstUpLo,
+    SrcUpLo = SrcOrder==RowMajor ? (_SrcUpLo==Upper ? Lower : Upper) : _SrcUpLo
+  };
+  
+  Index size = mat.rows();
+  VectorI count(size);
+  count.setZero();
+  dest.resize(size,size);
+  for(Index j = 0; j<size; ++j)
+  {
+    Index jp = perm ? perm[j] : j;
+    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
+    {
+      Index i = it.index();
+      if((int(SrcUpLo)==int(Lower) && i<j) || (int(SrcUpLo)==int(Upper) && i>j))
+        continue;
+                  
+      Index ip = perm ? perm[i] : i;
+      count[int(DstUpLo)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
+    }
+  }
+  dest.outerIndexPtr()[0] = 0;
+  for(Index j=0; j<size; ++j)
+    dest.outerIndexPtr()[j+1] = dest.outerIndexPtr()[j] + count[j];
+  dest.resizeNonZeros(dest.outerIndexPtr()[size]);
+  for(Index j=0; j<size; ++j)
+    count[j] = dest.outerIndexPtr()[j];
+  
+  for(Index j = 0; j<size; ++j)
+  {
+    
+    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
+    {
+      Index i = it.index();
+      if((int(SrcUpLo)==int(Lower) && i<j) || (int(SrcUpLo)==int(Upper) && i>j))
+        continue;
+                  
+      Index jp = perm ? perm[j] : j;
+      Index ip = perm? perm[i] : i;
+      
+      Index k = count[int(DstUpLo)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
+      dest.innerIndexPtr()[k] = int(DstUpLo)==int(Lower) ? (std::max)(ip,jp) : (std::min)(ip,jp);
+      
+      if(!StorageOrderMatch) std::swap(ip,jp);
+      if( ((int(DstUpLo)==int(Lower) && ip<jp) || (int(DstUpLo)==int(Upper) && ip>jp)))
+        dest.valuePtr()[k] = numext::conj(it.value());
+      else
+        dest.valuePtr()[k] = it.value();
+    }
+  }
+}
+
+}
+
+template<typename MatrixType,int UpLo>
+class SparseSymmetricPermutationProduct
+  : public EigenBase<SparseSymmetricPermutationProduct<MatrixType,UpLo> >
+{
+  public:
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::Index Index;
+  protected:
+    typedef PermutationMatrix<Dynamic,Dynamic,Index> Perm;
+  public:
+    typedef Matrix<Index,Dynamic,1> VectorI;
+    typedef typename MatrixType::Nested MatrixTypeNested;
+    typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+    
+    SparseSymmetricPermutationProduct(const MatrixType& mat, const Perm& perm)
+      : m_matrix(mat), m_perm(perm)
+    {}
+    
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }
+    
+    template<typename DestScalar, int Options, typename DstIndex>
+    void evalTo(SparseMatrix<DestScalar,Options,DstIndex>& _dest) const
+    {
+//       internal::permute_symm_to_fullsymm<UpLo>(m_matrix,_dest,m_perm.indices().data());
+      SparseMatrix<DestScalar,(Options&RowMajor)==RowMajor ? ColMajor : RowMajor, DstIndex> tmp;
+      internal::permute_symm_to_fullsymm<UpLo>(m_matrix,tmp,m_perm.indices().data());
+      _dest = tmp;
+    }
+    
+    template<typename DestType,unsigned int DestUpLo> void evalTo(SparseSelfAdjointView<DestType,DestUpLo>& dest) const
+    {
+      internal::permute_symm_to_symm<UpLo,DestUpLo>(m_matrix,dest.matrix(),m_perm.indices().data());
+    }
+    
+  protected:
+    MatrixTypeNested m_matrix;
+    const Perm& m_perm;
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_SELFADJOINTVIEW_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/include/eigen3/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
new file mode 100644
index 0000000..fcc18f5
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
@@ -0,0 +1,150 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSESPARSEPRODUCTWITHPRUNING_H
+#define EIGEN_SPARSESPARSEPRODUCTWITHPRUNING_H
+
+namespace Eigen { 
+
+namespace internal {
+
+
+// perform a pseudo in-place sparse * sparse product assuming all matrices are col major
+template<typename Lhs, typename Rhs, typename ResultType>
+static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, const typename ResultType::RealScalar& tolerance)
+{
+  // return sparse_sparse_product_with_pruning_impl2(lhs,rhs,res);
+
+  typedef typename remove_all<Lhs>::type::Scalar Scalar;
+  typedef typename remove_all<Lhs>::type::Index Index;
+
+  // make sure to call innerSize/outerSize since we fake the storage order.
+  Index rows = lhs.innerSize();
+  Index cols = rhs.outerSize();
+  //Index size = lhs.outerSize();
+  eigen_assert(lhs.outerSize() == rhs.innerSize());
+
+  // allocate a temporary buffer
+  AmbiVector<Scalar,Index> tempVector(rows);
+
+  // estimate the number of non zero entries
+  // given a rhs column containing Y non zeros, we assume that the respective Y columns
+  // of the lhs differs in average of one non zeros, thus the number of non zeros for
+  // the product of a rhs column with the lhs is X+Y where X is the average number of non zero
+  // per column of the lhs.
+  // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
+  Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros();
+
+  // mimics a resizeByInnerOuter:
+  if(ResultType::IsRowMajor)
+    res.resize(cols, rows);
+  else
+    res.resize(rows, cols);
+
+  res.reserve(estimated_nnz_prod);
+  double ratioColRes = double(estimated_nnz_prod)/double(lhs.rows()*rhs.cols());
+  for (Index j=0; j<cols; ++j)
+  {
+    // FIXME:
+    //double ratioColRes = (double(rhs.innerVector(j).nonZeros()) + double(lhs.nonZeros())/double(lhs.cols()))/double(lhs.rows());
+    // let's do a more accurate determination of the nnz ratio for the current column j of res
+    tempVector.init(ratioColRes);
+    tempVector.setZero();
+    for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
+    {
+      // FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index())
+      tempVector.restart();
+      Scalar x = rhsIt.value();
+      for (typename Lhs::InnerIterator lhsIt(lhs, rhsIt.index()); lhsIt; ++lhsIt)
+      {
+        tempVector.coeffRef(lhsIt.index()) += lhsIt.value() * x;
+      }
+    }
+    res.startVec(j);
+    for (typename AmbiVector<Scalar,Index>::Iterator it(tempVector,tolerance); it; ++it)
+      res.insertBackByOuterInner(j,it.index()) = it.value();
+  }
+  res.finalize();
+}
+
+template<typename Lhs, typename Rhs, typename ResultType,
+  int LhsStorageOrder = traits<Lhs>::Flags&RowMajorBit,
+  int RhsStorageOrder = traits<Rhs>::Flags&RowMajorBit,
+  int ResStorageOrder = traits<ResultType>::Flags&RowMajorBit>
+struct sparse_sparse_product_with_pruning_selector;
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,ColMajor>
+{
+  typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
+  typedef typename ResultType::RealScalar RealScalar;
+
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    typename remove_all<ResultType>::type _res(res.rows(), res.cols());
+    internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,ResultType>(lhs, rhs, _res, tolerance);
+    res.swap(_res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,RowMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    // we need a col-major matrix to hold the result
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> SparseTemporaryType;
+    SparseTemporaryType _res(res.rows(), res.cols());
+    internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,SparseTemporaryType>(lhs, rhs, _res, tolerance);
+    res = _res;
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor,RowMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    // let's transpose the product to get a column x column product
+    typename remove_all<ResultType>::type _res(res.rows(), res.cols());
+    internal::sparse_sparse_product_with_pruning_impl<Rhs,Lhs,ResultType>(rhs, lhs, _res, tolerance);
+    res.swap(_res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor,ColMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::Index> ColMajorMatrixLhs;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::Index> ColMajorMatrixRhs;
+    ColMajorMatrixLhs colLhs(lhs);
+    ColMajorMatrixRhs colRhs(rhs);
+    internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,ColMajorMatrixRhs,ResultType>(colLhs, colRhs, res, tolerance);
+
+    // let's transpose the product to get a column x column product
+//     typedef SparseMatrix<typename ResultType::Scalar> SparseTemporaryType;
+//     SparseTemporaryType _res(res.cols(), res.rows());
+//     sparse_sparse_product_with_pruning_impl<Rhs,Lhs,SparseTemporaryType>(rhs, lhs, _res);
+//     res = _res.transpose();
+  }
+};
+
+// NOTE the 2 others cases (col row *) must never occur since they are caught
+// by ProductReturnType which transforms it to (col col *) by evaluating rhs.
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSESPARSEPRODUCTWITHPRUNING_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseTranspose.h b/include/eigen3/Eigen/src/SparseCore/SparseTranspose.h
new file mode 100644
index 0000000..76d031d
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseTranspose.h
@@ -0,0 +1,63 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSETRANSPOSE_H
+#define EIGEN_SPARSETRANSPOSE_H
+
+namespace Eigen { 
+
+template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>
+  : public SparseMatrixBase<Transpose<MatrixType> >
+{
+    typedef typename internal::remove_all<typename MatrixType::Nested>::type _MatrixTypeNested;
+  public:
+
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Transpose<MatrixType> )
+
+    class InnerIterator;
+    class ReverseInnerIterator;
+
+    inline Index nonZeros() const { return derived().nestedExpression().nonZeros(); }
+};
+
+// NOTE: VC10 and VC11 trigger an ICE if don't put typename TransposeImpl<MatrixType,Sparse>:: in front of Index,
+// a typedef typename TransposeImpl<MatrixType,Sparse>::Index Index;
+// does not fix the issue.
+// An alternative is to define the nested class in the parent class itself.
+template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>::InnerIterator
+  : public _MatrixTypeNested::InnerIterator
+{
+    typedef typename _MatrixTypeNested::InnerIterator Base;
+    typedef typename TransposeImpl::Index Index;
+  public:
+
+    EIGEN_STRONG_INLINE InnerIterator(const TransposeImpl& trans, typename TransposeImpl<MatrixType,Sparse>::Index outer)
+      : Base(trans.derived().nestedExpression(), outer)
+    {}
+    typename TransposeImpl<MatrixType,Sparse>::Index row() const { return Base::col(); }
+    typename TransposeImpl<MatrixType,Sparse>::Index col() const { return Base::row(); }
+};
+
+template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>::ReverseInnerIterator
+  : public _MatrixTypeNested::ReverseInnerIterator
+{
+    typedef typename _MatrixTypeNested::ReverseInnerIterator Base;
+    typedef typename TransposeImpl::Index Index;
+  public:
+
+    EIGEN_STRONG_INLINE ReverseInnerIterator(const TransposeImpl& xpr, typename TransposeImpl<MatrixType,Sparse>::Index outer)
+      : Base(xpr.derived().nestedExpression(), outer)
+    {}
+    typename TransposeImpl<MatrixType,Sparse>::Index row() const { return Base::col(); }
+    typename TransposeImpl<MatrixType,Sparse>::Index col() const { return Base::row(); }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSETRANSPOSE_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseTriangularView.h b/include/eigen3/Eigen/src/SparseCore/SparseTriangularView.h
new file mode 100644
index 0000000..333127b
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseTriangularView.h
@@ -0,0 +1,179 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_TRIANGULARVIEW_H
+#define EIGEN_SPARSE_TRIANGULARVIEW_H
+
+namespace Eigen { 
+
+namespace internal {
+  
+template<typename MatrixType, int Mode>
+struct traits<SparseTriangularView<MatrixType,Mode> >
+: public traits<MatrixType>
+{};
+
+} // namespace internal
+
+template<typename MatrixType, int Mode> class SparseTriangularView
+  : public SparseMatrixBase<SparseTriangularView<MatrixType,Mode> >
+{
+    enum { SkipFirst = ((Mode&Lower) && !(MatrixType::Flags&RowMajorBit))
+                    || ((Mode&Upper) &&  (MatrixType::Flags&RowMajorBit)),
+           SkipLast = !SkipFirst,
+           SkipDiag = (Mode&ZeroDiag) ? 1 : 0,
+           HasUnitDiag = (Mode&UnitDiag) ? 1 : 0
+    };
+
+  public:
+    
+    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseTriangularView)
+
+    class InnerIterator;
+    class ReverseInnerIterator;
+
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }
+
+    typedef typename MatrixType::Nested MatrixTypeNested;
+    typedef typename internal::remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
+    typedef typename internal::remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
+
+    inline SparseTriangularView(const MatrixType& matrix) : m_matrix(matrix) {}
+
+    /** \internal */
+    inline const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
+
+    template<typename OtherDerived>
+    typename internal::plain_matrix_type_column_major<OtherDerived>::type
+    solve(const MatrixBase<OtherDerived>& other) const;
+
+    template<typename OtherDerived> void solveInPlace(MatrixBase<OtherDerived>& other) const;
+    template<typename OtherDerived> void solveInPlace(SparseMatrixBase<OtherDerived>& other) const;
+
+  protected:
+    MatrixTypeNested m_matrix;
+};
+
+template<typename MatrixType, int Mode>
+class SparseTriangularView<MatrixType,Mode>::InnerIterator : public MatrixTypeNestedCleaned::InnerIterator
+{
+    typedef typename MatrixTypeNestedCleaned::InnerIterator Base;
+    typedef typename SparseTriangularView::Index Index;
+  public:
+
+    EIGEN_STRONG_INLINE InnerIterator(const SparseTriangularView& view, Index outer)
+      : Base(view.nestedExpression(), outer), m_returnOne(false)
+    {
+      if(SkipFirst)
+      {
+        while((*this) && ((HasUnitDiag||SkipDiag)  ? this->index()<=outer : this->index()<outer))
+          Base::operator++();
+        if(HasUnitDiag)
+          m_returnOne = true;
+      }
+      else if(HasUnitDiag && ((!Base::operator bool()) || Base::index()>=Base::outer()))
+      {
+        if((!SkipFirst) && Base::operator bool())
+          Base::operator++();
+        m_returnOne = true;
+      }
+    }
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
+    {
+      if(HasUnitDiag && m_returnOne)
+        m_returnOne = false;
+      else
+      {
+        Base::operator++();
+        if(HasUnitDiag && (!SkipFirst) && ((!Base::operator bool()) || Base::index()>=Base::outer()))
+        {
+          if((!SkipFirst) && Base::operator bool())
+            Base::operator++();
+          m_returnOne = true;
+        }
+      }
+      return *this;
+    }
+
+    inline Index row() const { return (MatrixType::Flags&RowMajorBit ? Base::outer() : this->index()); }
+    inline Index col() const { return (MatrixType::Flags&RowMajorBit ? this->index() : Base::outer()); }
+    inline Index index() const
+    {
+      if(HasUnitDiag && m_returnOne)  return Base::outer();
+      else                            return Base::index();
+    }
+    inline Scalar value() const
+    {
+      if(HasUnitDiag && m_returnOne)  return Scalar(1);
+      else                            return Base::value();
+    }
+
+    EIGEN_STRONG_INLINE operator bool() const
+    {
+      if(HasUnitDiag && m_returnOne)
+        return true;
+      if(SkipFirst) return  Base::operator bool();
+      else
+      {
+        if (SkipDiag) return (Base::operator bool() && this->index() < this->outer());
+        else return (Base::operator bool() && this->index() <= this->outer());
+      }
+    }
+  protected:
+    bool m_returnOne;
+};
+
+template<typename MatrixType, int Mode>
+class SparseTriangularView<MatrixType,Mode>::ReverseInnerIterator : public MatrixTypeNestedCleaned::ReverseInnerIterator
+{
+    typedef typename MatrixTypeNestedCleaned::ReverseInnerIterator Base;
+    typedef typename SparseTriangularView::Index Index;
+  public:
+
+    EIGEN_STRONG_INLINE ReverseInnerIterator(const SparseTriangularView& view, Index outer)
+      : Base(view.nestedExpression(), outer)
+    {
+      eigen_assert((!HasUnitDiag) && "ReverseInnerIterator does not support yet triangular views with a unit diagonal");
+      if(SkipLast) {
+        while((*this) && (SkipDiag ? this->index()>=outer : this->index()>outer))
+          --(*this);
+      }
+    }
+
+    EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
+    { Base::operator--(); return *this; }
+
+    inline Index row() const { return Base::row(); }
+    inline Index col() const { return Base::col(); }
+
+    EIGEN_STRONG_INLINE operator bool() const
+    {
+      if (SkipLast) return Base::operator bool() ;
+      else
+      {
+        if(SkipDiag) return (Base::operator bool() && this->index() > this->outer());
+        else return (Base::operator bool() && this->index() >= this->outer());
+      }
+    }
+};
+
+template<typename Derived>
+template<int Mode>
+inline const SparseTriangularView<Derived, Mode>
+SparseMatrixBase<Derived>::triangularView() const
+{
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_TRIANGULARVIEW_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseUtil.h b/include/eigen3/Eigen/src/SparseCore/SparseUtil.h
new file mode 100644
index 0000000..0ba4713
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseUtil.h
@@ -0,0 +1,173 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEUTIL_H
+#define EIGEN_SPARSEUTIL_H
+
+namespace Eigen { 
+
+#ifdef NDEBUG
+#define EIGEN_DBG_SPARSE(X)
+#else
+#define EIGEN_DBG_SPARSE(X) X
+#endif
+
+#define EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, Op) \
+template<typename OtherDerived> \
+EIGEN_STRONG_INLINE Derived& operator Op(const Eigen::SparseMatrixBase<OtherDerived>& other) \
+{ \
+  return Base::operator Op(other.derived()); \
+} \
+EIGEN_STRONG_INLINE Derived& operator Op(const Derived& other) \
+{ \
+  return Base::operator Op(other); \
+}
+
+#define EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, Op) \
+template<typename Other> \
+EIGEN_STRONG_INLINE Derived& operator Op(const Other& scalar) \
+{ \
+  return Base::operator Op(scalar); \
+}
+
+#define EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
+EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =) \
+EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, +=) \
+EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \
+EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \
+EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
+
+#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, BaseClass) \
+  typedef BaseClass Base; \
+  typedef typename Eigen::internal::traits<Derived >::Scalar Scalar; \
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \
+  typedef typename Eigen::internal::nested<Derived >::type Nested; \
+  typedef typename Eigen::internal::traits<Derived >::StorageKind StorageKind; \
+  typedef typename Eigen::internal::traits<Derived >::Index Index; \
+  enum { RowsAtCompileTime = Eigen::internal::traits<Derived >::RowsAtCompileTime, \
+        ColsAtCompileTime = Eigen::internal::traits<Derived >::ColsAtCompileTime, \
+        Flags = Eigen::internal::traits<Derived >::Flags, \
+        CoeffReadCost = Eigen::internal::traits<Derived >::CoeffReadCost, \
+        SizeAtCompileTime = Base::SizeAtCompileTime, \
+        IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; \
+  using Base::derived; \
+  using Base::const_cast_derived;
+
+#define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \
+  _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, Eigen::SparseMatrixBase<Derived >)
+
+const int CoherentAccessPattern     = 0x1;
+const int InnerRandomAccessPattern  = 0x2 | CoherentAccessPattern;
+const int OuterRandomAccessPattern  = 0x4 | CoherentAccessPattern;
+const int RandomAccessPattern       = 0x8 | OuterRandomAccessPattern | InnerRandomAccessPattern;
+
+template<typename Derived> class SparseMatrixBase;
+template<typename _Scalar, int _Flags = 0, typename _Index = int>  class SparseMatrix;
+template<typename _Scalar, int _Flags = 0, typename _Index = int>  class DynamicSparseMatrix;
+template<typename _Scalar, int _Flags = 0, typename _Index = int>  class SparseVector;
+template<typename _Scalar, int _Flags = 0, typename _Index = int>  class MappedSparseMatrix;
+
+template<typename MatrixType, int Mode>           class SparseTriangularView;
+template<typename MatrixType, unsigned int UpLo>  class SparseSelfAdjointView;
+template<typename Lhs, typename Rhs>              class SparseDiagonalProduct;
+template<typename MatrixType> class SparseView;
+
+template<typename Lhs, typename Rhs>        class SparseSparseProduct;
+template<typename Lhs, typename Rhs>        class SparseTimeDenseProduct;
+template<typename Lhs, typename Rhs>        class DenseTimeSparseProduct;
+template<typename Lhs, typename Rhs, bool Transpose> class SparseDenseOuterProduct;
+
+template<typename Lhs, typename Rhs> struct SparseSparseProductReturnType;
+template<typename Lhs, typename Rhs,
+         int InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(internal::traits<Lhs>::ColsAtCompileTime,internal::traits<Rhs>::RowsAtCompileTime)> struct DenseSparseProductReturnType;         
+template<typename Lhs, typename Rhs,
+         int InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(internal::traits<Lhs>::ColsAtCompileTime,internal::traits<Rhs>::RowsAtCompileTime)> struct SparseDenseProductReturnType;
+template<typename MatrixType,int UpLo> class SparseSymmetricPermutationProduct;
+
+namespace internal {
+
+template<typename T,int Rows,int Cols> struct sparse_eval;
+
+template<typename T> struct eval<T,Sparse>
+  : public sparse_eval<T, traits<T>::RowsAtCompileTime,traits<T>::ColsAtCompileTime>
+{};
+
+template<typename T,int Cols> struct sparse_eval<T,1,Cols> {
+    typedef typename traits<T>::Scalar _Scalar;
+    typedef typename traits<T>::Index _Index;
+  public:
+    typedef SparseVector<_Scalar, RowMajor, _Index> type;
+};
+
+template<typename T,int Rows> struct sparse_eval<T,Rows,1> {
+    typedef typename traits<T>::Scalar _Scalar;
+    typedef typename traits<T>::Index _Index;
+  public:
+    typedef SparseVector<_Scalar, ColMajor, _Index> type;
+};
+
+template<typename T,int Rows,int Cols> struct sparse_eval {
+    typedef typename traits<T>::Scalar _Scalar;
+    typedef typename traits<T>::Index _Index;
+    enum { _Options = ((traits<T>::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor };
+  public:
+    typedef SparseMatrix<_Scalar, _Options, _Index> type;
+};
+
+template<typename T> struct sparse_eval<T,1,1> {
+    typedef typename traits<T>::Scalar _Scalar;
+  public:
+    typedef Matrix<_Scalar, 1, 1> type;
+};
+
+template<typename T> struct plain_matrix_type<T,Sparse>
+{
+  typedef typename traits<T>::Scalar _Scalar;
+  typedef typename traits<T>::Index _Index;
+  enum { _Options = ((traits<T>::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor };
+  public:
+    typedef SparseMatrix<_Scalar, _Options, _Index> type;
+};
+
+} // end namespace internal
+
+/** \ingroup SparseCore_Module
+  *
+  * \class Triplet
+  *
+  * \brief A small structure to hold a non zero as a triplet (i,j,value).
+  *
+  * \sa SparseMatrix::setFromTriplets()
+  */
+template<typename Scalar, typename Index=typename SparseMatrix<Scalar>::Index >
+class Triplet
+{
+public:
+  Triplet() : m_row(0), m_col(0), m_value(0) {}
+
+  Triplet(const Index& i, const Index& j, const Scalar& v = Scalar(0))
+    : m_row(i), m_col(j), m_value(v)
+  {}
+
+  /** \returns the row index of the element */
+  const Index& row() const { return m_row; }
+
+  /** \returns the column index of the element */
+  const Index& col() const { return m_col; }
+
+  /** \returns the value of the element */
+  const Scalar& value() const { return m_value; }
+protected:
+  Index m_row, m_col;
+  Scalar m_value;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSEUTIL_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseVector.h b/include/eigen3/Eigen/src/SparseCore/SparseVector.h
new file mode 100644
index 0000000..7e15c81
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseVector.h
@@ -0,0 +1,447 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEVECTOR_H
+#define EIGEN_SPARSEVECTOR_H
+
+namespace Eigen { 
+
+/** \ingroup SparseCore_Module
+  * \class SparseVector
+  *
+  * \brief a sparse vector class
+  *
+  * \tparam _Scalar the scalar type, i.e. the type of the coefficients
+  *
+  * See http://www.netlib.org/linalg/html_templates/node91.html for details on the storage scheme.
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEVECTOR_PLUGIN.
+  */
+
+namespace internal {
+template<typename _Scalar, int _Options, typename _Index>
+struct traits<SparseVector<_Scalar, _Options, _Index> >
+{
+  typedef _Scalar Scalar;
+  typedef _Index Index;
+  typedef Sparse StorageKind;
+  typedef MatrixXpr XprKind;
+  enum {
+    IsColVector = (_Options & RowMajorBit) ? 0 : 1,
+
+    RowsAtCompileTime = IsColVector ? Dynamic : 1,
+    ColsAtCompileTime = IsColVector ? 1 : Dynamic,
+    MaxRowsAtCompileTime = RowsAtCompileTime,
+    MaxColsAtCompileTime = ColsAtCompileTime,
+    Flags = _Options | NestByRefBit | LvalueBit | (IsColVector ? 0 : RowMajorBit),
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    SupportedAccessPatterns = InnerRandomAccessPattern
+  };
+};
+
+// Sparse-Vector-Assignment kinds:
+enum {
+  SVA_RuntimeSwitch,
+  SVA_Inner,
+  SVA_Outer
+};
+
+template< typename Dest, typename Src,
+          int AssignmentKind = !bool(Src::IsVectorAtCompileTime) ? SVA_RuntimeSwitch
+                             : Src::InnerSizeAtCompileTime==1 ? SVA_Outer
+                             : SVA_Inner>
+struct sparse_vector_assign_selector;
+
+}
+
+template<typename _Scalar, int _Options, typename _Index>
+class SparseVector
+  : public SparseMatrixBase<SparseVector<_Scalar, _Options, _Index> >
+{
+    typedef SparseMatrixBase<SparseVector> SparseBase;
+    
+  public:
+    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseVector)
+    EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseVector, +=)
+    EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseVector, -=)
+    
+    typedef internal::CompressedStorage<Scalar,Index> Storage;
+    enum { IsColVector = internal::traits<SparseVector>::IsColVector };
+    
+    enum {
+      Options = _Options
+    };
+    
+    EIGEN_STRONG_INLINE Index rows() const { return IsColVector ? m_size : 1; }
+    EIGEN_STRONG_INLINE Index cols() const { return IsColVector ? 1 : m_size; }
+    EIGEN_STRONG_INLINE Index innerSize() const { return m_size; }
+    EIGEN_STRONG_INLINE Index outerSize() const { return 1; }
+
+    EIGEN_STRONG_INLINE const Scalar* valuePtr() const { return &m_data.value(0); }
+    EIGEN_STRONG_INLINE Scalar* valuePtr() { return &m_data.value(0); }
+
+    EIGEN_STRONG_INLINE const Index* innerIndexPtr() const { return &m_data.index(0); }
+    EIGEN_STRONG_INLINE Index* innerIndexPtr() { return &m_data.index(0); }
+    
+    /** \internal */
+    inline Storage& data() { return m_data; }
+    /** \internal */
+    inline const Storage& data() const { return m_data; }
+
+    inline Scalar coeff(Index row, Index col) const
+    {
+      eigen_assert(IsColVector ? (col==0 && row>=0 && row<m_size) : (row==0 && col>=0 && col<m_size));
+      return coeff(IsColVector ? row : col);
+    }
+    inline Scalar coeff(Index i) const
+    {
+      eigen_assert(i>=0 && i<m_size);
+      return m_data.at(i);
+    }
+
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      eigen_assert(IsColVector ? (col==0 && row>=0 && row<m_size) : (row==0 && col>=0 && col<m_size));
+      return coeff(IsColVector ? row : col);
+    }
+
+    /** \returns a reference to the coefficient value at given index \a i
+      * This operation involes a log(rho*size) binary search. If the coefficient does not
+      * exist yet, then a sorted insertion into a sequential buffer is performed.
+      *
+      * This insertion might be very costly if the number of nonzeros above \a i is large.
+      */
+    inline Scalar& coeffRef(Index i)
+    {
+      eigen_assert(i>=0 && i<m_size);
+      return m_data.atWithInsertion(i);
+    }
+
+  public:
+
+    class InnerIterator;
+    class ReverseInnerIterator;
+
+    inline void setZero() { m_data.clear(); }
+
+    /** \returns the number of non zero coefficients */
+    inline Index nonZeros() const  { return static_cast<Index>(m_data.size()); }
+
+    inline void startVec(Index outer)
+    {
+      EIGEN_UNUSED_VARIABLE(outer);
+      eigen_assert(outer==0);
+    }
+
+    inline Scalar& insertBackByOuterInner(Index outer, Index inner)
+    {
+      EIGEN_UNUSED_VARIABLE(outer);
+      eigen_assert(outer==0);
+      return insertBack(inner);
+    }
+    inline Scalar& insertBack(Index i)
+    {
+      m_data.append(0, i);
+      return m_data.value(m_data.size()-1);
+    }
+
+    inline Scalar& insert(Index row, Index col)
+    {
+      eigen_assert(IsColVector ? (col==0 && row>=0 && row<m_size) : (row==0 && col>=0 && col<m_size));
+      
+      Index inner = IsColVector ? row : col;
+      Index outer = IsColVector ? col : row;
+      eigen_assert(outer==0);
+      return insert(inner);
+    }
+    Scalar& insert(Index i)
+    {
+      eigen_assert(i>=0 && i<m_size);
+      
+      Index startId = 0;
+      Index p = Index(m_data.size()) - 1;
+      // TODO smart realloc
+      m_data.resize(p+2,1);
+
+      while ( (p >= startId) && (m_data.index(p) > i) )
+      {
+        m_data.index(p+1) = m_data.index(p);
+        m_data.value(p+1) = m_data.value(p);
+        --p;
+      }
+      m_data.index(p+1) = i;
+      m_data.value(p+1) = 0;
+      return m_data.value(p+1);
+    }
+
+    /**
+      */
+    inline void reserve(Index reserveSize) { m_data.reserve(reserveSize); }
+
+
+    inline void finalize() {}
+
+    void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
+    {
+      m_data.prune(reference,epsilon);
+    }
+
+    void resize(Index rows, Index cols)
+    {
+      eigen_assert(rows==1 || cols==1);
+      resize(IsColVector ? rows : cols);
+    }
+
+    void resize(Index newSize)
+    {
+      m_size = newSize;
+      m_data.clear();
+    }
+
+    void resizeNonZeros(Index size) { m_data.resize(size); }
+
+    inline SparseVector() : m_size(0) { check_template_parameters(); resize(0); }
+
+    inline SparseVector(Index size) : m_size(0) { check_template_parameters(); resize(size); }
+
+    inline SparseVector(Index rows, Index cols) : m_size(0) { check_template_parameters(); resize(rows,cols); }
+
+    template<typename OtherDerived>
+    inline SparseVector(const SparseMatrixBase<OtherDerived>& other)
+      : m_size(0)
+    {
+      check_template_parameters();
+      *this = other.derived();
+    }
+
+    inline SparseVector(const SparseVector& other)
+      : SparseBase(other), m_size(0)
+    {
+      check_template_parameters();
+      *this = other.derived();
+    }
+
+    /** Swaps the values of \c *this and \a other.
+      * Overloaded for performance: this version performs a \em shallow swap by swaping pointers and attributes only.
+      * \sa SparseMatrixBase::swap()
+      */
+    inline void swap(SparseVector& other)
+    {
+      std::swap(m_size, other.m_size);
+      m_data.swap(other.m_data);
+    }
+
+    inline SparseVector& operator=(const SparseVector& other)
+    {
+      if (other.isRValue())
+      {
+        swap(other.const_cast_derived());
+      }
+      else
+      {
+        resize(other.size());
+        m_data = other.m_data;
+      }
+      return *this;
+    }
+
+    template<typename OtherDerived>
+    inline SparseVector& operator=(const SparseMatrixBase<OtherDerived>& other)
+    {
+      SparseVector tmp(other.size());
+      internal::sparse_vector_assign_selector<SparseVector,OtherDerived>::run(tmp,other.derived());
+      this->swap(tmp);
+      return *this;
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename Lhs, typename Rhs>
+    inline SparseVector& operator=(const SparseSparseProduct<Lhs,Rhs>& product)
+    {
+      return Base::operator=(product);
+    }
+    #endif
+
+    friend std::ostream & operator << (std::ostream & s, const SparseVector& m)
+    {
+      for (Index i=0; i<m.nonZeros(); ++i)
+        s << "(" << m.m_data.value(i) << "," << m.m_data.index(i) << ") ";
+      s << std::endl;
+      return s;
+    }
+
+    /** Destructor */
+    inline ~SparseVector() {}
+
+    /** Overloaded for performance */
+    Scalar sum() const;
+
+  public:
+
+    /** \internal \deprecated use setZero() and reserve() */
+    EIGEN_DEPRECATED void startFill(Index reserve)
+    {
+      setZero();
+      m_data.reserve(reserve);
+    }
+
+    /** \internal \deprecated use insertBack(Index,Index) */
+    EIGEN_DEPRECATED Scalar& fill(Index r, Index c)
+    {
+      eigen_assert(r==0 || c==0);
+      return fill(IsColVector ? r : c);
+    }
+
+    /** \internal \deprecated use insertBack(Index) */
+    EIGEN_DEPRECATED Scalar& fill(Index i)
+    {
+      m_data.append(0, i);
+      return m_data.value(m_data.size()-1);
+    }
+
+    /** \internal \deprecated use insert(Index,Index) */
+    EIGEN_DEPRECATED Scalar& fillrand(Index r, Index c)
+    {
+      eigen_assert(r==0 || c==0);
+      return fillrand(IsColVector ? r : c);
+    }
+
+    /** \internal \deprecated use insert(Index) */
+    EIGEN_DEPRECATED Scalar& fillrand(Index i)
+    {
+      return insert(i);
+    }
+
+    /** \internal \deprecated use finalize() */
+    EIGEN_DEPRECATED void endFill() {}
+    
+    // These two functions were here in the 3.1 release, so let's keep them in case some code rely on them.
+    /** \internal \deprecated use data() */
+    EIGEN_DEPRECATED Storage& _data() { return m_data; }
+    /** \internal \deprecated use data() */
+    EIGEN_DEPRECATED const Storage& _data() const { return m_data; }
+    
+#   ifdef EIGEN_SPARSEVECTOR_PLUGIN
+#     include EIGEN_SPARSEVECTOR_PLUGIN
+#   endif
+
+protected:
+  
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT(NumTraits<Index>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
+      EIGEN_STATIC_ASSERT((_Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS);
+    }
+    
+    Storage m_data;
+    Index m_size;
+};
+
+template<typename Scalar, int _Options, typename _Index>
+class SparseVector<Scalar,_Options,_Index>::InnerIterator
+{
+  public:
+    InnerIterator(const SparseVector& vec, Index outer=0)
+      : m_data(vec.m_data), m_id(0), m_end(static_cast<Index>(m_data.size()))
+    {
+      EIGEN_UNUSED_VARIABLE(outer);
+      eigen_assert(outer==0);
+    }
+
+    InnerIterator(const internal::CompressedStorage<Scalar,Index>& data)
+      : m_data(data), m_id(0), m_end(static_cast<Index>(m_data.size()))
+    {}
+
+    inline InnerIterator& operator++() { m_id++; return *this; }
+
+    inline Scalar value() const { return m_data.value(m_id); }
+    inline Scalar& valueRef() { return const_cast<Scalar&>(m_data.value(m_id)); }
+
+    inline Index index() const { return m_data.index(m_id); }
+    inline Index row() const { return IsColVector ? index() : 0; }
+    inline Index col() const { return IsColVector ? 0 : index(); }
+
+    inline operator bool() const { return (m_id < m_end); }
+
+  protected:
+    const internal::CompressedStorage<Scalar,Index>& m_data;
+    Index m_id;
+    const Index m_end;
+};
+
+template<typename Scalar, int _Options, typename _Index>
+class SparseVector<Scalar,_Options,_Index>::ReverseInnerIterator
+{
+  public:
+    ReverseInnerIterator(const SparseVector& vec, Index outer=0)
+      : m_data(vec.m_data), m_id(static_cast<Index>(m_data.size())), m_start(0)
+    {
+      EIGEN_UNUSED_VARIABLE(outer);
+      eigen_assert(outer==0);
+    }
+
+    ReverseInnerIterator(const internal::CompressedStorage<Scalar,Index>& data)
+      : m_data(data), m_id(static_cast<Index>(m_data.size())), m_start(0)
+    {}
+
+    inline ReverseInnerIterator& operator--() { m_id--; return *this; }
+
+    inline Scalar value() const { return m_data.value(m_id-1); }
+    inline Scalar& valueRef() { return const_cast<Scalar&>(m_data.value(m_id-1)); }
+
+    inline Index index() const { return m_data.index(m_id-1); }
+    inline Index row() const { return IsColVector ? index() : 0; }
+    inline Index col() const { return IsColVector ? 0 : index(); }
+
+    inline operator bool() const { return (m_id > m_start); }
+
+  protected:
+    const internal::CompressedStorage<Scalar,Index>& m_data;
+    Index m_id;
+    const Index m_start;
+};
+
+namespace internal {
+
+template< typename Dest, typename Src>
+struct sparse_vector_assign_selector<Dest,Src,SVA_Inner> {
+  static void run(Dest& dst, const Src& src) {
+    eigen_internal_assert(src.innerSize()==src.size());
+    for(typename Src::InnerIterator it(src, 0); it; ++it)
+      dst.insert(it.index()) = it.value();
+  }
+};
+
+template< typename Dest, typename Src>
+struct sparse_vector_assign_selector<Dest,Src,SVA_Outer> {
+  static void run(Dest& dst, const Src& src) {
+    eigen_internal_assert(src.outerSize()==src.size());
+    for(typename Dest::Index i=0; i<src.size(); ++i)
+    {
+      typename Src::InnerIterator it(src, i);
+      if(it)
+        dst.insert(i) = it.value();
+    }
+  }
+};
+
+template< typename Dest, typename Src>
+struct sparse_vector_assign_selector<Dest,Src,SVA_RuntimeSwitch> {
+  static void run(Dest& dst, const Src& src) {
+    if(src.outerSize()==1)  sparse_vector_assign_selector<Dest,Src,SVA_Inner>::run(dst, src);
+    else                    sparse_vector_assign_selector<Dest,Src,SVA_Outer>::run(dst, src);
+  }
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSEVECTOR_H
diff --git a/include/eigen3/Eigen/src/SparseCore/SparseView.h b/include/eigen3/Eigen/src/SparseCore/SparseView.h
new file mode 100644
index 0000000..fd84504
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/SparseView.h
@@ -0,0 +1,99 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2010 Daniel Lowengrub <lowdanie at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEVIEW_H
+#define EIGEN_SPARSEVIEW_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename MatrixType>
+struct traits<SparseView<MatrixType> > : traits<MatrixType>
+{
+  typedef typename MatrixType::Index Index;
+  typedef Sparse StorageKind;
+  enum {
+    Flags = int(traits<MatrixType>::Flags) & (RowMajorBit)
+  };
+};
+
+} // end namespace internal
+
+template<typename MatrixType>
+class SparseView : public SparseMatrixBase<SparseView<MatrixType> >
+{
+  typedef typename MatrixType::Nested MatrixTypeNested;
+  typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+public:
+  EIGEN_SPARSE_PUBLIC_INTERFACE(SparseView)
+
+  SparseView(const MatrixType& mat, const Scalar& m_reference = Scalar(0),
+             typename NumTraits<Scalar>::Real m_epsilon = NumTraits<Scalar>::dummy_precision()) : 
+    m_matrix(mat), m_reference(m_reference), m_epsilon(m_epsilon) {}
+
+  class InnerIterator;
+
+  inline Index rows() const { return m_matrix.rows(); }
+  inline Index cols() const { return m_matrix.cols(); }
+
+  inline Index innerSize() const { return m_matrix.innerSize(); }
+  inline Index outerSize() const { return m_matrix.outerSize(); }
+
+protected:
+  MatrixTypeNested m_matrix;
+  Scalar m_reference;
+  typename NumTraits<Scalar>::Real m_epsilon;
+};
+
+template<typename MatrixType>
+class SparseView<MatrixType>::InnerIterator : public _MatrixTypeNested::InnerIterator
+{
+  typedef typename SparseView::Index Index;
+public:
+  typedef typename _MatrixTypeNested::InnerIterator IterBase;
+  InnerIterator(const SparseView& view, Index outer) :
+  IterBase(view.m_matrix, outer), m_view(view)
+  {
+    incrementToNonZero();
+  }
+
+  EIGEN_STRONG_INLINE InnerIterator& operator++()
+  {
+    IterBase::operator++();
+    incrementToNonZero();
+    return *this;
+  }
+
+  using IterBase::value;
+
+protected:
+  const SparseView& m_view;
+
+private:
+  void incrementToNonZero()
+  {
+    while((bool(*this)) && internal::isMuchSmallerThan(value(), m_view.m_reference, m_view.m_epsilon))
+    {
+      IterBase::operator++();
+    }
+  }
+};
+
+template<typename Derived>
+const SparseView<Derived> MatrixBase<Derived>::sparseView(const Scalar& m_reference,
+                                                          const typename NumTraits<Scalar>::Real& m_epsilon) const
+{
+  return SparseView<Derived>(derived(), m_reference, m_epsilon);
+}
+
+} // end namespace Eigen
+
+#endif
diff --git a/include/eigen3/Eigen/src/SparseCore/TriangularSolver.h b/include/eigen3/Eigen/src/SparseCore/TriangularSolver.h
new file mode 100644
index 0000000..cb8ad82
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseCore/TriangularSolver.h
@@ -0,0 +1,334 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSETRIANGULARSOLVER_H
+#define EIGEN_SPARSETRIANGULARSOLVER_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, int Mode,
+  int UpLo = (Mode & Lower)
+           ? Lower
+           : (Mode & Upper)
+           ? Upper
+           : -1,
+  int StorageOrder = int(traits<Lhs>::Flags) & RowMajorBit>
+struct sparse_solve_triangular_selector;
+
+// forward substitution, row-major
+template<typename Lhs, typename Rhs, int Mode>
+struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Lower,RowMajor>
+{
+  typedef typename Rhs::Scalar Scalar;
+  static void run(const Lhs& lhs, Rhs& other)
+  {
+    for(int col=0 ; col<other.cols() ; ++col)
+    {
+      for(int i=0; i<lhs.rows(); ++i)
+      {
+        Scalar tmp = other.coeff(i,col);
+        Scalar lastVal(0);
+        int lastIndex = 0;
+        for(typename Lhs::InnerIterator it(lhs, i); it; ++it)
+        {
+          lastVal = it.value();
+          lastIndex = it.index();
+          if(lastIndex==i)
+            break;
+          tmp -= lastVal * other.coeff(lastIndex,col);
+        }
+        if (Mode & UnitDiag)
+          other.coeffRef(i,col) = tmp;
+        else
+        {
+          eigen_assert(lastIndex==i);
+          other.coeffRef(i,col) = tmp/lastVal;
+        }
+      }
+    }
+  }
+};
+
+// backward substitution, row-major
+template<typename Lhs, typename Rhs, int Mode>
+struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,RowMajor>
+{
+  typedef typename Rhs::Scalar Scalar;
+  static void run(const Lhs& lhs, Rhs& other)
+  {
+    for(int col=0 ; col<other.cols() ; ++col)
+    {
+      for(int i=lhs.rows()-1 ; i>=0 ; --i)
+      {
+        Scalar tmp = other.coeff(i,col);
+        Scalar l_ii = 0;
+        typename Lhs::InnerIterator it(lhs, i);
+        while(it && it.index()<i)
+          ++it;
+        if(!(Mode & UnitDiag))
+        {
+          eigen_assert(it && it.index()==i);
+          l_ii = it.value();
+          ++it;
+        }
+        else if (it && it.index() == i)
+          ++it;
+        for(; it; ++it)
+        {
+          tmp -= it.value() * other.coeff(it.index(),col);
+        }
+
+        if (Mode & UnitDiag)
+          other.coeffRef(i,col) = tmp;
+        else
+          other.coeffRef(i,col) = tmp/l_ii;
+      }
+    }
+  }
+};
+
+// forward substitution, col-major
+template<typename Lhs, typename Rhs, int Mode>
+struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Lower,ColMajor>
+{
+  typedef typename Rhs::Scalar Scalar;
+  static void run(const Lhs& lhs, Rhs& other)
+  {
+    for(int col=0 ; col<other.cols() ; ++col)
+    {
+      for(int i=0; i<lhs.cols(); ++i)
+      {
+        Scalar& tmp = other.coeffRef(i,col);
+        if (tmp!=Scalar(0)) // optimization when other is actually sparse
+        {
+          typename Lhs::InnerIterator it(lhs, i);
+          while(it && it.index()<i)
+            ++it;
+          if(!(Mode & UnitDiag))
+          {
+            eigen_assert(it && it.index()==i);
+            tmp /= it.value();
+          }
+          if (it && it.index()==i)
+            ++it;
+          for(; it; ++it)
+            other.coeffRef(it.index(), col) -= tmp * it.value();
+        }
+      }
+    }
+  }
+};
+
+// backward substitution, col-major
+template<typename Lhs, typename Rhs, int Mode>
+struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,ColMajor>
+{
+  typedef typename Rhs::Scalar Scalar;
+  static void run(const Lhs& lhs, Rhs& other)
+  {
+    for(int col=0 ; col<other.cols() ; ++col)
+    {
+      for(int i=lhs.cols()-1; i>=0; --i)
+      {
+        Scalar& tmp = other.coeffRef(i,col);
+        if (tmp!=Scalar(0)) // optimization when other is actually sparse
+        {
+          if(!(Mode & UnitDiag))
+          {
+            // TODO replace this by a binary search. make sure the binary search is safe for partially sorted elements
+            typename Lhs::ReverseInnerIterator it(lhs, i);
+            while(it && it.index()!=i)
+              --it;
+            eigen_assert(it && it.index()==i);
+            other.coeffRef(i,col) /= it.value();
+          }
+          typename Lhs::InnerIterator it(lhs, i);
+          for(; it && it.index()<i; ++it)
+            other.coeffRef(it.index(), col) -= tmp * it.value();
+        }
+      }
+    }
+  }
+};
+
+} // end namespace internal
+
+template<typename ExpressionType,int Mode>
+template<typename OtherDerived>
+void SparseTriangularView<ExpressionType,Mode>::solveInPlace(MatrixBase<OtherDerived>& other) const
+{
+  eigen_assert(m_matrix.cols() == m_matrix.rows() && m_matrix.cols() == other.rows());
+  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
+
+  enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit };
+
+  typedef typename internal::conditional<copy,
+    typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
+  OtherCopy otherCopy(other.derived());
+
+  internal::sparse_solve_triangular_selector<ExpressionType, typename internal::remove_reference<OtherCopy>::type, Mode>::run(m_matrix, otherCopy);
+
+  if (copy)
+    other = otherCopy;
+}
+
+template<typename ExpressionType,int Mode>
+template<typename OtherDerived>
+typename internal::plain_matrix_type_column_major<OtherDerived>::type
+SparseTriangularView<ExpressionType,Mode>::solve(const MatrixBase<OtherDerived>& other) const
+{
+  typename internal::plain_matrix_type_column_major<OtherDerived>::type res(other);
+  solveInPlace(res);
+  return res;
+}
+
+// pure sparse path
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, int Mode,
+  int UpLo = (Mode & Lower)
+           ? Lower
+           : (Mode & Upper)
+           ? Upper
+           : -1,
+  int StorageOrder = int(Lhs::Flags) & (RowMajorBit)>
+struct sparse_solve_triangular_sparse_selector;
+
+// forward substitution, col-major
+template<typename Lhs, typename Rhs, int Mode, int UpLo>
+struct sparse_solve_triangular_sparse_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
+{
+  typedef typename Rhs::Scalar Scalar;
+  typedef typename promote_index_type<typename traits<Lhs>::Index,
+                                         typename traits<Rhs>::Index>::type Index;
+  static void run(const Lhs& lhs, Rhs& other)
+  {
+    const bool IsLower = (UpLo==Lower);
+    AmbiVector<Scalar,Index> tempVector(other.rows()*2);
+    tempVector.setBounds(0,other.rows());
+
+    Rhs res(other.rows(), other.cols());
+    res.reserve(other.nonZeros());
+
+    for(int col=0 ; col<other.cols() ; ++col)
+    {
+      // FIXME estimate number of non zeros
+      tempVector.init(.99/*float(other.col(col).nonZeros())/float(other.rows())*/);
+      tempVector.setZero();
+      tempVector.restart();
+      for (typename Rhs::InnerIterator rhsIt(other, col); rhsIt; ++rhsIt)
+      {
+        tempVector.coeffRef(rhsIt.index()) = rhsIt.value();
+      }
+
+      for(int i=IsLower?0:lhs.cols()-1;
+          IsLower?i<lhs.cols():i>=0;
+          i+=IsLower?1:-1)
+      {
+        tempVector.restart();
+        Scalar& ci = tempVector.coeffRef(i);
+        if (ci!=Scalar(0))
+        {
+          // find
+          typename Lhs::InnerIterator it(lhs, i);
+          if(!(Mode & UnitDiag))
+          {
+            if (IsLower)
+            {
+              eigen_assert(it.index()==i);
+              ci /= it.value();
+            }
+            else
+              ci /= lhs.coeff(i,i);
+          }
+          tempVector.restart();
+          if (IsLower)
+          {
+            if (it.index()==i)
+              ++it;
+            for(; it; ++it)
+              tempVector.coeffRef(it.index()) -= ci * it.value();
+          }
+          else
+          {
+            for(; it && it.index()<i; ++it)
+              tempVector.coeffRef(it.index()) -= ci * it.value();
+          }
+        }
+      }
+
+
+      int count = 0;
+      // FIXME compute a reference value to filter zeros
+      for (typename AmbiVector<Scalar,Index>::Iterator it(tempVector/*,1e-12*/); it; ++it)
+      {
+        ++ count;
+//         std::cerr << "fill " << it.index() << ", " << col << "\n";
+//         std::cout << it.value() << "  ";
+        // FIXME use insertBack
+        res.insert(it.index(), col) = it.value();
+      }
+//       std::cout << "tempVector.nonZeros() == " << int(count) << " / " << (other.rows()) << "\n";
+    }
+    res.finalize();
+    other = res.markAsRValue();
+  }
+};
+
+} // end namespace internal
+
+template<typename ExpressionType,int Mode>
+template<typename OtherDerived>
+void SparseTriangularView<ExpressionType,Mode>::solveInPlace(SparseMatrixBase<OtherDerived>& other) const
+{
+  eigen_assert(m_matrix.cols() == m_matrix.rows() && m_matrix.cols() == other.rows());
+  eigen_assert( (!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
+
+//   enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit };
+
+//   typedef typename internal::conditional<copy,
+//     typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
+//   OtherCopy otherCopy(other.derived());
+
+  internal::sparse_solve_triangular_sparse_selector<ExpressionType, OtherDerived, Mode>::run(m_matrix, other.derived());
+
+//   if (copy)
+//     other = otherCopy;
+}
+
+#ifdef EIGEN2_SUPPORT
+
+// deprecated stuff:
+
+/** \deprecated */
+template<typename Derived>
+template<typename OtherDerived>
+void SparseMatrixBase<Derived>::solveTriangularInPlace(MatrixBase<OtherDerived>& other) const
+{
+  this->template triangular<Flags&(Upper|Lower)>().solveInPlace(other);
+}
+
+/** \deprecated */
+template<typename Derived>
+template<typename OtherDerived>
+typename internal::plain_matrix_type_column_major<OtherDerived>::type
+SparseMatrixBase<Derived>::solveTriangular(const MatrixBase<OtherDerived>& other) const
+{
+  typename internal::plain_matrix_type_column_major<OtherDerived>::type res(other);
+  derived().solveTriangularInPlace(res);
+  return res;
+}
+#endif // EIGEN2_SUPPORT
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSETRIANGULARSOLVER_H
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU.h b/include/eigen3/Eigen/src/SparseLU/SparseLU.h
new file mode 100644
index 0000000..d6f5d59
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU.h
@@ -0,0 +1,757 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_SPARSE_LU_H
+#define EIGEN_SPARSE_LU_H
+
+namespace Eigen {
+
+template <typename _MatrixType, typename _OrderingType = COLAMDOrdering<typename _MatrixType::Index> > class SparseLU;
+template <typename MappedSparseMatrixType> struct SparseLUMatrixLReturnType;
+template <typename MatrixLType, typename MatrixUType> struct SparseLUMatrixUReturnType;
+
+/** \ingroup SparseLU_Module
+  * \class SparseLU
+  * 
+  * \brief Sparse supernodal LU factorization for general matrices
+  * 
+  * This class implements the supernodal LU factorization for general matrices.
+  * It uses the main techniques from the sequential SuperLU package 
+  * (http://crd-legacy.lbl.gov/~xiaoye/SuperLU/). It handles transparently real 
+  * and complex arithmetics with single and double precision, depending on the 
+  * scalar type of your input matrix. 
+  * The code has been optimized to provide BLAS-3 operations during supernode-panel updates. 
+  * It benefits directly from the built-in high-performant Eigen BLAS routines. 
+  * Moreover, when the size of a supernode is very small, the BLAS calls are avoided to 
+  * enable a better optimization from the compiler. For best performance, 
+  * you should compile it with NDEBUG flag to avoid the numerous bounds checking on vectors. 
+  * 
+  * An important parameter of this class is the ordering method. It is used to reorder the columns 
+  * (and eventually the rows) of the matrix to reduce the number of new elements that are created during 
+  * numerical factorization. The cheapest method available is COLAMD. 
+  * See  \link OrderingMethods_Module the OrderingMethods module \endlink for the list of 
+  * built-in and external ordering methods. 
+  *
+  * Simple example with key steps 
+  * \code
+  * VectorXd x(n), b(n);
+  * SparseMatrix<double, ColMajor> A;
+  * SparseLU<SparseMatrix<scalar, ColMajor>, COLAMDOrdering<Index> >   solver;
+  * // fill A and b;
+  * // Compute the ordering permutation vector from the structural pattern of A
+  * solver.analyzePattern(A); 
+  * // Compute the numerical factorization 
+  * solver.factorize(A); 
+  * //Use the factors to solve the linear system 
+  * x = solver.solve(b); 
+  * \endcode
+  * 
+  * \warning The input matrix A should be in a \b compressed and \b column-major form.
+  * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix.
+  * 
+  * \note Unlike the initial SuperLU implementation, there is no step to equilibrate the matrix. 
+  * For badly scaled matrices, this step can be useful to reduce the pivoting during factorization. 
+  * If this is the case for your matrices, you can try the basic scaling method at
+  *  "unsupported/Eigen/src/IterativeSolvers/Scaling.h"
+  * 
+  * \tparam _MatrixType The type of the sparse matrix. It must be a column-major SparseMatrix<>
+  * \tparam _OrderingType The ordering method to use, either AMD, COLAMD or METIS. Default is COLMAD
+  * 
+  * 
+  * \sa \ref TutorialSparseDirectSolvers
+  * \sa \ref OrderingMethods_Module
+  */
+template <typename _MatrixType, typename _OrderingType>
+class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typename _MatrixType::Index>
+{
+  public:
+    typedef _MatrixType MatrixType; 
+    typedef _OrderingType OrderingType;
+    typedef typename MatrixType::Scalar Scalar; 
+    typedef typename MatrixType::RealScalar RealScalar; 
+    typedef typename MatrixType::Index Index; 
+    typedef SparseMatrix<Scalar,ColMajor,Index> NCMatrix;
+    typedef internal::MappedSuperNodalMatrix<Scalar, Index> SCMatrix; 
+    typedef Matrix<Scalar,Dynamic,1> ScalarVector;
+    typedef Matrix<Index,Dynamic,1> IndexVector;
+    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+    typedef internal::SparseLUImpl<Scalar, Index> Base;
+    
+  public:
+    SparseLU():m_isInitialized(true),m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
+    {
+      initperfvalues(); 
+    }
+    SparseLU(const MatrixType& matrix):m_isInitialized(true),m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
+    {
+      initperfvalues(); 
+      compute(matrix);
+    }
+    
+    ~SparseLU()
+    {
+      // Free all explicit dynamic pointers 
+    }
+    
+    void analyzePattern (const MatrixType& matrix);
+    void factorize (const MatrixType& matrix);
+    void simplicialfactorize(const MatrixType& matrix);
+    
+    /**
+      * Compute the symbolic and numeric factorization of the input sparse matrix.
+      * The input matrix should be in column-major storage. 
+      */
+    void compute (const MatrixType& matrix)
+    {
+      // Analyze 
+      analyzePattern(matrix); 
+      //Factorize
+      factorize(matrix);
+    } 
+    
+    inline Index rows() const { return m_mat.rows(); }
+    inline Index cols() const { return m_mat.cols(); }
+    /** Indicate that the pattern of the input matrix is symmetric */
+    void isSymmetric(bool sym)
+    {
+      m_symmetricmode = sym;
+    }
+    
+    /** \returns an expression of the matrix L, internally stored as supernodes
+      * The only operation available with this expression is the triangular solve
+      * \code
+      * y = b; matrixL().solveInPlace(y);
+      * \endcode
+      */
+    SparseLUMatrixLReturnType<SCMatrix> matrixL() const
+    {
+      return SparseLUMatrixLReturnType<SCMatrix>(m_Lstore);
+    }
+    /** \returns an expression of the matrix U,
+      * The only operation available with this expression is the triangular solve
+      * \code
+      * y = b; matrixU().solveInPlace(y);
+      * \endcode
+      */
+    SparseLUMatrixUReturnType<SCMatrix,MappedSparseMatrix<Scalar,ColMajor,Index> > matrixU() const
+    {
+      return SparseLUMatrixUReturnType<SCMatrix, MappedSparseMatrix<Scalar,ColMajor,Index> >(m_Lstore, m_Ustore);
+    }
+
+    /**
+      * \returns a reference to the row matrix permutation \f$ P_r \f$ such that \f$P_r A P_c^T = L U\f$
+      * \sa colsPermutation()
+      */
+    inline const PermutationType& rowsPermutation() const
+    {
+      return m_perm_r;
+    }
+    /**
+      * \returns a reference to the column matrix permutation\f$ P_c^T \f$ such that \f$P_r A P_c^T = L U\f$
+      * \sa rowsPermutation()
+      */
+    inline const PermutationType& colsPermutation() const
+    {
+      return m_perm_c;
+    }
+    /** Set the threshold used for a diagonal entry to be an acceptable pivot. */
+    void setPivotThreshold(const RealScalar& thresh)
+    {
+      m_diagpivotthresh = thresh; 
+    }
+
+    /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
+      *
+      * \warning the destination matrix X in X = this->solve(B) must be colmun-major.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<SparseLU, Rhs> solve(const MatrixBase<Rhs>& B) const 
+    {
+      eigen_assert(m_factorizationIsOk && "SparseLU is not initialized."); 
+      eigen_assert(rows()==B.rows()
+                    && "SparseLU::solve(): invalid number of rows of the right hand side matrix B");
+          return internal::solve_retval<SparseLU, Rhs>(*this, B.derived());
+    }
+
+    /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::sparse_solve_retval<SparseLU, Rhs> solve(const SparseMatrixBase<Rhs>& B) const 
+    {
+      eigen_assert(m_factorizationIsOk && "SparseLU is not initialized."); 
+      eigen_assert(rows()==B.rows()
+                    && "SparseLU::solve(): invalid number of rows of the right hand side matrix B");
+          return internal::sparse_solve_retval<SparseLU, Rhs>(*this, B.derived());
+    }
+    
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful,
+      *          \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance
+      *          \c InvalidInput if the input matrix is invalid
+      *
+      * \sa iparm()          
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return m_info;
+    }
+    
+    /**
+      * \returns A string describing the type of error
+      */
+    std::string lastErrorMessage() const
+    {
+      return m_lastError; 
+    }
+
+    template<typename Rhs, typename Dest>
+    bool _solve(const MatrixBase<Rhs> &B, MatrixBase<Dest> &X_base) const
+    {
+      Dest& X(X_base.derived());
+      eigen_assert(m_factorizationIsOk && "The matrix should be factorized first");
+      EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,
+                        THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+      
+      // Permute the right hand side to form X = Pr*B
+      // on return, X is overwritten by the computed solution
+      X.resize(B.rows(),B.cols());
+
+      // this ugly const_cast_derived() helps to detect aliasing when applying the permutations
+      for(Index j = 0; j < B.cols(); ++j)
+        X.col(j) = rowsPermutation() * B.const_cast_derived().col(j);
+      
+      //Forward substitution with L
+      this->matrixL().solveInPlace(X);
+      this->matrixU().solveInPlace(X);
+      
+      // Permute back the solution 
+      for (Index j = 0; j < B.cols(); ++j)
+        X.col(j) = colsPermutation().inverse() * X.col(j);
+      
+      return true; 
+    }
+    
+    /**
+      * \returns the absolute value of the determinant of the matrix of which
+      * *this is the QR decomposition.
+      *
+      * \warning a determinant can be very big or small, so for matrices
+      * of large enough dimension, there is a risk of overflow/underflow.
+      * One way to work around that is to use logAbsDeterminant() instead.
+      *
+      * \sa logAbsDeterminant(), signDeterminant()
+      */
+     Scalar absDeterminant()
+    {
+      eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
+      // Initialize with the determinant of the row matrix
+      Scalar det = Scalar(1.);
+      // Note that the diagonal blocks of U are stored in supernodes,
+      // which are available in the  L part :)
+      for (Index j = 0; j < this->cols(); ++j)
+      {
+        for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
+        {
+          if(it.index() == j)
+          {
+            det *= (std::abs)(it.value());
+            break;
+          }
+        }
+       }
+       return det;
+     }
+
+     /** \returns the natural log of the absolute value of the determinant of the matrix
+       * of which **this is the QR decomposition
+       *
+       * \note This method is useful to work around the risk of overflow/underflow that's
+       * inherent to the determinant computation.
+       *
+       * \sa absDeterminant(), signDeterminant()
+       */
+     Scalar logAbsDeterminant() const
+     {
+       eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
+       Scalar det = Scalar(0.);
+       for (Index j = 0; j < this->cols(); ++j)
+       {
+         for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
+         {
+           if(it.row() < j) continue;
+           if(it.row() == j)
+           {
+             det += (std::log)((std::abs)(it.value()));
+             break;
+           }
+         }
+       }
+       return det;
+     }
+
+     /** \returns A number representing the sign of the determinant
+       *
+       * \sa absDeterminant(), logAbsDeterminant()
+       */
+     Scalar signDeterminant()
+     {
+       eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
+       return Scalar(m_detPermR);
+     }
+
+  protected:
+    // Functions 
+    void initperfvalues()
+    {
+      m_perfv.panel_size = 1;
+      m_perfv.relax = 1; 
+      m_perfv.maxsuper = 128; 
+      m_perfv.rowblk = 16; 
+      m_perfv.colblk = 8; 
+      m_perfv.fillfactor = 20;  
+    }
+      
+    // Variables 
+    mutable ComputationInfo m_info;
+    bool m_isInitialized;
+    bool m_factorizationIsOk;
+    bool m_analysisIsOk;
+    std::string m_lastError;
+    NCMatrix m_mat; // The input (permuted ) matrix 
+    SCMatrix m_Lstore; // The lower triangular matrix (supernodal)
+    MappedSparseMatrix<Scalar,ColMajor,Index> m_Ustore; // The upper triangular matrix
+    PermutationType m_perm_c; // Column permutation 
+    PermutationType m_perm_r ; // Row permutation
+    IndexVector m_etree; // Column elimination tree 
+    
+    typename Base::GlobalLU_t m_glu; 
+                               
+    // SparseLU options 
+    bool m_symmetricmode;
+    // values for performance 
+    internal::perfvalues<Index> m_perfv; 
+    RealScalar m_diagpivotthresh; // Specifies the threshold used for a diagonal entry to be an acceptable pivot
+    Index m_nnzL, m_nnzU; // Nonzeros in L and U factors 
+    Index m_detPermR; // Determinant of the coefficient matrix
+  private:
+    // Disable copy constructor 
+    SparseLU (const SparseLU& );
+  
+}; // End class SparseLU
+
+
+
+// Functions needed by the anaysis phase
+/** 
+  * Compute the column permutation to minimize the fill-in
+  * 
+  *  - Apply this permutation to the input matrix - 
+  * 
+  *  - Compute the column elimination tree on the permuted matrix 
+  * 
+  *  - Postorder the elimination tree and the column permutation
+  * 
+  */
+template <typename MatrixType, typename OrderingType>
+void SparseLU<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat)
+{
+  
+  //TODO  It is possible as in SuperLU to compute row and columns scaling vectors to equilibrate the matrix mat.
+  
+  OrderingType ord; 
+  ord(mat,m_perm_c);
+  
+  // Apply the permutation to the column of the input  matrix
+  //First copy the whole input matrix. 
+  m_mat = mat;
+  if (m_perm_c.size()) {
+    m_mat.uncompress(); //NOTE: The effect of this command is only to create the InnerNonzeros pointers. FIXME : This vector is filled but not subsequently used.  
+    //Then, permute only the column pointers
+    const Index * outerIndexPtr;
+    if (mat.isCompressed()) outerIndexPtr = mat.outerIndexPtr();
+    else
+    {
+      Index *outerIndexPtr_t = new Index[mat.cols()+1];
+      for(Index i = 0; i <= mat.cols(); i++) outerIndexPtr_t[i] = m_mat.outerIndexPtr()[i];
+      outerIndexPtr = outerIndexPtr_t;
+    }
+    for (Index i = 0; i < mat.cols(); i++)
+    {
+      m_mat.outerIndexPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i];
+      m_mat.innerNonZeroPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i+1] - outerIndexPtr[i];
+    }
+    if(!mat.isCompressed()) delete[] outerIndexPtr;
+  }
+  // Compute the column elimination tree of the permuted matrix 
+  IndexVector firstRowElt;
+  internal::coletree(m_mat, m_etree,firstRowElt); 
+     
+  // In symmetric mode, do not do postorder here
+  if (!m_symmetricmode) {
+    IndexVector post, iwork; 
+    // Post order etree
+    internal::treePostorder(m_mat.cols(), m_etree, post); 
+      
+   
+    // Renumber etree in postorder 
+    Index m = m_mat.cols(); 
+    iwork.resize(m+1);
+    for (Index i = 0; i < m; ++i) iwork(post(i)) = post(m_etree(i));
+    m_etree = iwork;
+    
+    // Postmultiply A*Pc by post, i.e reorder the matrix according to the postorder of the etree
+    PermutationType post_perm(m); 
+    for (Index i = 0; i < m; i++) 
+      post_perm.indices()(i) = post(i); 
+        
+    // Combine the two permutations : postorder the permutation for future use
+    if(m_perm_c.size()) {
+      m_perm_c = post_perm * m_perm_c;
+    }
+    
+  } // end postordering 
+  
+  m_analysisIsOk = true; 
+}
+
+// Functions needed by the numerical factorization phase
+
+
+/** 
+  *  - Numerical factorization 
+  *  - Interleaved with the symbolic factorization 
+  * On exit,  info is 
+  * 
+  *    = 0: successful factorization
+  * 
+  *    > 0: if info = i, and i is
+  * 
+  *       <= A->ncol: U(i,i) is exactly zero. The factorization has
+  *          been completed, but the factor U is exactly singular,
+  *          and division by zero will occur if it is used to solve a
+  *          system of equations.
+  * 
+  *       > A->ncol: number of bytes allocated when memory allocation
+  *         failure occurred, plus A->ncol. If lwork = -1, it is
+  *         the estimated amount of space needed, plus A->ncol.  
+  */
+template <typename MatrixType, typename OrderingType>
+void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
+{
+  using internal::emptyIdxLU;
+  eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); 
+  eigen_assert((matrix.rows() == matrix.cols()) && "Only for squared matrices");
+  
+  typedef typename IndexVector::Scalar Index; 
+  
+  
+  // Apply the column permutation computed in analyzepattern()
+  //   m_mat = matrix * m_perm_c.inverse(); 
+  m_mat = matrix;
+  if (m_perm_c.size()) 
+  {
+    m_mat.uncompress(); //NOTE: The effect of this command is only to create the InnerNonzeros pointers.
+    //Then, permute only the column pointers
+    const Index * outerIndexPtr;
+    if (matrix.isCompressed()) outerIndexPtr = matrix.outerIndexPtr();
+    else
+    {
+      Index* outerIndexPtr_t = new Index[matrix.cols()+1];
+      for(Index i = 0; i <= matrix.cols(); i++) outerIndexPtr_t[i] = m_mat.outerIndexPtr()[i];
+      outerIndexPtr = outerIndexPtr_t;
+    }
+    for (Index i = 0; i < matrix.cols(); i++)
+    {
+      m_mat.outerIndexPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i];
+      m_mat.innerNonZeroPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i+1] - outerIndexPtr[i];
+    }
+    if(!matrix.isCompressed()) delete[] outerIndexPtr;
+  } 
+  else 
+  { //FIXME This should not be needed if the empty permutation is handled transparently
+    m_perm_c.resize(matrix.cols());
+    for(Index i = 0; i < matrix.cols(); ++i) m_perm_c.indices()(i) = i;
+  }
+  
+  Index m = m_mat.rows();
+  Index n = m_mat.cols();
+  Index nnz = m_mat.nonZeros();
+  Index maxpanel = m_perfv.panel_size * m;
+  // Allocate working storage common to the factor routines
+  Index lwork = 0;
+  Index info = Base::memInit(m, n, nnz, lwork, m_perfv.fillfactor, m_perfv.panel_size, m_glu); 
+  if (info) 
+  {
+    m_lastError = "UNABLE TO ALLOCATE WORKING MEMORY\n\n" ;
+    m_factorizationIsOk = false;
+    return ; 
+  }
+  
+  // Set up pointers for integer working arrays 
+  IndexVector segrep(m); segrep.setZero();
+  IndexVector parent(m); parent.setZero();
+  IndexVector xplore(m); xplore.setZero();
+  IndexVector repfnz(maxpanel);
+  IndexVector panel_lsub(maxpanel);
+  IndexVector xprune(n); xprune.setZero();
+  IndexVector marker(m*internal::LUNoMarker); marker.setZero();
+  
+  repfnz.setConstant(-1); 
+  panel_lsub.setConstant(-1);
+  
+  // Set up pointers for scalar working arrays 
+  ScalarVector dense; 
+  dense.setZero(maxpanel);
+  ScalarVector tempv; 
+  tempv.setZero(internal::LUnumTempV(m, m_perfv.panel_size, m_perfv.maxsuper, /*m_perfv.rowblk*/m) );
+  
+  // Compute the inverse of perm_c
+  PermutationType iperm_c(m_perm_c.inverse()); 
+  
+  // Identify initial relaxed snodes
+  IndexVector relax_end(n);
+  if ( m_symmetricmode == true ) 
+    Base::heap_relax_snode(n, m_etree, m_perfv.relax, marker, relax_end);
+  else
+    Base::relax_snode(n, m_etree, m_perfv.relax, marker, relax_end);
+  
+  
+  m_perm_r.resize(m); 
+  m_perm_r.indices().setConstant(-1);
+  marker.setConstant(-1);
+  m_detPermR = 1; // Record the determinant of the row permutation
+  
+  m_glu.supno(0) = emptyIdxLU; m_glu.xsup.setConstant(0);
+  m_glu.xsup(0) = m_glu.xlsub(0) = m_glu.xusub(0) = m_glu.xlusup(0) = Index(0);
+  
+  // Work on one 'panel' at a time. A panel is one of the following :
+  //  (a) a relaxed supernode at the bottom of the etree, or
+  //  (b) panel_size contiguous columns, <panel_size> defined by the user
+  Index jcol; 
+  IndexVector panel_histo(n);
+  Index pivrow; // Pivotal row number in the original row matrix
+  Index nseg1; // Number of segments in U-column above panel row jcol
+  Index nseg; // Number of segments in each U-column 
+  Index irep; 
+  Index i, k, jj; 
+  for (jcol = 0; jcol < n; )
+  {
+    // Adjust panel size so that a panel won't overlap with the next relaxed snode. 
+    Index panel_size = m_perfv.panel_size; // upper bound on panel width
+    for (k = jcol + 1; k < (std::min)(jcol+panel_size, n); k++)
+    {
+      if (relax_end(k) != emptyIdxLU) 
+      {
+        panel_size = k - jcol; 
+        break; 
+      }
+    }
+    if (k == n) 
+      panel_size = n - jcol; 
+      
+    // Symbolic outer factorization on a panel of columns 
+    Base::panel_dfs(m, panel_size, jcol, m_mat, m_perm_r.indices(), nseg1, dense, panel_lsub, segrep, repfnz, xprune, marker, parent, xplore, m_glu); 
+    
+    // Numeric sup-panel updates in topological order 
+    Base::panel_bmod(m, panel_size, jcol, nseg1, dense, tempv, segrep, repfnz, m_glu); 
+    
+    // Sparse LU within the panel, and below the panel diagonal 
+    for ( jj = jcol; jj< jcol + panel_size; jj++) 
+    {
+      k = (jj - jcol) * m; // Column index for w-wide arrays 
+      
+      nseg = nseg1; // begin after all the panel segments
+      //Depth-first-search for the current column
+      VectorBlock<IndexVector> panel_lsubk(panel_lsub, k, m);
+      VectorBlock<IndexVector> repfnz_k(repfnz, k, m); 
+      info = Base::column_dfs(m, jj, m_perm_r.indices(), m_perfv.maxsuper, nseg, panel_lsubk, segrep, repfnz_k, xprune, marker, parent, xplore, m_glu); 
+      if ( info ) 
+      {
+        m_lastError =  "UNABLE TO EXPAND MEMORY IN COLUMN_DFS() ";
+        m_info = NumericalIssue; 
+        m_factorizationIsOk = false; 
+        return; 
+      }
+      // Numeric updates to this column 
+      VectorBlock<ScalarVector> dense_k(dense, k, m); 
+      VectorBlock<IndexVector> segrep_k(segrep, nseg1, m-nseg1); 
+      info = Base::column_bmod(jj, (nseg - nseg1), dense_k, tempv, segrep_k, repfnz_k, jcol, m_glu); 
+      if ( info ) 
+      {
+        m_lastError = "UNABLE TO EXPAND MEMORY IN COLUMN_BMOD() ";
+        m_info = NumericalIssue; 
+        m_factorizationIsOk = false; 
+        return; 
+      }
+      
+      // Copy the U-segments to ucol(*)
+      info = Base::copy_to_ucol(jj, nseg, segrep, repfnz_k ,m_perm_r.indices(), dense_k, m_glu); 
+      if ( info ) 
+      {
+        m_lastError = "UNABLE TO EXPAND MEMORY IN COPY_TO_UCOL() ";
+        m_info = NumericalIssue; 
+        m_factorizationIsOk = false; 
+        return; 
+      }
+      
+      // Form the L-segment 
+      info = Base::pivotL(jj, m_diagpivotthresh, m_perm_r.indices(), iperm_c.indices(), pivrow, m_glu);
+      if ( info ) 
+      {
+        m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR ... ZERO COLUMN AT ";
+        std::ostringstream returnInfo;
+        returnInfo << info; 
+        m_lastError += returnInfo.str();
+        m_info = NumericalIssue; 
+        m_factorizationIsOk = false; 
+        return; 
+      }
+      
+      // Update the determinant of the row permutation matrix
+      if (pivrow != jj) m_detPermR *= -1;
+
+      // Prune columns (0:jj-1) using column jj
+      Base::pruneL(jj, m_perm_r.indices(), pivrow, nseg, segrep, repfnz_k, xprune, m_glu); 
+      
+      // Reset repfnz for this column 
+      for (i = 0; i < nseg; i++)
+      {
+        irep = segrep(i); 
+        repfnz_k(irep) = emptyIdxLU; 
+      }
+    } // end SparseLU within the panel  
+    jcol += panel_size;  // Move to the next panel
+  } // end for -- end elimination 
+  
+  // Count the number of nonzeros in factors 
+  Base::countnz(n, m_nnzL, m_nnzU, m_glu); 
+  // Apply permutation  to the L subscripts 
+  Base::fixupL(n, m_perm_r.indices(), m_glu); 
+  
+  // Create supernode matrix L 
+  m_Lstore.setInfos(m, n, m_glu.lusup, m_glu.xlusup, m_glu.lsub, m_glu.xlsub, m_glu.supno, m_glu.xsup); 
+  // Create the column major upper sparse matrix  U; 
+  new (&m_Ustore) MappedSparseMatrix<Scalar, ColMajor, Index> ( m, n, m_nnzU, m_glu.xusub.data(), m_glu.usub.data(), m_glu.ucol.data() ); 
+  
+  m_info = Success;
+  m_factorizationIsOk = true;
+}
+
+template<typename MappedSupernodalType>
+struct SparseLUMatrixLReturnType : internal::no_assignment_operator
+{
+  typedef typename MappedSupernodalType::Index Index;
+  typedef typename MappedSupernodalType::Scalar Scalar;
+  SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL)
+  { }
+  Index rows() { return m_mapL.rows(); }
+  Index cols() { return m_mapL.cols(); }
+  template<typename Dest>
+  void solveInPlace( MatrixBase<Dest> &X) const
+  {
+    m_mapL.solveInPlace(X);
+  }
+  const MappedSupernodalType& m_mapL;
+};
+
+template<typename MatrixLType, typename MatrixUType>
+struct SparseLUMatrixUReturnType : internal::no_assignment_operator
+{
+  typedef typename MatrixLType::Index Index;
+  typedef typename MatrixLType::Scalar Scalar;
+  SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU)
+  : m_mapL(mapL),m_mapU(mapU)
+  { }
+  Index rows() { return m_mapL.rows(); }
+  Index cols() { return m_mapL.cols(); }
+
+  template<typename Dest>   void solveInPlace(MatrixBase<Dest> &X) const
+  {
+    Index nrhs = X.cols();
+    Index n = X.rows();
+    // Backward solve with U
+    for (Index k = m_mapL.nsuper(); k >= 0; k--)
+    {
+      Index fsupc = m_mapL.supToCol()[k];
+      Index lda = m_mapL.colIndexPtr()[fsupc+1] - m_mapL.colIndexPtr()[fsupc]; // leading dimension
+      Index nsupc = m_mapL.supToCol()[k+1] - fsupc;
+      Index luptr = m_mapL.colIndexPtr()[fsupc];
+
+      if (nsupc == 1)
+      {
+        for (Index j = 0; j < nrhs; j++)
+        {
+          X(fsupc, j) /= m_mapL.valuePtr()[luptr];
+        }
+      }
+      else
+      {
+        Map<const Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
+        Map< Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+        U = A.template triangularView<Upper>().solve(U);
+      }
+
+      for (Index j = 0; j < nrhs; ++j)
+      {
+        for (Index jcol = fsupc; jcol < fsupc + nsupc; jcol++)
+        {
+          typename MatrixUType::InnerIterator it(m_mapU, jcol);
+          for ( ; it; ++it)
+          {
+            Index irow = it.index();
+            X(irow, j) -= X(jcol, j) * it.value();
+          }
+        }
+      }
+    } // End For U-solve
+  }
+  const MatrixLType& m_mapL;
+  const MatrixUType& m_mapU;
+};
+
+namespace internal {
+  
+template<typename _MatrixType, typename Derived, typename Rhs>
+struct solve_retval<SparseLU<_MatrixType,Derived>, Rhs>
+  : solve_retval_base<SparseLU<_MatrixType,Derived>, Rhs>
+{
+  typedef SparseLU<_MatrixType,Derived> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+template<typename _MatrixType, typename Derived, typename Rhs>
+struct sparse_solve_retval<SparseLU<_MatrixType,Derived>, Rhs>
+  : sparse_solve_retval_base<SparseLU<_MatrixType,Derived>, Rhs>
+{
+  typedef SparseLU<_MatrixType,Derived> Dec;
+  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    this->defaultEvalTo(dst);
+  }
+};
+} // end namespace internal
+
+} // End namespace Eigen 
+
+#endif
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLUImpl.h b/include/eigen3/Eigen/src/SparseLU/SparseLUImpl.h
new file mode 100644
index 0000000..14d7089
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLUImpl.h
@@ -0,0 +1,64 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef SPARSELU_IMPL_H
+#define SPARSELU_IMPL_H
+
+namespace Eigen {
+namespace internal {
+  
+/** \ingroup SparseLU_Module
+  * \class SparseLUImpl
+  * Base class for sparseLU
+  */
+template <typename Scalar, typename Index>
+class SparseLUImpl
+{
+  public:
+    typedef Matrix<Scalar,Dynamic,1> ScalarVector;
+    typedef Matrix<Index,Dynamic,1> IndexVector; 
+    typedef typename ScalarVector::RealScalar RealScalar; 
+    typedef Ref<Matrix<Scalar,Dynamic,1> > BlockScalarVector;
+    typedef Ref<Matrix<Index,Dynamic,1> > BlockIndexVector;
+    typedef LU_GlobalLU_t<IndexVector, ScalarVector> GlobalLU_t; 
+    typedef SparseMatrix<Scalar,ColMajor,Index> MatrixType; 
+    
+  protected:
+     template <typename VectorType>
+     Index expand(VectorType& vec, Index& length, Index nbElts, Index keep_prev, Index& num_expansions);
+     Index memInit(Index m, Index n, Index annz, Index lwork, Index fillratio, Index panel_size,  GlobalLU_t& glu); 
+     template <typename VectorType>
+     Index memXpand(VectorType& vec, Index& maxlen, Index nbElts, MemType memtype, Index& num_expansions);
+     void heap_relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end); 
+     void relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end); 
+     Index snode_dfs(const Index jcol, const Index kcol,const MatrixType& mat,  IndexVector& xprune, IndexVector& marker, GlobalLU_t& glu); 
+     Index snode_bmod (const Index jcol, const Index fsupc, ScalarVector& dense, GlobalLU_t& glu);
+     Index pivotL(const Index jcol, const RealScalar& diagpivotthresh, IndexVector& perm_r, IndexVector& iperm_c, Index& pivrow, GlobalLU_t& glu);
+     template <typename Traits>
+     void dfs_kernel(const Index jj, IndexVector& perm_r,
+                    Index& nseg, IndexVector& panel_lsub, IndexVector& segrep,
+                    Ref<IndexVector> repfnz_col, IndexVector& xprune, Ref<IndexVector> marker, IndexVector& parent,
+                    IndexVector& xplore, GlobalLU_t& glu, Index& nextl_col, Index krow, Traits& traits);
+     void panel_dfs(const Index m, const Index w, const Index jcol, MatrixType& A, IndexVector& perm_r, Index& nseg, ScalarVector& dense, IndexVector& panel_lsub, IndexVector& segrep, IndexVector& repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu);
+    
+     void panel_bmod(const Index m, const Index w, const Index jcol, const Index nseg, ScalarVector& dense, ScalarVector& tempv, IndexVector& segrep, IndexVector& repfnz, GlobalLU_t& glu);
+     Index column_dfs(const Index m, const Index jcol, IndexVector& perm_r, Index maxsuper, Index& nseg,  BlockIndexVector lsub_col, IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu);
+     Index column_bmod(const Index jcol, const Index nseg, BlockScalarVector dense, ScalarVector& tempv, BlockIndexVector segrep, BlockIndexVector repfnz, Index fpanelc, GlobalLU_t& glu); 
+     Index copy_to_ucol(const Index jcol, const Index nseg, IndexVector& segrep, BlockIndexVector repfnz ,IndexVector& perm_r, BlockScalarVector dense, GlobalLU_t& glu); 
+     void pruneL(const Index jcol, const IndexVector& perm_r, const Index pivrow, const Index nseg, const IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, GlobalLU_t& glu);
+     void countnz(const Index n, Index& nnzL, Index& nnzU, GlobalLU_t& glu); 
+     void fixupL(const Index n, const IndexVector& perm_r, GlobalLU_t& glu); 
+     
+     template<typename , typename >
+     friend struct column_dfs_traits;
+}; 
+
+} // end namespace internal
+} // namespace Eigen
+
+#endif
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_Memory.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_Memory.h
new file mode 100644
index 0000000..1ffa7d5
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_Memory.h
@@ -0,0 +1,227 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* 
+ 
+ * NOTE: This file is the modified version of [s,d,c,z]memory.c files in SuperLU 
+ 
+ * -- SuperLU routine (version 3.1) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * August 1, 2008
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ */
+
+#ifndef EIGEN_SPARSELU_MEMORY
+#define EIGEN_SPARSELU_MEMORY
+
+namespace Eigen {
+namespace internal {
+  
+enum { LUNoMarker = 3 };
+enum {emptyIdxLU = -1};
+template<typename Index>
+inline Index LUnumTempV(Index& m, Index& w, Index& t, Index& b)
+{
+  return (std::max)(m, (t+b)*w);
+}
+
+template< typename Scalar, typename Index>
+inline Index LUTempSpace(Index&m, Index& w)
+{
+  return (2*w + 4 + LUNoMarker) * m * sizeof(Index) + (w + 1) * m * sizeof(Scalar);
+}
+
+
+
+
+/** 
+  * Expand the existing storage to accomodate more fill-ins
+  * \param vec Valid pointer to the vector to allocate or expand
+  * \param[in,out] length  At input, contain the current length of the vector that is to be increased. At output, length of the newly allocated vector
+  * \param[in] nbElts Current number of elements in the factors
+  * \param keep_prev  1: use length  and do not expand the vector; 0: compute new_len and expand
+  * \param[in,out] num_expansions Number of times the memory has been expanded
+  */
+template <typename Scalar, typename Index>
+template <typename VectorType>
+Index  SparseLUImpl<Scalar,Index>::expand(VectorType& vec, Index& length, Index nbElts, Index keep_prev, Index& num_expansions) 
+{
+  
+  float alpha = 1.5; // Ratio of the memory increase 
+  Index new_len; // New size of the allocated memory
+  
+  if(num_expansions == 0 || keep_prev) 
+    new_len = length ; // First time allocate requested
+  else 
+    new_len = (std::max)(length+1,Index(alpha * length));
+  
+  VectorType old_vec; // Temporary vector to hold the previous values   
+  if (nbElts > 0 )
+    old_vec = vec.segment(0,nbElts); 
+  
+  //Allocate or expand the current vector
+#ifdef EIGEN_EXCEPTIONS
+  try
+#endif
+  {
+    vec.resize(new_len); 
+  }
+#ifdef EIGEN_EXCEPTIONS
+  catch(std::bad_alloc& )
+#else
+  if(!vec.size())
+#endif
+  {
+    if (!num_expansions)
+    {
+      // First time to allocate from LUMemInit()
+      // Let LUMemInit() deals with it.
+      return -1;
+    }
+    if (keep_prev)
+    {
+      // In this case, the memory length should not not be reduced
+      return new_len;
+    }
+    else 
+    {
+      // Reduce the size and increase again 
+      Index tries = 0; // Number of attempts
+      do 
+      {
+        alpha = (alpha + 1)/2;
+        new_len = (std::max)(length+1,Index(alpha * length));
+#ifdef EIGEN_EXCEPTIONS
+        try
+#endif
+        {
+          vec.resize(new_len); 
+        }
+#ifdef EIGEN_EXCEPTIONS
+        catch(std::bad_alloc& )
+#else
+        if (!vec.size())
+#endif
+        {
+          tries += 1; 
+          if ( tries > 10) return new_len; 
+        }
+      } while (!vec.size());
+    }
+  }
+  //Copy the previous values to the newly allocated space 
+  if (nbElts > 0)
+    vec.segment(0, nbElts) = old_vec;   
+   
+  
+  length  = new_len;
+  if(num_expansions) ++num_expansions;
+  return 0; 
+}
+
+/**
+ * \brief  Allocate various working space for the numerical factorization phase.
+ * \param m number of rows of the input matrix 
+ * \param n number of columns 
+ * \param annz number of initial nonzeros in the matrix 
+ * \param lwork  if lwork=-1, this routine returns an estimated size of the required memory
+ * \param glu persistent data to facilitate multiple factors : will be deleted later ??
+ * \param fillratio estimated ratio of fill in the factors
+ * \param panel_size Size of a panel
+ * \return an estimated size of the required memory if lwork = -1; otherwise, return the size of actually allocated memory when allocation failed, and 0 on success
+ * \note Unlike SuperLU, this routine does not support successive factorization with the same pattern and the same row permutation
+ */
+template <typename Scalar, typename Index>
+Index SparseLUImpl<Scalar,Index>::memInit(Index m, Index n, Index annz, Index lwork, Index fillratio, Index panel_size,  GlobalLU_t& glu)
+{
+  Index& num_expansions = glu.num_expansions; //No memory expansions so far
+  num_expansions = 0;
+  glu.nzumax = glu.nzlumax = (std::min)(fillratio * annz / n, m) * n; // estimated number of nonzeros in U 
+  glu.nzlmax = (std::max)(Index(4), fillratio) * annz / 4; // estimated  nnz in L factor
+  // Return the estimated size to the user if necessary
+  Index tempSpace;
+  tempSpace = (2*panel_size + 4 + LUNoMarker) * m * sizeof(Index) + (panel_size + 1) * m * sizeof(Scalar);
+  if (lwork == emptyIdxLU) 
+  {
+    Index estimated_size;
+    estimated_size = (5 * n + 5) * sizeof(Index)  + tempSpace
+                    + (glu.nzlmax + glu.nzumax) * sizeof(Index) + (glu.nzlumax+glu.nzumax) *  sizeof(Scalar) + n; 
+    return estimated_size;
+  }
+  
+  // Setup the required space 
+  
+  // First allocate Integer pointers for L\U factors
+  glu.xsup.resize(n+1);
+  glu.supno.resize(n+1);
+  glu.xlsub.resize(n+1);
+  glu.xlusup.resize(n+1);
+  glu.xusub.resize(n+1);
+
+  // Reserve memory for L/U factors
+  do 
+  {
+    if(     (expand<ScalarVector>(glu.lusup, glu.nzlumax, 0, 0, num_expansions)<0)
+        ||  (expand<ScalarVector>(glu.ucol,  glu.nzumax,  0, 0, num_expansions)<0)
+        ||  (expand<IndexVector> (glu.lsub,  glu.nzlmax,  0, 0, num_expansions)<0)
+        ||  (expand<IndexVector> (glu.usub,  glu.nzumax,  0, 1, num_expansions)<0) )
+    {
+      //Reduce the estimated size and retry
+      glu.nzlumax /= 2;
+      glu.nzumax /= 2;
+      glu.nzlmax /= 2;
+      if (glu.nzlumax < annz ) return glu.nzlumax; 
+    }
+  } while (!glu.lusup.size() || !glu.ucol.size() || !glu.lsub.size() || !glu.usub.size());
+  
+  ++num_expansions;
+  return 0;
+  
+} // end LuMemInit
+
+/** 
+ * \brief Expand the existing storage 
+ * \param vec vector to expand 
+ * \param[in,out] maxlen On input, previous size of vec (Number of elements to copy ). on output, new size
+ * \param nbElts current number of elements in the vector.
+ * \param memtype Type of the element to expand
+ * \param num_expansions Number of expansions 
+ * \return 0 on success, > 0 size of the memory allocated so far
+ */
+template <typename Scalar, typename Index>
+template <typename VectorType>
+Index SparseLUImpl<Scalar,Index>::memXpand(VectorType& vec, Index& maxlen, Index nbElts, MemType memtype, Index& num_expansions)
+{
+  Index failed_size; 
+  if (memtype == USUB)
+     failed_size = this->expand<VectorType>(vec, maxlen, nbElts, 1, num_expansions);
+  else
+    failed_size = this->expand<VectorType>(vec, maxlen, nbElts, 0, num_expansions);
+
+  if (failed_size)
+    return failed_size; 
+  
+  return 0 ;  
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+#endif // EIGEN_SPARSELU_MEMORY
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_Structs.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_Structs.h
new file mode 100644
index 0000000..24d6bf1
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_Structs.h
@@ -0,0 +1,111 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* 
+ * NOTE: This file comes from a partly modified version of files slu_[s,d,c,z]defs.h
+ * -- SuperLU routine (version 4.1) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November, 2010
+ * 
+ * Global data structures used in LU factorization -
+ * 
+ *   nsuper: #supernodes = nsuper + 1, numbered [0, nsuper].
+ *   (xsup,supno): supno[i] is the supernode no to which i belongs;
+ *  xsup(s) points to the beginning of the s-th supernode.
+ *  e.g.   supno 0 1 2 2 3 3 3 4 4 4 4 4   (n=12)
+ *          xsup 0 1 2 4 7 12
+ *  Note: dfs will be performed on supernode rep. relative to the new 
+ *        row pivoting ordering
+ *
+ *   (xlsub,lsub): lsub[*] contains the compressed subscript of
+ *  rectangular supernodes; xlsub[j] points to the starting
+ *  location of the j-th column in lsub[*]. Note that xlsub 
+ *  is indexed by column.
+ *  Storage: original row subscripts
+ *
+ *      During the course of sparse LU factorization, we also use
+ *  (xlsub,lsub) for the purpose of symmetric pruning. For each
+ *  supernode {s,s+1,...,t=s+r} with first column s and last
+ *  column t, the subscript set
+ *    lsub[j], j=xlsub[s], .., xlsub[s+1]-1
+ *  is the structure of column s (i.e. structure of this supernode).
+ *  It is used for the storage of numerical values.
+ *  Furthermore,
+ *    lsub[j], j=xlsub[t], .., xlsub[t+1]-1
+ *  is the structure of the last column t of this supernode.
+ *  It is for the purpose of symmetric pruning. Therefore, the
+ *  structural subscripts can be rearranged without making physical
+ *  interchanges among the numerical values.
+ *
+ *  However, if the supernode has only one column, then we
+ *  only keep one set of subscripts. For any subscript interchange
+ *  performed, similar interchange must be done on the numerical
+ *  values.
+ *
+ *  The last column structures (for pruning) will be removed
+ *  after the numercial LU factorization phase.
+ *
+ *   (xlusup,lusup): lusup[*] contains the numerical values of the
+ *  rectangular supernodes; xlusup[j] points to the starting
+ *  location of the j-th column in storage vector lusup[*]
+ *  Note: xlusup is indexed by column.
+ *  Each rectangular supernode is stored by column-major
+ *  scheme, consistent with Fortran 2-dim array storage.
+ *
+ *   (xusub,ucol,usub): ucol[*] stores the numerical values of
+ *  U-columns outside the rectangular supernodes. The row
+ *  subscript of nonzero ucol[k] is stored in usub[k].
+ *  xusub[i] points to the starting location of column i in ucol.
+ *  Storage: new row subscripts; that is subscripts of PA.
+ */
+
+#ifndef EIGEN_LU_STRUCTS
+#define EIGEN_LU_STRUCTS
+namespace Eigen {
+namespace internal {
+  
+typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType; 
+
+template <typename IndexVector, typename ScalarVector>
+struct LU_GlobalLU_t {
+  typedef typename IndexVector::Scalar Index; 
+  IndexVector xsup; //First supernode column ... xsup(s) points to the beginning of the s-th supernode
+  IndexVector supno; // Supernode number corresponding to this column (column to supernode mapping)
+  ScalarVector  lusup; // nonzero values of L ordered by columns 
+  IndexVector lsub; // Compressed row indices of L rectangular supernodes. 
+  IndexVector xlusup; // pointers to the beginning of each column in lusup
+  IndexVector xlsub; // pointers to the beginning of each column in lsub
+  Index   nzlmax; // Current max size of lsub
+  Index   nzlumax; // Current max size of lusup
+  ScalarVector  ucol; // nonzero values of U ordered by columns 
+  IndexVector usub; // row indices of U columns in ucol
+  IndexVector xusub; // Pointers to the beginning of each column of U in ucol 
+  Index   nzumax; // Current max size of ucol
+  Index   n; // Number of columns in the matrix  
+  Index   num_expansions; 
+};
+
+// Values to set for performance
+template <typename Index>
+struct perfvalues {
+  Index panel_size; // a panel consists of at most <panel_size> consecutive columns
+  Index relax; // To control degree of relaxing supernodes. If the number of nodes (columns) 
+                // in a subtree of the elimination tree is less than relax, this subtree is considered 
+                // as one supernode regardless of the row structures of those columns
+  Index maxsuper; // The maximum size for a supernode in complete LU
+  Index rowblk; // The minimum row dimension for 2-D blocking to be used;
+  Index colblk; // The minimum column dimension for 2-D blocking to be used;
+  Index fillfactor; // The estimated fills factors for L and U, compared with A
+}; 
+
+} // end namespace internal
+
+} // end namespace Eigen
+#endif // EIGEN_LU_STRUCTS
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
new file mode 100644
index 0000000..b16afd6
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
@@ -0,0 +1,298 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSELU_SUPERNODAL_MATRIX_H
+#define EIGEN_SPARSELU_SUPERNODAL_MATRIX_H
+
+namespace Eigen {
+namespace internal {
+
+/** \ingroup SparseLU_Module
+ * \brief a class to manipulate the L supernodal factor from the SparseLU factorization
+ * 
+ * This class  contain the data to easily store 
+ * and manipulate the supernodes during the factorization and solution phase of Sparse LU. 
+ * Only the lower triangular matrix has supernodes.
+ * 
+ * NOTE : This class corresponds to the SCformat structure in SuperLU
+ * 
+ */
+/* TODO
+ * InnerIterator as for sparsematrix 
+ * SuperInnerIterator to iterate through all supernodes 
+ * Function for triangular solve
+ */
+template <typename _Scalar, typename _Index>
+class MappedSuperNodalMatrix
+{
+  public:
+    typedef _Scalar Scalar; 
+    typedef _Index Index;
+    typedef Matrix<Index,Dynamic,1> IndexVector; 
+    typedef Matrix<Scalar,Dynamic,1> ScalarVector;
+  public:
+    MappedSuperNodalMatrix()
+    {
+      
+    }
+    MappedSuperNodalMatrix(Index m, Index n,  ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind, 
+             IndexVector& rowind_colptr, IndexVector& col_to_sup, IndexVector& sup_to_col )
+    {
+      setInfos(m, n, nzval, nzval_colptr, rowind, rowind_colptr, col_to_sup, sup_to_col);
+    }
+    
+    ~MappedSuperNodalMatrix()
+    {
+      
+    }
+    /**
+     * Set appropriate pointers for the lower triangular supernodal matrix
+     * These infos are available at the end of the numerical factorization
+     * FIXME This class will be modified such that it can be use in the course 
+     * of the factorization.
+     */
+    void setInfos(Index m, Index n, ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind, 
+             IndexVector& rowind_colptr, IndexVector& col_to_sup, IndexVector& sup_to_col )
+    {
+      m_row = m;
+      m_col = n; 
+      m_nzval = nzval.data(); 
+      m_nzval_colptr = nzval_colptr.data(); 
+      m_rowind = rowind.data(); 
+      m_rowind_colptr = rowind_colptr.data(); 
+      m_nsuper = col_to_sup(n); 
+      m_col_to_sup = col_to_sup.data(); 
+      m_sup_to_col = sup_to_col.data(); 
+    }
+    
+    /**
+     * Number of rows
+     */
+    Index rows() { return m_row; }
+    
+    /**
+     * Number of columns
+     */
+    Index cols() { return m_col; }
+    
+    /**
+     * Return the array of nonzero values packed by column
+     * 
+     * The size is nnz
+     */
+    Scalar* valuePtr() {  return m_nzval; }
+    
+    const Scalar* valuePtr() const 
+    {
+      return m_nzval; 
+    }
+    /**
+     * Return the pointers to the beginning of each column in \ref valuePtr()
+     */
+    Index* colIndexPtr()
+    {
+      return m_nzval_colptr; 
+    }
+    
+    const Index* colIndexPtr() const
+    {
+      return m_nzval_colptr; 
+    }
+    
+    /**
+     * Return the array of compressed row indices of all supernodes
+     */
+    Index* rowIndex()  { return m_rowind; }
+    
+    const Index* rowIndex() const
+    {
+      return m_rowind; 
+    }
+    
+    /**
+     * Return the location in \em rowvaluePtr() which starts each column
+     */
+    Index* rowIndexPtr() { return m_rowind_colptr; }
+    
+    const Index* rowIndexPtr() const 
+    {
+      return m_rowind_colptr; 
+    }
+    
+    /** 
+     * Return the array of column-to-supernode mapping 
+     */
+    Index* colToSup()  { return m_col_to_sup; }
+    
+    const Index* colToSup() const
+    {
+      return m_col_to_sup;       
+    }
+    /**
+     * Return the array of supernode-to-column mapping
+     */
+    Index* supToCol() { return m_sup_to_col; }
+    
+    const Index* supToCol() const 
+    {
+      return m_sup_to_col;
+    }
+    
+    /**
+     * Return the number of supernodes
+     */
+    Index nsuper() const 
+    {
+      return m_nsuper; 
+    }
+    
+    class InnerIterator; 
+    template<typename Dest>
+    void solveInPlace( MatrixBase<Dest>&X) const;
+    
+      
+      
+    
+  protected:
+    Index m_row; // Number of rows
+    Index m_col; // Number of columns 
+    Index m_nsuper; // Number of supernodes 
+    Scalar* m_nzval; //array of nonzero values packed by column
+    Index* m_nzval_colptr; //nzval_colptr[j] Stores the location in nzval[] which starts column j 
+    Index* m_rowind; // Array of compressed row indices of rectangular supernodes
+    Index* m_rowind_colptr; //rowind_colptr[j] stores the location in rowind[] which starts column j
+    Index* m_col_to_sup; // col_to_sup[j] is the supernode number to which column j belongs
+    Index* m_sup_to_col; //sup_to_col[s] points to the starting column of the s-th supernode
+    
+  private :
+};
+
+/**
+  * \brief InnerIterator class to iterate over nonzero values of the current column in the supernodal matrix L
+  * 
+  */
+template<typename Scalar, typename Index>
+class MappedSuperNodalMatrix<Scalar,Index>::InnerIterator
+{
+  public:
+     InnerIterator(const MappedSuperNodalMatrix& mat, Index outer)
+      : m_matrix(mat),
+        m_outer(outer), 
+        m_supno(mat.colToSup()[outer]),
+        m_idval(mat.colIndexPtr()[outer]),
+        m_startidval(m_idval),
+        m_endidval(mat.colIndexPtr()[outer+1]),
+        m_idrow(mat.rowIndexPtr()[mat.supToCol()[mat.colToSup()[outer]]]),
+        m_endidrow(mat.rowIndexPtr()[mat.supToCol()[mat.colToSup()[outer]]+1])
+    {}
+    inline InnerIterator& operator++()
+    { 
+      m_idval++; 
+      m_idrow++;
+      return *this;
+    }
+    inline Scalar value() const { return m_matrix.valuePtr()[m_idval]; }
+    
+    inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_idval]); }
+    
+    inline Index index() const { return m_matrix.rowIndex()[m_idrow]; }
+    inline Index row() const { return index(); }
+    inline Index col() const { return m_outer; }
+    
+    inline Index supIndex() const { return m_supno; }
+    
+    inline operator bool() const 
+    { 
+      return ( (m_idval < m_endidval) && (m_idval >= m_startidval)
+                && (m_idrow < m_endidrow) );
+    }
+    
+  protected:
+    const MappedSuperNodalMatrix& m_matrix; // Supernodal lower triangular matrix 
+    const Index m_outer;                    // Current column 
+    const Index m_supno;                    // Current SuperNode number
+    Index m_idval;                          // Index to browse the values in the current column
+    const Index m_startidval;               // Start of the column value
+    const Index m_endidval;                 // End of the column value
+    Index m_idrow;                          // Index to browse the row indices 
+    Index m_endidrow;                       // End index of row indices of the current column
+};
+
+/**
+ * \brief Solve with the supernode triangular matrix
+ * 
+ */
+template<typename Scalar, typename Index>
+template<typename Dest>
+void MappedSuperNodalMatrix<Scalar,Index>::solveInPlace( MatrixBase<Dest>&X) const
+{
+    Index n = X.rows(); 
+    Index nrhs = X.cols(); 
+    const Scalar * Lval = valuePtr();                 // Nonzero values 
+    Matrix<Scalar,Dynamic,Dynamic> work(n, nrhs);     // working vector
+    work.setZero();
+    for (Index k = 0; k <= nsuper(); k ++)
+    {
+      Index fsupc = supToCol()[k];                    // First column of the current supernode 
+      Index istart = rowIndexPtr()[fsupc];            // Pointer index to the subscript of the current column
+      Index nsupr = rowIndexPtr()[fsupc+1] - istart;  // Number of rows in the current supernode
+      Index nsupc = supToCol()[k+1] - fsupc;          // Number of columns in the current supernode
+      Index nrow = nsupr - nsupc;                     // Number of rows in the non-diagonal part of the supernode
+      Index irow;                                     //Current index row
+      
+      if (nsupc == 1 )
+      {
+        for (Index j = 0; j < nrhs; j++)
+        {
+          InnerIterator it(*this, fsupc);
+          ++it; // Skip the diagonal element
+          for (; it; ++it)
+          {
+            irow = it.row();
+            X(irow, j) -= X(fsupc, j) * it.value();
+          }
+        }
+      }
+      else
+      {
+        // The supernode has more than one column 
+        Index luptr = colIndexPtr()[fsupc]; 
+        Index lda = colIndexPtr()[fsupc+1] - luptr;
+        
+        // Triangular solve 
+        Map<const Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) ); 
+        Map< Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); 
+        U = A.template triangularView<UnitLower>().solve(U); 
+        
+        // Matrix-vector product 
+        new (&A) Map<const Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); 
+        work.block(0, 0, nrow, nrhs) = A * U; 
+        
+        //Begin Scatter 
+        for (Index j = 0; j < nrhs; j++)
+        {
+          Index iptr = istart + nsupc; 
+          for (Index i = 0; i < nrow; i++)
+          {
+            irow = rowIndex()[iptr]; 
+            X(irow, j) -= work(i, j); // Scatter operation
+            work(i, j) = Scalar(0); 
+            iptr++;
+          }
+        }
+      }
+    } 
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSELU_MATRIX_H
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_Utils.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_Utils.h
new file mode 100644
index 0000000..15352ac
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_Utils.h
@@ -0,0 +1,80 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_SPARSELU_UTILS_H
+#define EIGEN_SPARSELU_UTILS_H
+
+namespace Eigen {
+namespace internal {
+
+/**
+ * \brief Count Nonzero elements in the factors
+ */
+template <typename Scalar, typename Index>
+void SparseLUImpl<Scalar,Index>::countnz(const Index n, Index& nnzL, Index& nnzU, GlobalLU_t& glu)
+{
+ nnzL = 0; 
+ nnzU = (glu.xusub)(n); 
+ Index nsuper = (glu.supno)(n); 
+ Index jlen; 
+ Index i, j, fsupc;
+ if (n <= 0 ) return; 
+ // For each supernode
+ for (i = 0; i <= nsuper; i++)
+ {
+   fsupc = glu.xsup(i); 
+   jlen = glu.xlsub(fsupc+1) - glu.xlsub(fsupc); 
+   
+   for (j = fsupc; j < glu.xsup(i+1); j++)
+   {
+     nnzL += jlen; 
+     nnzU += j - fsupc + 1; 
+     jlen--; 
+   }
+ }
+}
+
+/**
+ * \brief Fix up the data storage lsub for L-subscripts. 
+ * 
+ * It removes the subscripts sets for structural pruning, 
+ * and applies permutation to the remaining subscripts
+ * 
+ */
+template <typename Scalar, typename Index>
+void SparseLUImpl<Scalar,Index>::fixupL(const Index n, const IndexVector& perm_r, GlobalLU_t& glu)
+{
+  Index fsupc, i, j, k, jstart; 
+  
+  Index nextl = 0; 
+  Index nsuper = (glu.supno)(n); 
+  
+  // For each supernode 
+  for (i = 0; i <= nsuper; i++)
+  {
+    fsupc = glu.xsup(i); 
+    jstart = glu.xlsub(fsupc); 
+    glu.xlsub(fsupc) = nextl; 
+    for (j = jstart; j < glu.xlsub(fsupc + 1); j++)
+    {
+      glu.lsub(nextl) = perm_r(glu.lsub(j)); // Now indexed into P*A
+      nextl++;
+    }
+    for (k = fsupc+1; k < glu.xsup(i+1); k++)
+      glu.xlsub(k) = nextl; // other columns in supernode i
+  }
+  
+  glu.xlsub(n) = nextl; 
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+#endif // EIGEN_SPARSELU_UTILS_H
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_column_bmod.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_column_bmod.h
new file mode 100644
index 0000000..f24bd87
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_column_bmod.h
@@ -0,0 +1,180 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* 
+ 
+ * NOTE: This file is the modified version of xcolumn_bmod.c file in SuperLU 
+ 
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ */
+#ifndef SPARSELU_COLUMN_BMOD_H
+#define SPARSELU_COLUMN_BMOD_H
+
+namespace Eigen {
+
+namespace internal {
+/**
+ * \brief Performs numeric block updates (sup-col) in topological order
+ * 
+ * \param jcol current column to update
+ * \param nseg Number of segments in the U part
+ * \param dense Store the full representation of the column
+ * \param tempv working array 
+ * \param segrep segment representative ...
+ * \param repfnz ??? First nonzero column in each row ???  ...
+ * \param fpanelc First column in the current panel
+ * \param glu Global LU data. 
+ * \return 0 - successful return 
+ *         > 0 - number of bytes allocated when run out of space
+ * 
+ */
+template <typename Scalar, typename Index>
+Index SparseLUImpl<Scalar,Index>::column_bmod(const Index jcol, const Index nseg, BlockScalarVector dense, ScalarVector& tempv, BlockIndexVector segrep, BlockIndexVector repfnz, Index fpanelc, GlobalLU_t& glu)
+{
+  Index  jsupno, k, ksub, krep, ksupno; 
+  Index lptr, nrow, isub, irow, nextlu, new_next, ufirst; 
+  Index fsupc, nsupc, nsupr, luptr, kfnz, no_zeros; 
+  /* krep = representative of current k-th supernode
+    * fsupc =  first supernodal column
+    * nsupc = number of columns in a supernode
+    * nsupr = number of rows in a supernode
+    * luptr = location of supernodal LU-block in storage
+    * kfnz = first nonz in the k-th supernodal segment
+    * no_zeros = no lf leading zeros in a supernodal U-segment
+    */
+  
+  jsupno = glu.supno(jcol);
+  // For each nonzero supernode segment of U[*,j] in topological order 
+  k = nseg - 1; 
+  Index d_fsupc; // distance between the first column of the current panel and the 
+               // first column of the current snode
+  Index fst_col; // First column within small LU update
+  Index segsize; 
+  for (ksub = 0; ksub < nseg; ksub++)
+  {
+    krep = segrep(k); k--; 
+    ksupno = glu.supno(krep); 
+    if (jsupno != ksupno )
+    {
+      // outside the rectangular supernode 
+      fsupc = glu.xsup(ksupno); 
+      fst_col = (std::max)(fsupc, fpanelc); 
+      
+      // Distance from the current supernode to the current panel; 
+      // d_fsupc = 0 if fsupc > fpanelc
+      d_fsupc = fst_col - fsupc; 
+      
+      luptr = glu.xlusup(fst_col) + d_fsupc; 
+      lptr = glu.xlsub(fsupc) + d_fsupc; 
+      
+      kfnz = repfnz(krep); 
+      kfnz = (std::max)(kfnz, fpanelc); 
+      
+      segsize = krep - kfnz + 1; 
+      nsupc = krep - fst_col + 1; 
+      nsupr = glu.xlsub(fsupc+1) - glu.xlsub(fsupc); 
+      nrow = nsupr - d_fsupc - nsupc;
+      Index lda = glu.xlusup(fst_col+1) - glu.xlusup(fst_col);
+      
+      
+      // Perform a triangular solver and block update, 
+      // then scatter the result of sup-col update to dense
+      no_zeros = kfnz - fst_col; 
+      if(segsize==1)
+        LU_kernel_bmod<1>::run(segsize, dense, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros);
+      else
+        LU_kernel_bmod<Dynamic>::run(segsize, dense, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros);
+    } // end if jsupno 
+  } // end for each segment
+  
+  // Process the supernodal portion of  L\U[*,j]
+  nextlu = glu.xlusup(jcol); 
+  fsupc = glu.xsup(jsupno);
+  
+  // copy the SPA dense into L\U[*,j]
+  Index mem; 
+  new_next = nextlu + glu.xlsub(fsupc + 1) - glu.xlsub(fsupc); 
+  Index offset = internal::first_multiple<Index>(new_next, internal::packet_traits<Scalar>::size) - new_next;
+  if(offset)
+    new_next += offset;
+  while (new_next > glu.nzlumax )
+  {
+    mem = memXpand<ScalarVector>(glu.lusup, glu.nzlumax, nextlu, LUSUP, glu.num_expansions);  
+    if (mem) return mem; 
+  }
+  
+  for (isub = glu.xlsub(fsupc); isub < glu.xlsub(fsupc+1); isub++)
+  {
+    irow = glu.lsub(isub);
+    glu.lusup(nextlu) = dense(irow);
+    dense(irow) = Scalar(0.0); 
+    ++nextlu; 
+  }
+  
+  if(offset)
+  {
+    glu.lusup.segment(nextlu,offset).setZero();
+    nextlu += offset;
+  }
+  glu.xlusup(jcol + 1) = nextlu;  // close L\U(*,jcol); 
+  
+  /* For more updates within the panel (also within the current supernode),
+   * should start from the first column of the panel, or the first column
+   * of the supernode, whichever is bigger. There are two cases:
+   *  1) fsupc < fpanelc, then fst_col <-- fpanelc
+   *  2) fsupc >= fpanelc, then fst_col <-- fsupc
+   */
+  fst_col = (std::max)(fsupc, fpanelc); 
+  
+  if (fst_col  < jcol)
+  {
+    // Distance between the current supernode and the current panel
+    // d_fsupc = 0 if fsupc >= fpanelc
+    d_fsupc = fst_col - fsupc; 
+    
+    lptr = glu.xlsub(fsupc) + d_fsupc; 
+    luptr = glu.xlusup(fst_col) + d_fsupc; 
+    nsupr = glu.xlsub(fsupc+1) - glu.xlsub(fsupc); // leading dimension
+    nsupc = jcol - fst_col; // excluding jcol 
+    nrow = nsupr - d_fsupc - nsupc; 
+    
+    // points to the beginning of jcol in snode L\U(jsupno) 
+    ufirst = glu.xlusup(jcol) + d_fsupc; 
+    Index lda = glu.xlusup(jcol+1) - glu.xlusup(jcol);
+    Map<Matrix<Scalar,Dynamic,Dynamic>, 0,  OuterStride<> > A( &(glu.lusup.data()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); 
+    VectorBlock<ScalarVector> u(glu.lusup, ufirst, nsupc); 
+    u = A.template triangularView<UnitLower>().solve(u); 
+    
+    new (&A) Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > ( &(glu.lusup.data()[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); 
+    VectorBlock<ScalarVector> l(glu.lusup, ufirst+nsupc, nrow); 
+    l.noalias() -= A * u;
+    
+  } // End if fst_col
+  return 0; 
+}
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // SPARSELU_COLUMN_BMOD_H
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_column_dfs.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_column_dfs.h
new file mode 100644
index 0000000..4c04b0e
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_column_dfs.h
@@ -0,0 +1,177 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* 
+ 
+ * NOTE: This file is the modified version of [s,d,c,z]column_dfs.c file in SuperLU 
+ 
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ */
+#ifndef SPARSELU_COLUMN_DFS_H
+#define SPARSELU_COLUMN_DFS_H
+
+template <typename Scalar, typename Index> class SparseLUImpl;
+namespace Eigen {
+
+namespace internal {
+
+template<typename IndexVector, typename ScalarVector>
+struct column_dfs_traits : no_assignment_operator
+{
+  typedef typename ScalarVector::Scalar Scalar;
+  typedef typename IndexVector::Scalar Index;
+  column_dfs_traits(Index jcol, Index& jsuper, typename SparseLUImpl<Scalar, Index>::GlobalLU_t& glu, SparseLUImpl<Scalar, Index>& luImpl)
+   : m_jcol(jcol), m_jsuper_ref(jsuper), m_glu(glu), m_luImpl(luImpl)
+ {}
+  bool update_segrep(Index /*krep*/, Index /*jj*/)
+  {
+    return true;
+  }
+  void mem_expand(IndexVector& lsub, Index& nextl, Index chmark)
+  {
+    if (nextl >= m_glu.nzlmax)
+      m_luImpl.memXpand(lsub, m_glu.nzlmax, nextl, LSUB, m_glu.num_expansions); 
+    if (chmark != (m_jcol-1)) m_jsuper_ref = emptyIdxLU;
+  }
+  enum { ExpandMem = true };
+  
+  Index m_jcol;
+  Index& m_jsuper_ref;
+  typename SparseLUImpl<Scalar, Index>::GlobalLU_t& m_glu;
+  SparseLUImpl<Scalar, Index>& m_luImpl;
+};
+
+
+/**
+ * \brief Performs a symbolic factorization on column jcol and decide the supernode boundary
+ * 
+ * A supernode representative is the last column of a supernode.
+ * The nonzeros in U[*,j] are segments that end at supernodes representatives. 
+ * The routine returns a list of the supernodal representatives 
+ * in topological order of the dfs that generates them. 
+ * The location of the first nonzero in each supernodal segment 
+ * (supernodal entry location) is also returned. 
+ * 
+ * \param m number of rows in the matrix
+ * \param jcol Current column 
+ * \param perm_r Row permutation
+ * \param maxsuper  Maximum number of column allowed in a supernode
+ * \param [in,out] nseg Number of segments in current U[*,j] - new segments appended
+ * \param lsub_col defines the rhs vector to start the dfs
+ * \param [in,out] segrep Segment representatives - new segments appended 
+ * \param repfnz  First nonzero location in each row
+ * \param xprune 
+ * \param marker  marker[i] == jj, if i was visited during dfs of current column jj;
+ * \param parent
+ * \param xplore working array
+ * \param glu global LU data 
+ * \return 0 success
+ *         > 0 number of bytes allocated when run out of space
+ * 
+ */
+template <typename Scalar, typename Index>
+Index SparseLUImpl<Scalar,Index>::column_dfs(const Index m, const Index jcol, IndexVector& perm_r, Index maxsuper, Index& nseg,  BlockIndexVector lsub_col, IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
+{
+  
+  Index jsuper = glu.supno(jcol); 
+  Index nextl = glu.xlsub(jcol); 
+  VectorBlock<IndexVector> marker2(marker, 2*m, m); 
+  
+  
+  column_dfs_traits<IndexVector, ScalarVector> traits(jcol, jsuper, glu, *this);
+  
+  // For each nonzero in A(*,jcol) do dfs 
+  for (Index k = 0; ((k < m) ? lsub_col[k] != emptyIdxLU : false) ; k++)
+  {
+    Index krow = lsub_col(k); 
+    lsub_col(k) = emptyIdxLU; 
+    Index kmark = marker2(krow); 
+    
+    // krow was visited before, go to the next nonz; 
+    if (kmark == jcol) continue;
+    
+    dfs_kernel(jcol, perm_r, nseg, glu.lsub, segrep, repfnz, xprune, marker2, parent,
+                   xplore, glu, nextl, krow, traits);
+  } // for each nonzero ... 
+  
+  Index fsupc, jptr, jm1ptr, ito, ifrom, istop;
+  Index nsuper = glu.supno(jcol);
+  Index jcolp1 = jcol + 1;
+  Index jcolm1 = jcol - 1;
+  
+  // check to see if j belongs in the same supernode as j-1
+  if ( jcol == 0 )
+  { // Do nothing for column 0 
+    nsuper = glu.supno(0) = 0 ;
+  }
+  else 
+  {
+    fsupc = glu.xsup(nsuper); 
+    jptr = glu.xlsub(jcol); // Not yet compressed
+    jm1ptr = glu.xlsub(jcolm1); 
+    
+    // Use supernodes of type T2 : see SuperLU paper
+    if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = emptyIdxLU;
+    
+    // Make sure the number of columns in a supernode doesn't
+    // exceed threshold
+    if ( (jcol - fsupc) >= maxsuper) jsuper = emptyIdxLU; 
+    
+    /* If jcol starts a new supernode, reclaim storage space in
+     * glu.lsub from previous supernode. Note we only store 
+     * the subscript set of the first and last columns of 
+     * a supernode. (first for num values, last for pruning)
+     */
+    if (jsuper == emptyIdxLU)
+    { // starts a new supernode 
+      if ( (fsupc < jcolm1-1) ) 
+      { // >= 3 columns in nsuper
+        ito = glu.xlsub(fsupc+1);
+        glu.xlsub(jcolm1) = ito; 
+        istop = ito + jptr - jm1ptr; 
+        xprune(jcolm1) = istop; // intialize xprune(jcol-1)
+        glu.xlsub(jcol) = istop; 
+        
+        for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
+          glu.lsub(ito) = glu.lsub(ifrom); 
+        nextl = ito;  // = istop + length(jcol)
+      }
+      nsuper++; 
+      glu.supno(jcol) = nsuper; 
+    } // if a new supernode 
+  } // end else:  jcol > 0
+  
+  // Tidy up the pointers before exit
+  glu.xsup(nsuper+1) = jcolp1; 
+  glu.supno(jcolp1) = nsuper; 
+  xprune(jcol) = nextl;  // Intialize upper bound for pruning
+  glu.xlsub(jcolp1) = nextl; 
+  
+  return 0; 
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
new file mode 100644
index 0000000..170610d
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
@@ -0,0 +1,106 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+/* 
+ 
+ * NOTE: This file is the modified version of [s,d,c,z]copy_to_ucol.c file in SuperLU 
+ 
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ */
+#ifndef SPARSELU_COPY_TO_UCOL_H
+#define SPARSELU_COPY_TO_UCOL_H
+
+namespace Eigen {
+namespace internal {
+
+/**
+ * \brief Performs numeric block updates (sup-col) in topological order
+ * 
+ * \param jcol current column to update
+ * \param nseg Number of segments in the U part
+ * \param segrep segment representative ...
+ * \param repfnz First nonzero column in each row  ...
+ * \param perm_r Row permutation 
+ * \param dense Store the full representation of the column
+ * \param glu Global LU data. 
+ * \return 0 - successful return 
+ *         > 0 - number of bytes allocated when run out of space
+ * 
+ */
+template <typename Scalar, typename Index>
+Index SparseLUImpl<Scalar,Index>::copy_to_ucol(const Index jcol, const Index nseg, IndexVector& segrep, BlockIndexVector repfnz ,IndexVector& perm_r, BlockScalarVector dense, GlobalLU_t& glu)
+{  
+  Index ksub, krep, ksupno; 
+    
+  Index jsupno = glu.supno(jcol);
+  
+  // For each nonzero supernode segment of U[*,j] in topological order 
+  Index k = nseg - 1, i; 
+  Index nextu = glu.xusub(jcol); 
+  Index kfnz, isub, segsize; 
+  Index new_next,irow; 
+  Index fsupc, mem; 
+  for (ksub = 0; ksub < nseg; ksub++)
+  {
+    krep = segrep(k); k--; 
+    ksupno = glu.supno(krep); 
+    if (jsupno != ksupno ) // should go into ucol(); 
+    {
+      kfnz = repfnz(krep); 
+      if (kfnz != emptyIdxLU)
+      { // Nonzero U-segment 
+        fsupc = glu.xsup(ksupno); 
+        isub = glu.xlsub(fsupc) + kfnz - fsupc; 
+        segsize = krep - kfnz + 1; 
+        new_next = nextu + segsize; 
+        while (new_next > glu.nzumax) 
+        {
+          mem = memXpand<ScalarVector>(glu.ucol, glu.nzumax, nextu, UCOL, glu.num_expansions); 
+          if (mem) return mem; 
+          mem = memXpand<IndexVector>(glu.usub, glu.nzumax, nextu, USUB, glu.num_expansions); 
+          if (mem) return mem; 
+          
+        }
+        
+        for (i = 0; i < segsize; i++)
+        {
+          irow = glu.lsub(isub); 
+          glu.usub(nextu) = perm_r(irow); // Unlike the L part, the U part is stored in its final order
+          glu.ucol(nextu) = dense(irow); 
+          dense(irow) = Scalar(0.0); 
+          nextu++;
+          isub++;
+        }
+        
+      } // end nonzero U-segment 
+      
+    } // end if jsupno 
+    
+  } // end for each segment
+  glu.xusub(jcol + 1) = nextu; // close U(*,jcol)
+  return 0; 
+}
+
+} // namespace internal
+} // end namespace Eigen
+
+#endif // SPARSELU_COPY_TO_UCOL_H
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
new file mode 100644
index 0000000..9e4e3e7
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
@@ -0,0 +1,279 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSELU_GEMM_KERNEL_H
+#define EIGEN_SPARSELU_GEMM_KERNEL_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+/** \internal
+  * A general matrix-matrix product kernel optimized for the SparseLU factorization.
+  *  - A, B, and C must be column major
+  *  - lda and ldc must be multiples of the respective packet size
+  *  - C must have the same alignment as A
+  */
+template<typename Scalar,typename Index>
+EIGEN_DONT_INLINE
+void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const Scalar* B, Index ldb, Scalar* C, Index ldc)
+{
+  using namespace Eigen::internal;
+  
+  typedef typename packet_traits<Scalar>::type Packet;
+  enum {
+    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+    PacketSize = packet_traits<Scalar>::size,
+    PM = 8,                             // peeling in M
+    RN = 2,                             // register blocking
+    RK = NumberOfRegisters>=16 ? 4 : 2, // register blocking
+    BM = 4096/sizeof(Scalar),           // number of rows of A-C per chunk
+    SM = PM*PacketSize                  // step along M
+  };
+  Index d_end = (d/RK)*RK;    // number of columns of A (rows of B) suitable for full register blocking
+  Index n_end = (n/RN)*RN;    // number of columns of B-C suitable for processing RN columns at once
+  Index i0 = internal::first_aligned(A,m);
+  
+  eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_aligned(C,m)));
+  
+  // handle the non aligned rows of A and C without any optimization:
+  for(Index i=0; i<i0; ++i)
+  {
+    for(Index j=0; j<n; ++j)
+    {
+      Scalar c = C[i+j*ldc];
+      for(Index k=0; k<d; ++k)
+        c += B[k+j*ldb] * A[i+k*lda];
+      C[i+j*ldc] = c;
+    }
+  }
+  // process the remaining rows per chunk of BM rows
+  for(Index ib=i0; ib<m; ib+=BM)
+  {
+    Index actual_b = std::min<Index>(BM, m-ib);                 // actual number of rows
+    Index actual_b_end1 = (actual_b/SM)*SM;                   // actual number of rows suitable for peeling
+    Index actual_b_end2 = (actual_b/PacketSize)*PacketSize;   // actual number of rows suitable for vectorization
+    
+    // Let's process two columns of B-C at once
+    for(Index j=0; j<n_end; j+=RN)
+    {
+      const Scalar* Bc0 = B+(j+0)*ldb;
+      const Scalar* Bc1 = B+(j+1)*ldb;
+      
+      for(Index k=0; k<d_end; k+=RK)
+      {
+        
+        // load and expand a RN x RK block of B
+        Packet b00, b10, b20, b30, b01, b11, b21, b31;
+                  b00 = pset1<Packet>(Bc0[0]);
+                  b10 = pset1<Packet>(Bc0[1]);
+        if(RK==4) b20 = pset1<Packet>(Bc0[2]);
+        if(RK==4) b30 = pset1<Packet>(Bc0[3]);
+                  b01 = pset1<Packet>(Bc1[0]);
+                  b11 = pset1<Packet>(Bc1[1]);
+        if(RK==4) b21 = pset1<Packet>(Bc1[2]);
+        if(RK==4) b31 = pset1<Packet>(Bc1[3]);
+        
+        Packet a0, a1, a2, a3, c0, c1, t0, t1;
+        
+        const Scalar* A0 = A+ib+(k+0)*lda;
+        const Scalar* A1 = A+ib+(k+1)*lda;
+        const Scalar* A2 = A+ib+(k+2)*lda;
+        const Scalar* A3 = A+ib+(k+3)*lda;
+        
+        Scalar* C0 = C+ib+(j+0)*ldc;
+        Scalar* C1 = C+ib+(j+1)*ldc;
+        
+                  a0 = pload<Packet>(A0);
+                  a1 = pload<Packet>(A1);
+        if(RK==4)
+        {
+          a2 = pload<Packet>(A2);
+          a3 = pload<Packet>(A3);
+        }
+        else
+        {
+          // workaround "may be used uninitialized in this function" warning
+          a2 = a3 = a0;
+        }
+        
+#define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);}
+#define WORK(I)  \
+                    c0 = pload<Packet>(C0+i+(I)*PacketSize);   \
+                    c1 = pload<Packet>(C1+i+(I)*PacketSize);   \
+                    KMADD(c0, a0, b00, t0)      \
+                    KMADD(c1, a0, b01, t1)      \
+                    a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \
+                    KMADD(c0, a1, b10, t0)      \
+                    KMADD(c1, a1, b11, t1)       \
+                    a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \
+          if(RK==4) KMADD(c0, a2, b20, t0)       \
+          if(RK==4) KMADD(c1, a2, b21, t1)       \
+          if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \
+          if(RK==4) KMADD(c0, a3, b30, t0)       \
+          if(RK==4) KMADD(c1, a3, b31, t1)       \
+          if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \
+                    pstore(C0+i+(I)*PacketSize, c0);           \
+                    pstore(C1+i+(I)*PacketSize, c1)
+        
+        // process rows of A' - C' with aggressive vectorization and peeling 
+        for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
+        {
+          EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL1");
+                    prefetch((A0+i+(5)*PacketSize));
+                    prefetch((A1+i+(5)*PacketSize));
+          if(RK==4) prefetch((A2+i+(5)*PacketSize));
+          if(RK==4) prefetch((A3+i+(5)*PacketSize));
+                    WORK(0);
+                    WORK(1);
+                    WORK(2);
+                    WORK(3);
+                    WORK(4);
+                    WORK(5);
+                    WORK(6);
+                    WORK(7);
+        }
+        // process the remaining rows with vectorization only
+        for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
+        {
+          WORK(0);
+        }
+#undef WORK
+        // process the remaining rows without vectorization
+        for(Index i=actual_b_end2; i<actual_b; ++i)
+        {
+          if(RK==4)
+          {
+            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3];
+            C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1]+A2[i]*Bc1[2]+A3[i]*Bc1[3];
+          }
+          else
+          {
+            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1];
+            C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1];
+          }
+        }
+        
+        Bc0 += RK;
+        Bc1 += RK;
+      } // peeled loop on k
+    } // peeled loop on the columns j
+    // process the last column (we now perform a matrux-vector product)
+    if((n-n_end)>0)
+    {
+      const Scalar* Bc0 = B+(n-1)*ldb;
+      
+      for(Index k=0; k<d_end; k+=RK)
+      {
+        
+        // load and expand a 1 x RK block of B
+        Packet b00, b10, b20, b30;
+                  b00 = pset1<Packet>(Bc0[0]);
+                  b10 = pset1<Packet>(Bc0[1]);
+        if(RK==4) b20 = pset1<Packet>(Bc0[2]);
+        if(RK==4) b30 = pset1<Packet>(Bc0[3]);
+        
+        Packet a0, a1, a2, a3, c0, t0/*, t1*/;
+        
+        const Scalar* A0 = A+ib+(k+0)*lda;
+        const Scalar* A1 = A+ib+(k+1)*lda;
+        const Scalar* A2 = A+ib+(k+2)*lda;
+        const Scalar* A3 = A+ib+(k+3)*lda;
+        
+        Scalar* C0 = C+ib+(n_end)*ldc;
+        
+                  a0 = pload<Packet>(A0);
+                  a1 = pload<Packet>(A1);
+        if(RK==4)
+        {
+          a2 = pload<Packet>(A2);
+          a3 = pload<Packet>(A3);
+        }
+        else
+        {
+          // workaround "may be used uninitialized in this function" warning
+          a2 = a3 = a0;
+        }
+        
+#define WORK(I) \
+                  c0 = pload<Packet>(C0+i+(I)*PacketSize);   \
+                  KMADD(c0, a0, b00, t0)       \
+                  a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \
+                  KMADD(c0, a1, b10, t0)       \
+                  a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \
+        if(RK==4) KMADD(c0, a2, b20, t0)       \
+        if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \
+        if(RK==4) KMADD(c0, a3, b30, t0)       \
+        if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \
+                  pstore(C0+i+(I)*PacketSize, c0);
+        
+        // agressive vectorization and peeling
+        for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
+        {
+          EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL2");
+          WORK(0);
+          WORK(1);
+          WORK(2);
+          WORK(3);
+          WORK(4);
+          WORK(5);
+          WORK(6);
+          WORK(7);
+        }
+        // vectorization only
+        for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
+        {
+          WORK(0);
+        }
+        // remaining scalars
+        for(Index i=actual_b_end2; i<actual_b; ++i)
+        {
+          if(RK==4) 
+            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3];
+          else
+            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1];
+        }
+        
+        Bc0 += RK;
+#undef WORK
+      }
+    }
+    
+    // process the last columns of A, corresponding to the last rows of B
+    Index rd = d-d_end;
+    if(rd>0)
+    {
+      for(Index j=0; j<n; ++j)
+      {
+        enum {
+          Alignment = PacketSize>1 ? Aligned : 0
+        };
+        typedef Map<Matrix<Scalar,Dynamic,1>, Alignment > MapVector;
+        typedef Map<const Matrix<Scalar,Dynamic,1>, Alignment > ConstMapVector;
+        if(rd==1)       MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b);
+        
+        else if(rd==2)  MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b)
+                                                        + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b);
+        
+        else            MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b)
+                                                        + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b)
+                                                        + B[2+d_end+j*ldb] * ConstMapVector(A+(d_end+2)*lda+ib, actual_b);
+      }
+    }
+  
+  } // blocking on the rows of A and C
+}
+#undef KMADD
+
+} // namespace internal
+
+} // namespace Eigen
+
+#endif // EIGEN_SPARSELU_GEMM_KERNEL_H
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
new file mode 100644
index 0000000..7a4e430
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
@@ -0,0 +1,127 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* This file is a modified version of heap_relax_snode.c file in SuperLU
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ */
+
+#ifndef SPARSELU_HEAP_RELAX_SNODE_H
+#define SPARSELU_HEAP_RELAX_SNODE_H
+
+namespace Eigen {
+namespace internal {
+
+/** 
+ * \brief Identify the initial relaxed supernodes
+ * 
+ * This routine applied to a symmetric elimination tree. 
+ * It assumes that the matrix has been reordered according to the postorder of the etree
+ * \param n The number of columns
+ * \param et elimination tree 
+ * \param relax_columns Maximum number of columns allowed in a relaxed snode 
+ * \param descendants Number of descendants of each node in the etree
+ * \param relax_end last column in a supernode
+ */
+template <typename Scalar, typename Index>
+void SparseLUImpl<Scalar,Index>::heap_relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
+{
+  
+  // The etree may not be postordered, but its heap ordered  
+  IndexVector post;
+  internal::treePostorder(n, et, post); // Post order etree
+  IndexVector inv_post(n+1); 
+  Index i;
+  for (i = 0; i < n+1; ++i) inv_post(post(i)) = i; // inv_post = post.inverse()???
+  
+  // Renumber etree in postorder 
+  IndexVector iwork(n);
+  IndexVector et_save(n+1);
+  for (i = 0; i < n; ++i)
+  {
+    iwork(post(i)) = post(et(i));
+  }
+  et_save = et; // Save the original etree
+  et = iwork; 
+  
+  // compute the number of descendants of each node in the etree
+  relax_end.setConstant(emptyIdxLU);
+  Index j, parent; 
+  descendants.setZero();
+  for (j = 0; j < n; j++) 
+  {
+    parent = et(j);
+    if (parent != n) // not the dummy root
+      descendants(parent) += descendants(j) + 1;
+  }
+  // Identify the relaxed supernodes by postorder traversal of the etree
+  Index snode_start; // beginning of a snode 
+  Index k;
+  Index nsuper_et_post = 0; // Number of relaxed snodes in postordered etree 
+  Index nsuper_et = 0; // Number of relaxed snodes in the original etree 
+  Index l; 
+  for (j = 0; j < n; )
+  {
+    parent = et(j);
+    snode_start = j; 
+    while ( parent != n && descendants(parent) < relax_columns ) 
+    {
+      j = parent; 
+      parent = et(j);
+    }
+    // Found a supernode in postordered etree, j is the last column 
+    ++nsuper_et_post;
+    k = n;
+    for (i = snode_start; i <= j; ++i)
+      k = (std::min)(k, inv_post(i));
+    l = inv_post(j);
+    if ( (l - k) == (j - snode_start) )  // Same number of columns in the snode
+    {
+      // This is also a supernode in the original etree
+      relax_end(k) = l; // Record last column 
+      ++nsuper_et; 
+    }
+    else 
+    {
+      for (i = snode_start; i <= j; ++i) 
+      {
+        l = inv_post(i);
+        if (descendants(i) == 0) 
+        {
+          relax_end(l) = l;
+          ++nsuper_et;
+        }
+      }
+    }
+    j++;
+    // Search for a new leaf
+    while (descendants(j) != 0 && j < n) j++;
+  } // End postorder traversal of the etree
+  
+  // Recover the original etree
+  et = et_save; 
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+#endif // SPARSELU_HEAP_RELAX_SNODE_H
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
new file mode 100644
index 0000000..0d0283b
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef SPARSELU_KERNEL_BMOD_H
+#define SPARSELU_KERNEL_BMOD_H
+
+namespace Eigen {
+namespace internal {
+  
+/**
+ * \brief Performs numeric block updates from a given supernode to a single column
+ * 
+ * \param segsize Size of the segment (and blocks ) to use for updates
+ * \param[in,out] dense Packed values of the original matrix
+ * \param tempv temporary vector to use for updates
+ * \param lusup array containing the supernodes
+ * \param lda Leading dimension in the supernode
+ * \param nrow Number of rows in the rectangular part of the supernode
+ * \param lsub compressed row subscripts of supernodes
+ * \param lptr pointer to the first column of the current supernode in lsub
+ * \param no_zeros Number of nonzeros elements before the diagonal part of the supernode
+ * \return 0 on success
+ */
+template <int SegSizeAtCompileTime> struct LU_kernel_bmod
+{
+  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
+  static EIGEN_DONT_INLINE void run(const int segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
+                                    const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros);
+};
+
+template <int SegSizeAtCompileTime>
+template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
+EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const int segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
+                                                                  const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros)
+{
+  typedef typename ScalarVector::Scalar Scalar;
+  // First, copy U[*,j] segment from dense(*) to tempv(*)
+  // The result of triangular solve is in tempv[*]; 
+    // The result of matric-vector update is in dense[*]
+  Index isub = lptr + no_zeros; 
+  int i;
+  Index irow;
+  for (i = 0; i < ((SegSizeAtCompileTime==Dynamic)?segsize:SegSizeAtCompileTime); i++)
+  {
+    irow = lsub(isub); 
+    tempv(i) = dense(irow); 
+    ++isub; 
+  }
+  // Dense triangular solve -- start effective triangle
+  luptr += lda * no_zeros + no_zeros; 
+  // Form Eigen matrix and vector 
+  Map<Matrix<Scalar,SegSizeAtCompileTime,SegSizeAtCompileTime>, 0, OuterStride<> > A( &(lusup.data()[luptr]), segsize, segsize, OuterStride<>(lda) );
+  Map<Matrix<Scalar,SegSizeAtCompileTime,1> > u(tempv.data(), segsize);
+  
+  u = A.template triangularView<UnitLower>().solve(u); 
+  
+  // Dense matrix-vector product y <-- B*x 
+  luptr += segsize;
+  const Index PacketSize = internal::packet_traits<Scalar>::size;
+  Index ldl = internal::first_multiple(nrow, PacketSize);
+  Map<Matrix<Scalar,Dynamic,SegSizeAtCompileTime>, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) );
+  Index aligned_offset = internal::first_aligned(tempv.data()+segsize, PacketSize);
+  Index aligned_with_B_offset = (PacketSize-internal::first_aligned(B.data(), PacketSize))%PacketSize;
+  Map<Matrix<Scalar,Dynamic,1>, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) );
+  
+  l.setZero();
+  internal::sparselu_gemm<Scalar>(l.rows(), l.cols(), B.cols(), B.data(), B.outerStride(), u.data(), u.outerStride(), l.data(), l.outerStride());
+  
+  // Scatter tempv[] into SPA dense[] as a temporary storage 
+  isub = lptr + no_zeros;
+  for (i = 0; i < ((SegSizeAtCompileTime==Dynamic)?segsize:SegSizeAtCompileTime); i++)
+  {
+    irow = lsub(isub++); 
+    dense(irow) = tempv(i);
+  }
+  
+  // Scatter l into SPA dense[]
+  for (i = 0; i < nrow; i++)
+  {
+    irow = lsub(isub++); 
+    dense(irow) -= l(i);
+  } 
+}
+
+template <> struct LU_kernel_bmod<1>
+{
+  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
+  static EIGEN_DONT_INLINE void run(const int /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
+                                    const Index lda, const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros);
+};
+
+
+template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
+EIGEN_DONT_INLINE void LU_kernel_bmod<1>::run(const int /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
+                                              const Index lda, const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros)
+{
+  typedef typename ScalarVector::Scalar Scalar;
+  Scalar f = dense(lsub(lptr + no_zeros));
+  luptr += lda * no_zeros + no_zeros + 1;
+  const Scalar* a(lusup.data() + luptr);
+  const /*typename IndexVector::Scalar*/Index*  irow(lsub.data()+lptr + no_zeros + 1);
+  Index i = 0;
+  for (; i+1 < nrow; i+=2)
+  {
+    Index i0 = *(irow++);
+    Index i1 = *(irow++);
+    Scalar a0 = *(a++);
+    Scalar a1 = *(a++);
+    Scalar d0 = dense.coeff(i0);
+    Scalar d1 = dense.coeff(i1);
+    d0 -= f*a0;
+    d1 -= f*a1;
+    dense.coeffRef(i0) = d0;
+    dense.coeffRef(i1) = d1;
+  }
+  if(i<nrow)
+    dense.coeffRef(*(irow++)) -= f * *(a++);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+#endif // SPARSELU_KERNEL_BMOD_H
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_panel_bmod.h
new file mode 100644
index 0000000..da0e0fc
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_panel_bmod.h
@@ -0,0 +1,223 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* 
+ 
+ * NOTE: This file is the modified version of [s,d,c,z]panel_bmod.c file in SuperLU 
+ 
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ */
+#ifndef SPARSELU_PANEL_BMOD_H
+#define SPARSELU_PANEL_BMOD_H
+
+namespace Eigen {
+namespace internal {
+
+/**
+ * \brief Performs numeric block updates (sup-panel) in topological order.
+ * 
+ * Before entering this routine, the original nonzeros in the panel
+ * were already copied i nto the spa[m,w]
+ * 
+ * \param m number of rows in the matrix
+ * \param w Panel size
+ * \param jcol Starting  column of the panel
+ * \param nseg Number of segments in the U part
+ * \param dense Store the full representation of the panel 
+ * \param tempv working array 
+ * \param segrep segment representative... first row in the segment
+ * \param repfnz First nonzero rows
+ * \param glu Global LU data. 
+ * 
+ * 
+ */
+template <typename Scalar, typename Index>
+void SparseLUImpl<Scalar,Index>::panel_bmod(const Index m, const Index w, const Index jcol, 
+                                            const Index nseg, ScalarVector& dense, ScalarVector& tempv,
+                                            IndexVector& segrep, IndexVector& repfnz, GlobalLU_t& glu)
+{
+  
+  Index ksub,jj,nextl_col; 
+  Index fsupc, nsupc, nsupr, nrow; 
+  Index krep, kfnz; 
+  Index lptr; // points to the row subscripts of a supernode 
+  Index luptr; // ...
+  Index segsize,no_zeros ; 
+  // For each nonz supernode segment of U[*,j] in topological order
+  Index k = nseg - 1; 
+  const Index PacketSize = internal::packet_traits<Scalar>::size;
+  
+  for (ksub = 0; ksub < nseg; ksub++)
+  { // For each updating supernode
+    /* krep = representative of current k-th supernode
+     * fsupc =  first supernodal column
+     * nsupc = number of columns in a supernode
+     * nsupr = number of rows in a supernode
+     */
+    krep = segrep(k); k--; 
+    fsupc = glu.xsup(glu.supno(krep)); 
+    nsupc = krep - fsupc + 1; 
+    nsupr = glu.xlsub(fsupc+1) - glu.xlsub(fsupc); 
+    nrow = nsupr - nsupc; 
+    lptr = glu.xlsub(fsupc); 
+    
+    // loop over the panel columns to detect the actual number of columns and rows
+    Index u_rows = 0;
+    Index u_cols = 0;
+    for (jj = jcol; jj < jcol + w; jj++)
+    {
+      nextl_col = (jj-jcol) * m; 
+      VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m); // First nonzero column index for each row
+      
+      kfnz = repfnz_col(krep); 
+      if ( kfnz == emptyIdxLU ) 
+        continue; // skip any zero segment
+      
+      segsize = krep - kfnz + 1;
+      u_cols++;
+      u_rows = (std::max)(segsize,u_rows);
+    }
+    
+    if(nsupc >= 2)
+    { 
+      Index ldu = internal::first_multiple<Index>(u_rows, PacketSize);
+      Map<Matrix<Scalar,Dynamic,Dynamic>, Aligned,  OuterStride<> > U(tempv.data(), u_rows, u_cols, OuterStride<>(ldu));
+      
+      // gather U
+      Index u_col = 0;
+      for (jj = jcol; jj < jcol + w; jj++)
+      {
+        nextl_col = (jj-jcol) * m; 
+        VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m); // First nonzero column index for each row
+        VectorBlock<ScalarVector> dense_col(dense, nextl_col, m); // Scatter/gather entire matrix column from/to here
+        
+        kfnz = repfnz_col(krep); 
+        if ( kfnz == emptyIdxLU ) 
+          continue; // skip any zero segment
+        
+        segsize = krep - kfnz + 1;
+        luptr = glu.xlusup(fsupc);    
+        no_zeros = kfnz - fsupc; 
+        
+        Index isub = lptr + no_zeros;
+        Index off = u_rows-segsize;
+        for (Index i = 0; i < off; i++) U(i,u_col) = 0;
+        for (Index i = 0; i < segsize; i++)
+        {
+          Index irow = glu.lsub(isub); 
+          U(i+off,u_col) = dense_col(irow); 
+          ++isub; 
+        }
+        u_col++;
+      }
+      // solve U = A^-1 U
+      luptr = glu.xlusup(fsupc);
+      Index lda = glu.xlusup(fsupc+1) - glu.xlusup(fsupc);
+      no_zeros = (krep - u_rows + 1) - fsupc;
+      luptr += lda * no_zeros + no_zeros;
+      Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > A(glu.lusup.data()+luptr, u_rows, u_rows, OuterStride<>(lda) );
+      U = A.template triangularView<UnitLower>().solve(U);
+      
+      // update
+      luptr += u_rows;
+      Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > B(glu.lusup.data()+luptr, nrow, u_rows, OuterStride<>(lda) );
+      eigen_assert(tempv.size()>w*ldu + nrow*w + 1);
+      
+      Index ldl = internal::first_multiple<Index>(nrow, PacketSize);
+      Index offset = (PacketSize-internal::first_aligned(B.data(), PacketSize)) % PacketSize;
+      Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl));
+      
+      L.setZero();
+      internal::sparselu_gemm<Scalar>(L.rows(), L.cols(), B.cols(), B.data(), B.outerStride(), U.data(), U.outerStride(), L.data(), L.outerStride());
+      
+      // scatter U and L
+      u_col = 0;
+      for (jj = jcol; jj < jcol + w; jj++)
+      {
+        nextl_col = (jj-jcol) * m; 
+        VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m); // First nonzero column index for each row
+        VectorBlock<ScalarVector> dense_col(dense, nextl_col, m); // Scatter/gather entire matrix column from/to here
+        
+        kfnz = repfnz_col(krep); 
+        if ( kfnz == emptyIdxLU ) 
+          continue; // skip any zero segment
+        
+        segsize = krep - kfnz + 1;
+        no_zeros = kfnz - fsupc; 
+        Index isub = lptr + no_zeros;
+        
+        Index off = u_rows-segsize;
+        for (Index i = 0; i < segsize; i++)
+        {
+          Index irow = glu.lsub(isub++); 
+          dense_col(irow) = U.coeff(i+off,u_col);
+          U.coeffRef(i+off,u_col) = 0;
+        }
+        
+        // Scatter l into SPA dense[]
+        for (Index i = 0; i < nrow; i++)
+        {
+          Index irow = glu.lsub(isub++); 
+          dense_col(irow) -= L.coeff(i,u_col);
+          L.coeffRef(i,u_col) = 0;
+        }
+        u_col++;
+      }
+    }
+    else // level 2 only
+    {
+      // Sequence through each column in the panel
+      for (jj = jcol; jj < jcol + w; jj++)
+      {
+        nextl_col = (jj-jcol) * m; 
+        VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m); // First nonzero column index for each row
+        VectorBlock<ScalarVector> dense_col(dense, nextl_col, m); // Scatter/gather entire matrix column from/to here
+        
+        kfnz = repfnz_col(krep); 
+        if ( kfnz == emptyIdxLU ) 
+          continue; // skip any zero segment
+        
+        segsize = krep - kfnz + 1;
+        luptr = glu.xlusup(fsupc);
+        
+        Index lda = glu.xlusup(fsupc+1)-glu.xlusup(fsupc);// nsupr
+        
+        // Perform a trianglar solve and block update, 
+        // then scatter the result of sup-col update to dense[]
+        no_zeros = kfnz - fsupc; 
+              if(segsize==1)  LU_kernel_bmod<1>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros);
+        else  if(segsize==2)  LU_kernel_bmod<2>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros);
+        else  if(segsize==3)  LU_kernel_bmod<3>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros);
+        else                  LU_kernel_bmod<Dynamic>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros); 
+      } // End for each column in the panel 
+    }
+    
+  } // End for each updating supernode
+} // end panel bmod
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // SPARSELU_PANEL_BMOD_H
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_panel_dfs.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_panel_dfs.h
new file mode 100644
index 0000000..dc0054e
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_panel_dfs.h
@@ -0,0 +1,258 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* 
+ 
+ * NOTE: This file is the modified version of [s,d,c,z]panel_dfs.c file in SuperLU 
+ 
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ */
+#ifndef SPARSELU_PANEL_DFS_H
+#define SPARSELU_PANEL_DFS_H
+
+namespace Eigen {
+
+namespace internal {
+  
+template<typename IndexVector>
+struct panel_dfs_traits
+{
+  typedef typename IndexVector::Scalar Index;
+  panel_dfs_traits(Index jcol, Index* marker)
+    : m_jcol(jcol), m_marker(marker)
+  {}
+  bool update_segrep(Index krep, Index jj)
+  {
+    if(m_marker[krep]<m_jcol)
+    {
+      m_marker[krep] = jj; 
+      return true;
+    }
+    return false;
+  }
+  void mem_expand(IndexVector& /*glu.lsub*/, Index /*nextl*/, Index /*chmark*/) {}
+  enum { ExpandMem = false };
+  Index m_jcol;
+  Index* m_marker;
+};
+
+
+template <typename Scalar, typename Index>
+template <typename Traits>
+void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
+                   Index& nseg, IndexVector& panel_lsub, IndexVector& segrep,
+                   Ref<IndexVector> repfnz_col, IndexVector& xprune, Ref<IndexVector> marker, IndexVector& parent,
+                   IndexVector& xplore, GlobalLU_t& glu,
+                   Index& nextl_col, Index krow, Traits& traits
+                  )
+{
+  
+  Index kmark = marker(krow);
+      
+  // For each unmarked krow of jj
+  marker(krow) = jj; 
+  Index kperm = perm_r(krow); 
+  if (kperm == emptyIdxLU ) {
+    // krow is in L : place it in structure of L(*, jj)
+    panel_lsub(nextl_col++) = krow;  // krow is indexed into A
+    
+    traits.mem_expand(panel_lsub, nextl_col, kmark);
+  }
+  else 
+  {
+    // krow is in U : if its supernode-representative krep
+    // has been explored, update repfnz(*)
+    // krep = supernode representative of the current row
+    Index krep = glu.xsup(glu.supno(kperm)+1) - 1; 
+    // First nonzero element in the current column:
+    Index myfnz = repfnz_col(krep); 
+    
+    if (myfnz != emptyIdxLU )
+    {
+      // Representative visited before
+      if (myfnz > kperm ) repfnz_col(krep) = kperm; 
+      
+    }
+    else 
+    {
+      // Otherwise, perform dfs starting at krep
+      Index oldrep = emptyIdxLU; 
+      parent(krep) = oldrep; 
+      repfnz_col(krep) = kperm; 
+      Index xdfs =  glu.xlsub(krep); 
+      Index maxdfs = xprune(krep); 
+      
+      Index kpar;
+      do 
+      {
+        // For each unmarked kchild of krep
+        while (xdfs < maxdfs) 
+        {
+          Index kchild = glu.lsub(xdfs); 
+          xdfs++; 
+          Index chmark = marker(kchild); 
+          
+          if (chmark != jj ) 
+          {
+            marker(kchild) = jj; 
+            Index chperm = perm_r(kchild); 
+            
+            if (chperm == emptyIdxLU) 
+            {
+              // case kchild is in L: place it in L(*, j)
+              panel_lsub(nextl_col++) = kchild;
+              traits.mem_expand(panel_lsub, nextl_col, chmark);
+            }
+            else
+            {
+              // case kchild is in U :
+              // chrep = its supernode-rep. If its rep has been explored, 
+              // update its repfnz(*)
+              Index chrep = glu.xsup(glu.supno(chperm)+1) - 1; 
+              myfnz = repfnz_col(chrep); 
+              
+              if (myfnz != emptyIdxLU) 
+              { // Visited before 
+                if (myfnz > chperm) 
+                  repfnz_col(chrep) = chperm; 
+              }
+              else 
+              { // Cont. dfs at snode-rep of kchild
+                xplore(krep) = xdfs; 
+                oldrep = krep; 
+                krep = chrep; // Go deeper down G(L)
+                parent(krep) = oldrep; 
+                repfnz_col(krep) = chperm; 
+                xdfs = glu.xlsub(krep); 
+                maxdfs = xprune(krep); 
+                
+              } // end if myfnz != -1
+            } // end if chperm == -1 
+                
+          } // end if chmark !=jj
+        } // end while xdfs < maxdfs
+        
+        // krow has no more unexplored nbrs :
+        //    Place snode-rep krep in postorder DFS, if this 
+        //    segment is seen for the first time. (Note that 
+        //    "repfnz(krep)" may change later.)
+        //    Baktrack dfs to its parent
+        if(traits.update_segrep(krep,jj))
+        //if (marker1(krep) < jcol )
+        {
+          segrep(nseg) = krep; 
+          ++nseg; 
+          //marker1(krep) = jj; 
+        }
+        
+        kpar = parent(krep); // Pop recursion, mimic recursion 
+        if (kpar == emptyIdxLU) 
+          break; // dfs done 
+        krep = kpar; 
+        xdfs = xplore(krep); 
+        maxdfs = xprune(krep); 
+
+      } while (kpar != emptyIdxLU); // Do until empty stack 
+      
+    } // end if (myfnz = -1)
+
+  } // end if (kperm == -1)   
+}
+
+/**
+ * \brief Performs a symbolic factorization on a panel of columns [jcol, jcol+w)
+ * 
+ * A supernode representative is the last column of a supernode.
+ * The nonzeros in U[*,j] are segments that end at supernodes representatives
+ * 
+ * The routine returns a list of the supernodal representatives 
+ * in topological order of the dfs that generates them. This list is 
+ * a superset of the topological order of each individual column within 
+ * the panel.
+ * The location of the first nonzero in each supernodal segment 
+ * (supernodal entry location) is also returned. Each column has 
+ * a separate list for this purpose. 
+ * 
+ * Two markers arrays are used for dfs :
+ *    marker[i] == jj, if i was visited during dfs of current column jj;
+ *    marker1[i] >= jcol, if i was visited by earlier columns in this panel; 
+ * 
+ * \param[in] m number of rows in the matrix
+ * \param[in] w Panel size
+ * \param[in] jcol Starting  column of the panel
+ * \param[in] A Input matrix in column-major storage
+ * \param[in] perm_r Row permutation
+ * \param[out] nseg Number of U segments
+ * \param[out] dense Accumulate the column vectors of the panel
+ * \param[out] panel_lsub Subscripts of the row in the panel 
+ * \param[out] segrep Segment representative i.e first nonzero row of each segment
+ * \param[out] repfnz First nonzero location in each row
+ * \param[out] xprune The pruned elimination tree
+ * \param[out] marker work vector
+ * \param  parent The elimination tree
+ * \param xplore work vector
+ * \param glu The global data structure
+ * 
+ */
+
+template <typename Scalar, typename Index>
+void SparseLUImpl<Scalar,Index>::panel_dfs(const Index m, const Index w, const Index jcol, MatrixType& A, IndexVector& perm_r, Index& nseg, ScalarVector& dense, IndexVector& panel_lsub, IndexVector& segrep, IndexVector& repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
+{
+  Index nextl_col; // Next available position in panel_lsub[*,jj] 
+  
+  // Initialize pointers 
+  VectorBlock<IndexVector> marker1(marker, m, m); 
+  nseg = 0; 
+  
+  panel_dfs_traits<IndexVector> traits(jcol, marker1.data());
+  
+  // For each column in the panel 
+  for (Index jj = jcol; jj < jcol + w; jj++) 
+  {
+    nextl_col = (jj - jcol) * m; 
+    
+    VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m); // First nonzero location in each row
+    VectorBlock<ScalarVector> dense_col(dense,nextl_col, m); // Accumulate a column vector here
+    
+    
+    // For each nnz in A[*, jj] do depth first search
+    for (typename MatrixType::InnerIterator it(A, jj); it; ++it)
+    {
+      Index krow = it.row(); 
+      dense_col(krow) = it.value();
+      
+      Index kmark = marker(krow); 
+      if (kmark == jj) 
+        continue; // krow visited before, go to the next nonzero
+      
+      dfs_kernel(jj, perm_r, nseg, panel_lsub, segrep, repfnz_col, xprune, marker, parent,
+                   xplore, glu, nextl_col, krow, traits);
+    }// end for nonzeros in column jj
+    
+  } // end for column jj
+}
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // SPARSELU_PANEL_DFS_H
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_pivotL.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_pivotL.h
new file mode 100644
index 0000000..ddcd4ec
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_pivotL.h
@@ -0,0 +1,134 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* 
+ 
+ * NOTE: This file is the modified version of xpivotL.c file in SuperLU 
+ 
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ */
+#ifndef SPARSELU_PIVOTL_H
+#define SPARSELU_PIVOTL_H
+
+namespace Eigen {
+namespace internal {
+  
+/**
+ * \brief Performs the numerical pivotin on the current column of L, and the CDIV operation.
+ * 
+ * Pivot policy :
+ * (1) Compute thresh = u * max_(i>=j) abs(A_ij);
+ * (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN
+ *           pivot row = k;
+ *       ELSE IF abs(A_jj) >= thresh THEN
+ *           pivot row = j;
+ *       ELSE
+ *           pivot row = m;
+ * 
+ *   Note: If you absolutely want to use a given pivot order, then set u=0.0.
+ * 
+ * \param jcol The current column of L
+ * \param diagpivotthresh diagonal pivoting threshold
+ * \param[in,out] perm_r Row permutation (threshold pivoting)
+ * \param[in] iperm_c column permutation - used to finf diagonal of Pc*A*Pc'
+ * \param[out] pivrow  The pivot row
+ * \param glu Global LU data
+ * \return 0 if success, i > 0 if U(i,i) is exactly zero 
+ * 
+ */
+template <typename Scalar, typename Index>
+Index SparseLUImpl<Scalar,Index>::pivotL(const Index jcol, const RealScalar& diagpivotthresh, IndexVector& perm_r, IndexVector& iperm_c, Index& pivrow, GlobalLU_t& glu)
+{
+  
+  Index fsupc = (glu.xsup)((glu.supno)(jcol)); // First column in the supernode containing the column jcol
+  Index nsupc = jcol - fsupc; // Number of columns in the supernode portion, excluding jcol; nsupc >=0
+  Index lptr = glu.xlsub(fsupc); // pointer to the starting location of the row subscripts for this supernode portion
+  Index nsupr = glu.xlsub(fsupc+1) - lptr; // Number of rows in the supernode
+  Index lda = glu.xlusup(fsupc+1) - glu.xlusup(fsupc); // leading dimension
+  Scalar* lu_sup_ptr = &(glu.lusup.data()[glu.xlusup(fsupc)]); // Start of the current supernode
+  Scalar* lu_col_ptr = &(glu.lusup.data()[glu.xlusup(jcol)]); // Start of jcol in the supernode
+  Index* lsub_ptr = &(glu.lsub.data()[lptr]); // Start of row indices of the supernode
+  
+  // Determine the largest abs numerical value for partial pivoting 
+  Index diagind = iperm_c(jcol); // diagonal index 
+  RealScalar pivmax = 0.0; 
+  Index pivptr = nsupc; 
+  Index diag = emptyIdxLU; 
+  RealScalar rtemp;
+  Index isub, icol, itemp, k; 
+  for (isub = nsupc; isub < nsupr; ++isub) {
+    rtemp = std::abs(lu_col_ptr[isub]);
+    if (rtemp > pivmax) {
+      pivmax = rtemp; 
+      pivptr = isub;
+    } 
+    if (lsub_ptr[isub] == diagind) diag = isub;
+  }
+  
+  // Test for singularity
+  if ( pivmax == 0.0 ) {
+    pivrow = lsub_ptr[pivptr];
+    perm_r(pivrow) = jcol;
+    return (jcol+1);
+  }
+  
+  RealScalar thresh = diagpivotthresh * pivmax; 
+  
+  // Choose appropriate pivotal element 
+  
+  {
+    // Test if the diagonal element can be used as a pivot (given the threshold value)
+    if (diag >= 0 ) 
+    {
+      // Diagonal element exists
+      rtemp = std::abs(lu_col_ptr[diag]);
+      if (rtemp != 0.0 && rtemp >= thresh) pivptr = diag;
+    }
+    pivrow = lsub_ptr[pivptr];
+  }
+  
+  // Record pivot row
+  perm_r(pivrow) = jcol; 
+  // Interchange row subscripts
+  if (pivptr != nsupc )
+  {
+    std::swap( lsub_ptr[pivptr], lsub_ptr[nsupc] );
+    // Interchange numerical values as well, for the two rows in the whole snode
+    // such that L is indexed the same way as A
+    for (icol = 0; icol <= nsupc; icol++)
+    {
+      itemp = pivptr + icol * lda; 
+      std::swap(lu_sup_ptr[itemp], lu_sup_ptr[nsupc + icol * lda]);
+    }
+  }
+  // cdiv operations
+  Scalar temp = Scalar(1.0) / lu_col_ptr[nsupc];
+  for (k = nsupc+1; k < nsupr; k++)
+    lu_col_ptr[k] *= temp; 
+  return 0;
+}
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // SPARSELU_PIVOTL_H
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_pruneL.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_pruneL.h
new file mode 100644
index 0000000..66460d1
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_pruneL.h
@@ -0,0 +1,135 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* 
+ 
+ * NOTE: This file is the modified version of [s,d,c,z]pruneL.c file in SuperLU 
+ 
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ */
+#ifndef SPARSELU_PRUNEL_H
+#define SPARSELU_PRUNEL_H
+
+namespace Eigen {
+namespace internal {
+
+/**
+ * \brief Prunes the L-structure.
+ *
+ * It prunes the L-structure  of supernodes whose L-structure contains the current pivot row "pivrow"
+ * 
+ * 
+ * \param jcol The current column of L
+ * \param[in] perm_r Row permutation
+ * \param[out] pivrow  The pivot row
+ * \param nseg Number of segments
+ * \param segrep 
+ * \param repfnz
+ * \param[out] xprune 
+ * \param glu Global LU data
+ * 
+ */
+template <typename Scalar, typename Index>
+void SparseLUImpl<Scalar,Index>::pruneL(const Index jcol, const IndexVector& perm_r, const Index pivrow, const Index nseg, const IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, GlobalLU_t& glu)
+{
+  // For each supernode-rep irep in U(*,j]
+  Index jsupno = glu.supno(jcol); 
+  Index i,irep,irep1; 
+  bool movnum, do_prune = false; 
+  Index kmin = 0, kmax = 0, minloc, maxloc,krow; 
+  for (i = 0; i < nseg; i++)
+  {
+    irep = segrep(i); 
+    irep1 = irep + 1; 
+    do_prune = false; 
+    
+    // Don't prune with a zero U-segment 
+    if (repfnz(irep) == emptyIdxLU) continue; 
+    
+    // If a snode overlaps with the next panel, then the U-segment
+    // is fragmented into two parts -- irep and irep1. We should let 
+    // pruning occur at the rep-column in irep1s snode. 
+    if (glu.supno(irep) == glu.supno(irep1) ) continue; // don't prune 
+    
+    // If it has not been pruned & it has a nonz in row L(pivrow,i)
+    if (glu.supno(irep) != jsupno )
+    {
+      if ( xprune (irep) >= glu.xlsub(irep1) )
+      {
+        kmin = glu.xlsub(irep);
+        kmax = glu.xlsub(irep1) - 1; 
+        for (krow = kmin; krow <= kmax; krow++)
+        {
+          if (glu.lsub(krow) == pivrow) 
+          {
+            do_prune = true; 
+            break; 
+          }
+        }
+      }
+      
+      if (do_prune) 
+      {
+        // do a quicksort-type partition
+        // movnum=true means that the num values have to be exchanged
+        movnum = false; 
+        if (irep == glu.xsup(glu.supno(irep)) ) // Snode of size 1 
+          movnum = true; 
+        
+        while (kmin <= kmax)
+        {
+          if (perm_r(glu.lsub(kmax)) == emptyIdxLU)
+            kmax--; 
+          else if ( perm_r(glu.lsub(kmin)) != emptyIdxLU)
+            kmin++;
+          else 
+          {
+            // kmin below pivrow (not yet pivoted), and kmax
+            // above pivrow: interchange the two suscripts
+            std::swap(glu.lsub(kmin), glu.lsub(kmax)); 
+            
+            // If the supernode has only one column, then we 
+            // only keep one set of subscripts. For any subscript
+            // intercnahge performed, similar interchange must be 
+            // done on the numerical values. 
+            if (movnum) 
+            {
+              minloc = glu.xlusup(irep) + ( kmin - glu.xlsub(irep) ); 
+              maxloc = glu.xlusup(irep) + ( kmax - glu.xlsub(irep) ); 
+              std::swap(glu.lusup(minloc), glu.lusup(maxloc)); 
+            }
+            kmin++;
+            kmax--;
+          }
+        } // end while 
+        
+        xprune(irep) = kmin;  //Pruning 
+      } // end if do_prune 
+    } // end pruning 
+  } // End for each U-segment
+}
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // SPARSELU_PRUNEL_H
diff --git a/include/eigen3/Eigen/src/SparseLU/SparseLU_relax_snode.h b/include/eigen3/Eigen/src/SparseLU/SparseLU_relax_snode.h
new file mode 100644
index 0000000..58ec32e
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseLU/SparseLU_relax_snode.h
@@ -0,0 +1,83 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* This file is a modified version of heap_relax_snode.c file in SuperLU
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ */
+
+#ifndef SPARSELU_RELAX_SNODE_H
+#define SPARSELU_RELAX_SNODE_H
+
+namespace Eigen {
+
+namespace internal {
+ 
+/** 
+ * \brief Identify the initial relaxed supernodes
+ * 
+ * This routine is applied to a column elimination tree. 
+ * It assumes that the matrix has been reordered according to the postorder of the etree
+ * \param n  the number of columns
+ * \param et elimination tree 
+ * \param relax_columns Maximum number of columns allowed in a relaxed snode 
+ * \param descendants Number of descendants of each node in the etree
+ * \param relax_end last column in a supernode
+ */
+template <typename Scalar, typename Index>
+void SparseLUImpl<Scalar,Index>::relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
+{
+  
+  // compute the number of descendants of each node in the etree
+  Index j, parent; 
+  relax_end.setConstant(emptyIdxLU);
+  descendants.setZero();
+  for (j = 0; j < n; j++) 
+  {
+    parent = et(j);
+    if (parent != n) // not the dummy root
+      descendants(parent) += descendants(j) + 1;
+  }
+  // Identify the relaxed supernodes by postorder traversal of the etree
+  Index snode_start; // beginning of a snode 
+  for (j = 0; j < n; )
+  {
+    parent = et(j);
+    snode_start = j; 
+    while ( parent != n && descendants(parent) < relax_columns ) 
+    {
+      j = parent; 
+      parent = et(j);
+    }
+    // Found a supernode in postordered etree, j is the last column 
+    relax_end(snode_start) = j; // Record last column
+    j++;
+    // Search for a new leaf
+    while (descendants(j) != 0 && j < n) j++;
+  } // End postorder traversal of the etree
+  
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+#endif
diff --git a/include/eigen3/Eigen/src/SparseQR/SparseQR.h b/include/eigen3/Eigen/src/SparseQR/SparseQR.h
new file mode 100644
index 0000000..a00bd5d
--- /dev/null
+++ b/include/eigen3/Eigen/src/SparseQR/SparseQR.h
@@ -0,0 +1,714 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012-2013 Desire Nuentsa <desire.nuentsa_wakam at inria.fr>
+// Copyright (C) 2012-2014 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_QR_H
+#define EIGEN_SPARSE_QR_H
+
+namespace Eigen {
+
+template<typename MatrixType, typename OrderingType> class SparseQR;
+template<typename SparseQRType> struct SparseQRMatrixQReturnType;
+template<typename SparseQRType> struct SparseQRMatrixQTransposeReturnType;
+template<typename SparseQRType, typename Derived> struct SparseQR_QProduct;
+namespace internal {
+  template <typename SparseQRType> struct traits<SparseQRMatrixQReturnType<SparseQRType> >
+  {
+    typedef typename SparseQRType::MatrixType ReturnType;
+    typedef typename ReturnType::Index Index;
+    typedef typename ReturnType::StorageKind StorageKind;
+  };
+  template <typename SparseQRType> struct traits<SparseQRMatrixQTransposeReturnType<SparseQRType> >
+  {
+    typedef typename SparseQRType::MatrixType ReturnType;
+  };
+  template <typename SparseQRType, typename Derived> struct traits<SparseQR_QProduct<SparseQRType, Derived> >
+  {
+    typedef typename Derived::PlainObject ReturnType;
+  };
+} // End namespace internal
+
+/**
+  * \ingroup SparseQR_Module
+  * \class SparseQR
+  * \brief Sparse left-looking rank-revealing QR factorization
+  * 
+  * This class implements a left-looking rank-revealing QR decomposition 
+  * of sparse matrices. When a column has a norm less than a given tolerance
+  * it is implicitly permuted to the end. The QR factorization thus obtained is 
+  * given by A*P = Q*R where R is upper triangular or trapezoidal. 
+  * 
+  * P is the column permutation which is the product of the fill-reducing and the
+  * rank-revealing permutations. Use colsPermutation() to get it.
+  * 
+  * Q is the orthogonal matrix represented as products of Householder reflectors. 
+  * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose.
+  * You can then apply it to a vector.
+  * 
+  * R is the sparse triangular or trapezoidal matrix. The later occurs when A is rank-deficient.
+  * matrixR().topLeftCorner(rank(), rank()) always returns a triangular factor of full rank.
+  * 
+  * \tparam _MatrixType The type of the sparse matrix A, must be a column-major SparseMatrix<>
+  * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module 
+  *  OrderingMethods \endlink module for the list of built-in and external ordering methods.
+  * 
+  * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
+  * 
+  */
+template<typename _MatrixType, typename _OrderingType>
+class SparseQR
+{
+  public:
+    typedef _MatrixType MatrixType;
+    typedef _OrderingType OrderingType;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+    typedef SparseMatrix<Scalar,ColMajor,Index> QRMatrixType;
+    typedef Matrix<Index, Dynamic, 1> IndexVector;
+    typedef Matrix<Scalar, Dynamic, 1> ScalarVector;
+    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+  public:
+    SparseQR () : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false)
+    { }
+    
+    /** Construct a QR factorization of the matrix \a mat.
+      * 
+      * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+      * 
+      * \sa compute()
+      */
+    SparseQR(const MatrixType& mat) : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false)
+    {
+      compute(mat);
+    }
+    
+    /** Computes the QR factorization of the sparse matrix \a mat.
+      * 
+      * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+      * 
+      * \sa analyzePattern(), factorize()
+      */
+    void compute(const MatrixType& mat)
+    {
+      analyzePattern(mat);
+      factorize(mat);
+    }
+    void analyzePattern(const MatrixType& mat);
+    void factorize(const MatrixType& mat);
+    
+    /** \returns the number of rows of the represented matrix. 
+      */
+    inline Index rows() const { return m_pmat.rows(); }
+    
+    /** \returns the number of columns of the represented matrix. 
+      */
+    inline Index cols() const { return m_pmat.cols();}
+    
+    /** \returns a const reference to the \b sparse upper triangular matrix R of the QR factorization.
+      */
+    const QRMatrixType& matrixR() const { return m_R; }
+    
+    /** \returns the number of non linearly dependent columns as determined by the pivoting threshold.
+      *
+      * \sa setPivotThreshold()
+      */
+    Index rank() const 
+    {
+      eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
+      return m_nonzeropivots; 
+    }
+    
+    /** \returns an expression of the matrix Q as products of sparse Householder reflectors.
+    * The common usage of this function is to apply it to a dense matrix or vector
+    * \code
+    * VectorXd B1, B2;
+    * // Initialize B1
+    * B2 = matrixQ() * B1;
+    * \endcode
+    *
+    * To get a plain SparseMatrix representation of Q:
+    * \code
+    * SparseMatrix<double> Q;
+    * Q = SparseQR<SparseMatrix<double> >(A).matrixQ();
+    * \endcode
+    * Internally, this call simply performs a sparse product between the matrix Q
+    * and a sparse identity matrix. However, due to the fact that the sparse
+    * reflectors are stored unsorted, two transpositions are needed to sort
+    * them before performing the product.
+    */
+    SparseQRMatrixQReturnType<SparseQR> matrixQ() const 
+    { return SparseQRMatrixQReturnType<SparseQR>(*this); }
+    
+    /** \returns a const reference to the column permutation P that was applied to A such that A*P = Q*R
+      * It is the combination of the fill-in reducing permutation and numerical column pivoting.
+      */
+    const PermutationType& colsPermutation() const
+    { 
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return m_outputPerm_c;
+    }
+    
+    /** \returns A string describing the type of error.
+      * This method is provided to ease debugging, not to handle errors.
+      */
+    std::string lastErrorMessage() const { return m_lastError; }
+    
+    /** \internal */
+    template<typename Rhs, typename Dest>
+    bool _solve(const MatrixBase<Rhs> &B, MatrixBase<Dest> &dest) const
+    {
+      eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
+      eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
+
+      Index rank = this->rank();
+      
+      // Compute Q^T * b;
+      typename Dest::PlainObject y, b;
+      y = this->matrixQ().transpose() * B; 
+      b = y;
+      
+      // Solve with the triangular matrix R
+      y.resize((std::max)(cols(),Index(y.rows())),y.cols());
+      y.topRows(rank) = this->matrixR().topLeftCorner(rank, rank).template triangularView<Upper>().solve(b.topRows(rank));
+      y.bottomRows(y.rows()-rank).setZero();
+
+      // Apply the column permutation
+      if (m_perm_c.size())  dest = colsPermutation() * y.topRows(cols());
+      else                  dest = y.topRows(cols());
+      
+      m_info = Success;
+      return true;
+    }
+    
+
+    /** Sets the threshold that is used to determine linearly dependent columns during the factorization.
+      *
+      * In practice, if during the factorization the norm of the column that has to be eliminated is below
+      * this threshold, then the entire column is treated as zero, and it is moved at the end.
+      */
+    void setPivotThreshold(const RealScalar& threshold)
+    {
+      m_useDefaultThreshold = false;
+      m_threshold = threshold;
+    }
+    
+    /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<SparseQR, Rhs> solve(const MatrixBase<Rhs>& B) const 
+    {
+      eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
+      eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
+      return internal::solve_retval<SparseQR, Rhs>(*this, B.derived());
+    }
+    template<typename Rhs>
+    inline const internal::sparse_solve_retval<SparseQR, Rhs> solve(const SparseMatrixBase<Rhs>& B) const
+    {
+          eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
+          eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
+          return internal::sparse_solve_retval<SparseQR, Rhs>(*this, B.derived());
+    }
+    
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was successful,
+      *          \c NumericalIssue if the QR factorization reports a numerical problem
+      *          \c InvalidInput if the input matrix is invalid
+      *
+      * \sa iparm()          
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return m_info;
+    }
+
+  protected:
+    inline void sort_matrix_Q()
+    {
+      if(this->m_isQSorted) return;
+      // The matrix Q is sorted during the transposition
+      SparseMatrix<Scalar, RowMajor, Index> mQrm(this->m_Q);
+      this->m_Q = mQrm;
+      this->m_isQSorted = true;
+    }
+
+    
+  protected:
+    bool m_isInitialized;
+    bool m_analysisIsok;
+    bool m_factorizationIsok;
+    mutable ComputationInfo m_info;
+    std::string m_lastError;
+    QRMatrixType m_pmat;            // Temporary matrix
+    QRMatrixType m_R;               // The triangular factor matrix
+    QRMatrixType m_Q;               // The orthogonal reflectors
+    ScalarVector m_hcoeffs;         // The Householder coefficients
+    PermutationType m_perm_c;       // Fill-reducing  Column  permutation
+    PermutationType m_pivotperm;    // The permutation for rank revealing
+    PermutationType m_outputPerm_c; // The final column permutation
+    RealScalar m_threshold;         // Threshold to determine null Householder reflections
+    bool m_useDefaultThreshold;     // Use default threshold
+    Index m_nonzeropivots;          // Number of non zero pivots found 
+    IndexVector m_etree;            // Column elimination tree
+    IndexVector m_firstRowElt;      // First element in each row
+    bool m_isQSorted;               // whether Q is sorted or not
+    bool m_isEtreeOk;               // whether the elimination tree match the initial input matrix
+    
+    template <typename, typename > friend struct SparseQR_QProduct;
+    template <typename > friend struct SparseQRMatrixQReturnType;
+    
+};
+
+/** \brief Preprocessing step of a QR factorization 
+  * 
+  * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+  * 
+  * In this step, the fill-reducing permutation is computed and applied to the columns of A
+  * and the column elimination tree is computed as well. Only the sparsity pattern of \a mat is exploited.
+  * 
+  * \note In this step it is assumed that there is no empty row in the matrix \a mat.
+  */
+template <typename MatrixType, typename OrderingType>
+void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
+{
+  eigen_assert(mat.isCompressed() && "SparseQR requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to SparseQR");
+  // Copy to a column major matrix if the input is rowmajor
+  typename internal::conditional<MatrixType::IsRowMajor,QRMatrixType,const MatrixType&>::type matCpy(mat);
+  // Compute the column fill reducing ordering
+  OrderingType ord; 
+  ord(matCpy, m_perm_c); 
+  Index n = mat.cols();
+  Index m = mat.rows();
+  Index diagSize = (std::min)(m,n);
+  
+  if (!m_perm_c.size())
+  {
+    m_perm_c.resize(n);
+    m_perm_c.indices().setLinSpaced(n, 0,n-1);
+  }
+  
+  // Compute the column elimination tree of the permuted matrix
+  m_outputPerm_c = m_perm_c.inverse();
+  internal::coletree(matCpy, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
+  m_isEtreeOk = true;
+  
+  m_R.resize(m, n);
+  m_Q.resize(m, diagSize);
+  
+  // Allocate space for nonzero elements : rough estimation
+  m_R.reserve(2*mat.nonZeros()); //FIXME Get a more accurate estimation through symbolic factorization with the etree
+  m_Q.reserve(2*mat.nonZeros());
+  m_hcoeffs.resize(diagSize);
+  m_analysisIsok = true;
+}
+
+/** \brief Performs the numerical QR factorization of the input matrix
+  * 
+  * The function SparseQR::analyzePattern(const MatrixType&) must have been called beforehand with
+  * a matrix having the same sparsity pattern than \a mat.
+  * 
+  * \param mat The sparse column-major matrix
+  */
+template <typename MatrixType, typename OrderingType>
+void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
+{
+  using std::abs;
+  using std::max;
+  
+  eigen_assert(m_analysisIsok && "analyzePattern() should be called before this step");
+  Index m = mat.rows();
+  Index n = mat.cols();
+  Index diagSize = (std::min)(m,n);
+  IndexVector mark((std::max)(m,n)); mark.setConstant(-1);  // Record the visited nodes
+  IndexVector Ridx(n), Qidx(m);                             // Store temporarily the row indexes for the current column of R and Q
+  Index nzcolR, nzcolQ;                                     // Number of nonzero for the current column of R and Q
+  ScalarVector tval(m);                                     // The dense vector used to compute the current column
+  RealScalar pivotThreshold = m_threshold;
+  
+  m_R.setZero();
+  m_Q.setZero();
+  m_pmat = mat;
+  if(!m_isEtreeOk)
+  {
+    m_outputPerm_c = m_perm_c.inverse();
+    internal::coletree(m_pmat, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
+    m_isEtreeOk = true;
+  }
+
+  m_pmat.uncompress(); // To have the innerNonZeroPtr allocated
+  
+  // Apply the fill-in reducing permutation lazily:
+  {
+    // If the input is row major, copy the original column indices,
+    // otherwise directly use the input matrix
+    // 
+    IndexVector originalOuterIndicesCpy;
+    const Index *originalOuterIndices = mat.outerIndexPtr();
+    if(MatrixType::IsRowMajor)
+    {
+      originalOuterIndicesCpy = IndexVector::Map(m_pmat.outerIndexPtr(),n+1);
+      originalOuterIndices = originalOuterIndicesCpy.data();
+    }
+    
+    for (int i = 0; i < n; i++)
+    {
+      Index p = m_perm_c.size() ? m_perm_c.indices()(i) : i;
+      m_pmat.outerIndexPtr()[p] = originalOuterIndices[i]; 
+      m_pmat.innerNonZeroPtr()[p] = originalOuterIndices[i+1] - originalOuterIndices[i]; 
+    }
+  }
+  
+  /* Compute the default threshold as in MatLab, see:
+   * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
+   * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3 
+   */
+  if(m_useDefaultThreshold) 
+  {
+    RealScalar max2Norm = 0.0;
+    for (int j = 0; j < n; j++) max2Norm = (max)(max2Norm, m_pmat.col(j).norm());
+    if(max2Norm==RealScalar(0))
+      max2Norm = RealScalar(1);
+    pivotThreshold = 20 * (m + n) * max2Norm * NumTraits<RealScalar>::epsilon();
+  }
+  
+  // Initialize the numerical permutation
+  m_pivotperm.setIdentity(n);
+  
+  Index nonzeroCol = 0; // Record the number of valid pivots
+  m_Q.startVec(0);
+
+  // Left looking rank-revealing QR factorization: compute a column of R and Q at a time
+  for (Index col = 0; col < n; ++col)
+  {
+    mark.setConstant(-1);
+    m_R.startVec(col);
+    mark(nonzeroCol) = col;
+    Qidx(0) = nonzeroCol;
+    nzcolR = 0; nzcolQ = 1;
+    bool found_diag = nonzeroCol>=m;
+    tval.setZero(); 
+    
+    // Symbolic factorization: find the nonzero locations of the column k of the factors R and Q, i.e.,
+    // all the nodes (with indexes lower than rank) reachable through the column elimination tree (etree) rooted at node k.
+    // Note: if the diagonal entry does not exist, then its contribution must be explicitly added,
+    // thus the trick with found_diag that permits to do one more iteration on the diagonal element if this one has not been found.
+    for (typename QRMatrixType::InnerIterator itp(m_pmat, col); itp || !found_diag; ++itp)
+    {
+      Index curIdx = nonzeroCol;
+      if(itp) curIdx = itp.row();
+      if(curIdx == nonzeroCol) found_diag = true;
+      
+      // Get the nonzeros indexes of the current column of R
+      Index st = m_firstRowElt(curIdx); // The traversal of the etree starts here 
+      if (st < 0 )
+      {
+        m_lastError = "Empty row found during numerical factorization";
+        m_info = InvalidInput;
+        return;
+      }
+
+      // Traverse the etree 
+      Index bi = nzcolR;
+      for (; mark(st) != col; st = m_etree(st))
+      {
+        Ridx(nzcolR) = st;  // Add this row to the list,
+        mark(st) = col;     // and mark this row as visited
+        nzcolR++;
+      }
+
+      // Reverse the list to get the topological ordering
+      Index nt = nzcolR-bi;
+      for(Index i = 0; i < nt/2; i++) std::swap(Ridx(bi+i), Ridx(nzcolR-i-1));
+       
+      // Copy the current (curIdx,pcol) value of the input matrix
+      if(itp) tval(curIdx) = itp.value();
+      else    tval(curIdx) = Scalar(0);
+      
+      // Compute the pattern of Q(:,k)
+      if(curIdx > nonzeroCol && mark(curIdx) != col ) 
+      {
+        Qidx(nzcolQ) = curIdx;  // Add this row to the pattern of Q,
+        mark(curIdx) = col;     // and mark it as visited
+        nzcolQ++;
+      }
+    }
+
+    // Browse all the indexes of R(:,col) in reverse order
+    for (Index i = nzcolR-1; i >= 0; i--)
+    {
+      Index curIdx = Ridx(i);
+      
+      // Apply the curIdx-th householder vector to the current column (temporarily stored into tval)
+      Scalar tdot(0);
+      
+      // First compute q' * tval
+      tdot = m_Q.col(curIdx).dot(tval);
+
+      tdot *= m_hcoeffs(curIdx);
+      
+      // Then update tval = tval - q * tau
+      // FIXME: tval -= tdot * m_Q.col(curIdx) should amount to the same (need to check/add support for efficient "dense ?= sparse")
+      for (typename QRMatrixType::InnerIterator itq(m_Q, curIdx); itq; ++itq)
+        tval(itq.row()) -= itq.value() * tdot;
+
+      // Detect fill-in for the current column of Q
+      if(m_etree(Ridx(i)) == nonzeroCol)
+      {
+        for (typename QRMatrixType::InnerIterator itq(m_Q, curIdx); itq; ++itq)
+        {
+          Index iQ = itq.row();
+          if (mark(iQ) != col)
+          {
+            Qidx(nzcolQ++) = iQ;  // Add this row to the pattern of Q,
+            mark(iQ) = col;       // and mark it as visited
+          }
+        }
+      }
+    } // End update current column
+    
+    Scalar tau = 0;
+    RealScalar beta = 0;
+    
+    if(nonzeroCol < diagSize)
+    {
+      // Compute the Householder reflection that eliminate the current column
+      // FIXME this step should call the Householder module.
+      Scalar c0 = nzcolQ ? tval(Qidx(0)) : Scalar(0);
+      
+      // First, the squared norm of Q((col+1):m, col)
+      RealScalar sqrNorm = 0.;
+      for (Index itq = 1; itq < nzcolQ; ++itq) sqrNorm += numext::abs2(tval(Qidx(itq)));
+      if(sqrNorm == RealScalar(0) && numext::imag(c0) == RealScalar(0))
+      {
+        beta = numext::real(c0);
+        tval(Qidx(0)) = 1;
+      }
+      else
+      {
+        using std::sqrt;
+        beta = sqrt(numext::abs2(c0) + sqrNorm);
+        if(numext::real(c0) >= RealScalar(0))
+          beta = -beta;
+        tval(Qidx(0)) = 1;
+        for (Index itq = 1; itq < nzcolQ; ++itq)
+          tval(Qidx(itq)) /= (c0 - beta);
+        tau = numext::conj((beta-c0) / beta);
+          
+      }
+    }
+
+    // Insert values in R
+    for (Index  i = nzcolR-1; i >= 0; i--)
+    {
+      Index curIdx = Ridx(i);
+      if(curIdx < nonzeroCol) 
+      {
+        m_R.insertBackByOuterInnerUnordered(col, curIdx) = tval(curIdx);
+        tval(curIdx) = Scalar(0.);
+      }
+    }
+
+    if(nonzeroCol < diagSize && abs(beta) >= pivotThreshold)
+    {
+      m_R.insertBackByOuterInner(col, nonzeroCol) = beta;
+      // The householder coefficient
+      m_hcoeffs(nonzeroCol) = tau;
+      // Record the householder reflections
+      for (Index itq = 0; itq < nzcolQ; ++itq)
+      {
+        Index iQ = Qidx(itq);
+        m_Q.insertBackByOuterInnerUnordered(nonzeroCol,iQ) = tval(iQ);
+        tval(iQ) = Scalar(0.);
+      }
+      nonzeroCol++;
+      if(nonzeroCol<diagSize)
+        m_Q.startVec(nonzeroCol);
+    }
+    else
+    {
+      // Zero pivot found: move implicitly this column to the end
+      for (Index j = nonzeroCol; j < n-1; j++) 
+        std::swap(m_pivotperm.indices()(j), m_pivotperm.indices()[j+1]);
+      
+      // Recompute the column elimination tree
+      internal::coletree(m_pmat, m_etree, m_firstRowElt, m_pivotperm.indices().data());
+      m_isEtreeOk = false;
+    }
+  }
+  
+  m_hcoeffs.tail(diagSize-nonzeroCol).setZero();
+  
+  // Finalize the column pointers of the sparse matrices R and Q
+  m_Q.finalize();
+  m_Q.makeCompressed();
+  m_R.finalize();
+  m_R.makeCompressed();
+  m_isQSorted = false;
+
+  m_nonzeropivots = nonzeroCol;
+  
+  if(nonzeroCol<n)
+  {
+    // Permute the triangular factor to put the 'dead' columns to the end
+    QRMatrixType tempR(m_R);
+    m_R = tempR * m_pivotperm;
+    
+    // Update the column permutation
+    m_outputPerm_c = m_outputPerm_c * m_pivotperm;
+  }
+  
+  m_isInitialized = true; 
+  m_factorizationIsok = true;
+  m_info = Success;
+}
+
+namespace internal {
+  
+template<typename _MatrixType, typename OrderingType, typename Rhs>
+struct solve_retval<SparseQR<_MatrixType,OrderingType>, Rhs>
+  : solve_retval_base<SparseQR<_MatrixType,OrderingType>, Rhs>
+{
+  typedef SparseQR<_MatrixType,OrderingType> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+template<typename _MatrixType, typename OrderingType, typename Rhs>
+struct sparse_solve_retval<SparseQR<_MatrixType, OrderingType>, Rhs>
+ : sparse_solve_retval_base<SparseQR<_MatrixType, OrderingType>, Rhs>
+{
+  typedef SparseQR<_MatrixType, OrderingType> Dec;
+  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec, Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    this->defaultEvalTo(dst);
+  }
+};
+} // end namespace internal
+
+template <typename SparseQRType, typename Derived>
+struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived> >
+{
+  typedef typename SparseQRType::QRMatrixType MatrixType;
+  typedef typename SparseQRType::Scalar Scalar;
+  typedef typename SparseQRType::Index Index;
+  // Get the references 
+  SparseQR_QProduct(const SparseQRType& qr, const Derived& other, bool transpose) : 
+  m_qr(qr),m_other(other),m_transpose(transpose) {}
+  inline Index rows() const { return m_transpose ? m_qr.rows() : m_qr.cols(); }
+  inline Index cols() const { return m_other.cols(); }
+  
+  // Assign to a vector
+  template<typename DesType>
+  void evalTo(DesType& res) const
+  {
+    Index m = m_qr.rows();
+    Index n = m_qr.cols();
+    Index diagSize = (std::min)(m,n);
+    res = m_other;
+    if (m_transpose)
+    {
+      eigen_assert(m_qr.m_Q.rows() == m_other.rows() && "Non conforming object sizes");
+      //Compute res = Q' * other column by column
+      for(Index j = 0; j < res.cols(); j++){
+        for (Index k = 0; k < diagSize; k++)
+        {
+          Scalar tau = Scalar(0);
+          tau = m_qr.m_Q.col(k).dot(res.col(j));
+          if(tau==Scalar(0)) continue;
+          tau = tau * m_qr.m_hcoeffs(k);
+          res.col(j) -= tau * m_qr.m_Q.col(k);
+        }
+      }
+    }
+    else
+    {
+      eigen_assert(m_qr.m_Q.rows() == m_other.rows() && "Non conforming object sizes");
+      // Compute res = Q * other column by column
+      for(Index j = 0; j < res.cols(); j++)
+      {
+        for (Index k = diagSize-1; k >=0; k--)
+        {
+          Scalar tau = Scalar(0);
+          tau = m_qr.m_Q.col(k).dot(res.col(j));
+          if(tau==Scalar(0)) continue;
+          tau = tau * m_qr.m_hcoeffs(k);
+          res.col(j) -= tau * m_qr.m_Q.col(k);
+        }
+      }
+    }
+  }
+  
+  const SparseQRType& m_qr;
+  const Derived& m_other;
+  bool m_transpose;
+};
+
+template<typename SparseQRType>
+struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<SparseQRType> >
+{  
+  typedef typename SparseQRType::Index Index;
+  typedef typename SparseQRType::Scalar Scalar;
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  SparseQRMatrixQReturnType(const SparseQRType& qr) : m_qr(qr) {}
+  template<typename Derived>
+  SparseQR_QProduct<SparseQRType, Derived> operator*(const MatrixBase<Derived>& other)
+  {
+    return SparseQR_QProduct<SparseQRType,Derived>(m_qr,other.derived(),false);
+  }
+  SparseQRMatrixQTransposeReturnType<SparseQRType> adjoint() const
+  {
+    return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
+  }
+  inline Index rows() const { return m_qr.rows(); }
+  inline Index cols() const { return (std::min)(m_qr.rows(),m_qr.cols()); }
+  // To use for operations with the transpose of Q
+  SparseQRMatrixQTransposeReturnType<SparseQRType> transpose() const
+  {
+    return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
+  }
+  template<typename Dest> void evalTo(MatrixBase<Dest>& dest) const
+  {
+    dest.derived() = m_qr.matrixQ() * Dest::Identity(m_qr.rows(), m_qr.rows());
+  }
+  template<typename Dest> void evalTo(SparseMatrixBase<Dest>& dest) const
+  {
+    Dest idMat(m_qr.rows(), m_qr.rows());
+    idMat.setIdentity();
+    // Sort the sparse householder reflectors if needed
+    const_cast<SparseQRType *>(&m_qr)->sort_matrix_Q();
+    dest.derived() = SparseQR_QProduct<SparseQRType, Dest>(m_qr, idMat, false);
+  }
+
+  const SparseQRType& m_qr;
+};
+
+template<typename SparseQRType>
+struct SparseQRMatrixQTransposeReturnType
+{
+  SparseQRMatrixQTransposeReturnType(const SparseQRType& qr) : m_qr(qr) {}
+  template<typename Derived>
+  SparseQR_QProduct<SparseQRType,Derived> operator*(const MatrixBase<Derived>& other)
+  {
+    return SparseQR_QProduct<SparseQRType,Derived>(m_qr,other.derived(), true);
+  }
+  const SparseQRType& m_qr;
+};
+
+} // end namespace Eigen
+
+#endif
diff --git a/include/eigen3/Eigen/src/StlSupport/StdDeque.h b/include/eigen3/Eigen/src/StlSupport/StdDeque.h
new file mode 100644
index 0000000..aaf6633
--- /dev/null
+++ b/include/eigen3/Eigen/src/StlSupport/StdDeque.h
@@ -0,0 +1,134 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2009 Hauke Heibel <hauke.heibel at googlemail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STDDEQUE_H
+#define EIGEN_STDDEQUE_H
+
+#include "details.h"
+
+// Define the explicit instantiation (e.g. necessary for the Intel compiler)
+#if defined(__INTEL_COMPILER) || defined(__GNUC__)
+  #define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...) template class std::deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >;
+#else
+  #define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...)
+#endif
+
+/**
+ * This section contains a convenience MACRO which allows an easy specialization of
+ * std::deque such that for data types with alignment issues the correct allocator
+ * is used automatically.
+ */
+#define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...) \
+EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(__VA_ARGS__) \
+namespace std \
+{ \
+  template<typename _Ay> \
+  class deque<__VA_ARGS__, _Ay>  \
+    : public deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \
+  { \
+    typedef deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > deque_base; \
+  public: \
+    typedef __VA_ARGS__ value_type; \
+    typedef typename deque_base::allocator_type allocator_type; \
+    typedef typename deque_base::size_type size_type;  \
+    typedef typename deque_base::iterator iterator;  \
+    explicit deque(const allocator_type& a = allocator_type()) : deque_base(a) {}  \
+    template<typename InputIterator> \
+    deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \
+    deque(const deque& c) : deque_base(c) {}  \
+    explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \
+    deque(iterator start, iterator end) : deque_base(start, end) {}  \
+    deque& operator=(const deque& x) {  \
+      deque_base::operator=(x);  \
+      return *this;  \
+    } \
+  }; \
+}
+
+// check whether we really need the std::deque specialization
+#if !(defined(_GLIBCXX_DEQUE) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::deque::resize(size_type,const T&). */
+
+namespace std {
+
+#define EIGEN_STD_DEQUE_SPECIALIZATION_BODY \
+  public:  \
+    typedef T value_type; \
+    typedef typename deque_base::allocator_type allocator_type; \
+    typedef typename deque_base::size_type size_type;  \
+    typedef typename deque_base::iterator iterator;  \
+    typedef typename deque_base::const_iterator const_iterator;  \
+    explicit deque(const allocator_type& a = allocator_type()) : deque_base(a) {}  \
+    template<typename InputIterator> \
+    deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) \
+    : deque_base(first, last, a) {} \
+    deque(const deque& c) : deque_base(c) {}  \
+    explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \
+    deque(iterator start, iterator end) : deque_base(start, end) {}  \
+    deque& operator=(const deque& x) {  \
+      deque_base::operator=(x);  \
+      return *this;  \
+    }
+
+  template<typename T>
+  class deque<T,EIGEN_ALIGNED_ALLOCATOR<T> >
+    : public deque<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T),
+                   Eigen::aligned_allocator_indirection<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T)> >
+{
+  typedef deque<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T),
+                Eigen::aligned_allocator_indirection<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T)> > deque_base;
+  EIGEN_STD_DEQUE_SPECIALIZATION_BODY
+
+  void resize(size_type new_size)
+  { resize(new_size, T()); }
+
+#if defined(_DEQUE_)
+  // workaround MSVC std::deque implementation
+  void resize(size_type new_size, const value_type& x)
+  {
+    if (deque_base::size() < new_size)
+      deque_base::_Insert_n(deque_base::end(), new_size - deque_base::size(), x);
+    else if (new_size < deque_base::size())
+      deque_base::erase(deque_base::begin() + new_size, deque_base::end());
+  }
+  void push_back(const value_type& x)
+  { deque_base::push_back(x); } 
+  void push_front(const value_type& x)
+  { deque_base::push_front(x); }
+  using deque_base::insert;  
+  iterator insert(const_iterator position, const value_type& x)
+  { return deque_base::insert(position,x); }
+  void insert(const_iterator position, size_type new_size, const value_type& x)
+  { deque_base::insert(position, new_size, x); }
+#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2)
+  // workaround GCC std::deque implementation
+  void resize(size_type new_size, const value_type& x)
+  {
+    if (new_size < deque_base::size())
+      deque_base::_M_erase_at_end(this->_M_impl._M_start + new_size);
+    else
+      deque_base::insert(deque_base::end(), new_size - deque_base::size(), x);
+  }
+#else
+  // either GCC 4.1 or non-GCC
+  // default implementation which should always work.
+  void resize(size_type new_size, const value_type& x)
+  {
+    if (new_size < deque_base::size())
+      deque_base::erase(deque_base::begin() + new_size, deque_base::end());
+    else if (new_size > deque_base::size())
+      deque_base::insert(deque_base::end(), new_size - deque_base::size(), x);
+  }
+#endif
+  };
+}
+
+#endif // check whether specialization is actually required
+
+#endif // EIGEN_STDDEQUE_H
diff --git a/include/eigen3/Eigen/src/StlSupport/StdList.h b/include/eigen3/Eigen/src/StlSupport/StdList.h
new file mode 100644
index 0000000..3c74243
--- /dev/null
+++ b/include/eigen3/Eigen/src/StlSupport/StdList.h
@@ -0,0 +1,114 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Hauke Heibel <hauke.heibel at googlemail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STDLIST_H
+#define EIGEN_STDLIST_H
+
+#include "details.h"
+
+// Define the explicit instantiation (e.g. necessary for the Intel compiler)
+#if defined(__INTEL_COMPILER) || defined(__GNUC__)
+  #define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...) template class std::list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >;
+#else
+  #define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...)
+#endif
+
+/**
+ * This section contains a convenience MACRO which allows an easy specialization of
+ * std::list such that for data types with alignment issues the correct allocator
+ * is used automatically.
+ */
+#define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...) \
+EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(__VA_ARGS__) \
+namespace std \
+{ \
+  template<typename _Ay> \
+  class list<__VA_ARGS__, _Ay>  \
+    : public list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \
+  { \
+    typedef list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > list_base; \
+  public: \
+    typedef __VA_ARGS__ value_type; \
+    typedef typename list_base::allocator_type allocator_type; \
+    typedef typename list_base::size_type size_type;  \
+    typedef typename list_base::iterator iterator;  \
+    explicit list(const allocator_type& a = allocator_type()) : list_base(a) {}  \
+    template<typename InputIterator> \
+    list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \
+    list(const list& c) : list_base(c) {}  \
+    explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \
+    list(iterator start, iterator end) : list_base(start, end) {}  \
+    list& operator=(const list& x) {  \
+      list_base::operator=(x);  \
+      return *this;  \
+    } \
+  }; \
+}
+
+// check whether we really need the std::vector specialization
+#if !(defined(_GLIBCXX_VECTOR) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::list::resize(size_type,const T&). */
+
+namespace std
+{
+
+#define EIGEN_STD_LIST_SPECIALIZATION_BODY \
+  public:  \
+    typedef T value_type; \
+    typedef typename list_base::allocator_type allocator_type; \
+    typedef typename list_base::size_type size_type;  \
+    typedef typename list_base::iterator iterator;  \
+    typedef typename list_base::const_iterator const_iterator;  \
+    explicit list(const allocator_type& a = allocator_type()) : list_base(a) {}  \
+    template<typename InputIterator> \
+    list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) \
+    : list_base(first, last, a) {} \
+    list(const list& c) : list_base(c) {}  \
+    explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \
+    list(iterator start, iterator end) : list_base(start, end) {}  \
+    list& operator=(const list& x) {  \
+    list_base::operator=(x);  \
+    return *this; \
+  }
+
+  template<typename T>
+  class list<T,EIGEN_ALIGNED_ALLOCATOR<T> >
+    : public list<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T),
+                  Eigen::aligned_allocator_indirection<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T)> >
+  {
+    typedef list<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T),
+                 Eigen::aligned_allocator_indirection<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T)> > list_base;
+    EIGEN_STD_LIST_SPECIALIZATION_BODY
+
+    void resize(size_type new_size)
+    { resize(new_size, T()); }
+
+    void resize(size_type new_size, const value_type& x)
+    {
+      if (list_base::size() < new_size)
+        list_base::insert(list_base::end(), new_size - list_base::size(), x);
+      else
+        while (new_size < list_base::size()) list_base::pop_back();
+    }
+
+#if defined(_LIST_)
+    // workaround MSVC std::list implementation
+    void push_back(const value_type& x)
+    { list_base::push_back(x); } 
+    using list_base::insert;  
+    iterator insert(const_iterator position, const value_type& x)
+    { return list_base::insert(position,x); }
+    void insert(const_iterator position, size_type new_size, const value_type& x)
+    { list_base::insert(position, new_size, x); }
+#endif
+  };
+}
+
+#endif // check whether specialization is actually required
+
+#endif // EIGEN_STDLIST_H
diff --git a/include/eigen3/Eigen/src/StlSupport/StdVector.h b/include/eigen3/Eigen/src/StlSupport/StdVector.h
new file mode 100644
index 0000000..611664a
--- /dev/null
+++ b/include/eigen3/Eigen/src/StlSupport/StdVector.h
@@ -0,0 +1,126 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2009 Hauke Heibel <hauke.heibel at googlemail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STDVECTOR_H
+#define EIGEN_STDVECTOR_H
+
+#include "details.h"
+
+/**
+ * This section contains a convenience MACRO which allows an easy specialization of
+ * std::vector such that for data types with alignment issues the correct allocator
+ * is used automatically.
+ */
+#define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...) \
+namespace std \
+{ \
+  template<> \
+  class vector<__VA_ARGS__, std::allocator<__VA_ARGS__> >  \
+    : public vector<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \
+  { \
+    typedef vector<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > vector_base; \
+  public: \
+    typedef __VA_ARGS__ value_type; \
+    typedef vector_base::allocator_type allocator_type; \
+    typedef vector_base::size_type size_type;  \
+    typedef vector_base::iterator iterator;  \
+    explicit vector(const allocator_type& a = allocator_type()) : vector_base(a) {}  \
+    template<typename InputIterator> \
+    vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : vector_base(first, last, a) {} \
+    vector(const vector& c) : vector_base(c) {}  \
+    explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \
+    vector(iterator start, iterator end) : vector_base(start, end) {}  \
+    vector& operator=(const vector& x) {  \
+      vector_base::operator=(x);  \
+      return *this;  \
+    } \
+  }; \
+}
+
+namespace std {
+
+#define EIGEN_STD_VECTOR_SPECIALIZATION_BODY \
+  public:  \
+    typedef T value_type; \
+    typedef typename vector_base::allocator_type allocator_type; \
+    typedef typename vector_base::size_type size_type;  \
+    typedef typename vector_base::iterator iterator;  \
+    typedef typename vector_base::const_iterator const_iterator;  \
+    explicit vector(const allocator_type& a = allocator_type()) : vector_base(a) {}  \
+    template<typename InputIterator> \
+    vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) \
+    : vector_base(first, last, a) {} \
+    vector(const vector& c) : vector_base(c) {}  \
+    explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \
+    vector(iterator start, iterator end) : vector_base(start, end) {}  \
+    vector& operator=(const vector& x) {  \
+      vector_base::operator=(x);  \
+      return *this;  \
+    }
+
+  template<typename T>
+  class vector<T,EIGEN_ALIGNED_ALLOCATOR<T> >
+    : public vector<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T),
+                    Eigen::aligned_allocator_indirection<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T)> >
+{
+  typedef vector<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T),
+                 Eigen::aligned_allocator_indirection<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T)> > vector_base;
+  EIGEN_STD_VECTOR_SPECIALIZATION_BODY
+
+  void resize(size_type new_size)
+  { resize(new_size, T()); }
+
+#if defined(_VECTOR_)
+  // workaround MSVC std::vector implementation
+  void resize(size_type new_size, const value_type& x)
+  {
+    if (vector_base::size() < new_size)
+      vector_base::_Insert_n(vector_base::end(), new_size - vector_base::size(), x);
+    else if (new_size < vector_base::size())
+      vector_base::erase(vector_base::begin() + new_size, vector_base::end());
+  }
+  void push_back(const value_type& x)
+  { vector_base::push_back(x); } 
+  using vector_base::insert;  
+  iterator insert(const_iterator position, const value_type& x)
+  { return vector_base::insert(position,x); }
+  void insert(const_iterator position, size_type new_size, const value_type& x)
+  { vector_base::insert(position, new_size, x); }
+#elif defined(_GLIBCXX_VECTOR) && (!(EIGEN_GNUC_AT_LEAST(4,1)))
+  /* Note that before gcc-4.1 we already have: std::vector::resize(size_type,const T&).
+   * However, this specialization is still needed to make the above EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION trick to work. */
+  void resize(size_type new_size, const value_type& x)
+  {
+    vector_base::resize(new_size,x);
+  }
+#elif defined(_GLIBCXX_VECTOR) && EIGEN_GNUC_AT_LEAST(4,2)
+  // workaround GCC std::vector implementation
+  void resize(size_type new_size, const value_type& x)
+  {
+    if (new_size < vector_base::size())
+      vector_base::_M_erase_at_end(this->_M_impl._M_start + new_size);
+    else
+      vector_base::insert(vector_base::end(), new_size - vector_base::size(), x);
+  }
+#else
+  // either GCC 4.1 or non-GCC
+  // default implementation which should always work.
+  void resize(size_type new_size, const value_type& x)
+  {
+    if (new_size < vector_base::size())
+      vector_base::erase(vector_base::begin() + new_size, vector_base::end());
+    else if (new_size > vector_base::size())
+      vector_base::insert(vector_base::end(), new_size - vector_base::size(), x);
+  }
+#endif
+  };
+}
+
+#endif // EIGEN_STDVECTOR_H
diff --git a/include/eigen3/Eigen/src/StlSupport/details.h b/include/eigen3/Eigen/src/StlSupport/details.h
new file mode 100644
index 0000000..d8debc7
--- /dev/null
+++ b/include/eigen3/Eigen/src/StlSupport/details.h
@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2009 Hauke Heibel <hauke.heibel at googlemail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STL_DETAILS_H
+#define EIGEN_STL_DETAILS_H
+
+#ifndef EIGEN_ALIGNED_ALLOCATOR
+  #define EIGEN_ALIGNED_ALLOCATOR Eigen::aligned_allocator
+#endif
+
+namespace Eigen {
+
+  // This one is needed to prevent reimplementing the whole std::vector.
+  template <class T>
+  class aligned_allocator_indirection : public EIGEN_ALIGNED_ALLOCATOR<T>
+  {
+  public:
+    typedef size_t    size_type;
+    typedef ptrdiff_t difference_type;
+    typedef T*        pointer;
+    typedef const T*  const_pointer;
+    typedef T&        reference;
+    typedef const T&  const_reference;
+    typedef T         value_type;
+
+    template<class U>
+    struct rebind
+    {
+      typedef aligned_allocator_indirection<U> other;
+    };
+
+    aligned_allocator_indirection() {}
+    aligned_allocator_indirection(const aligned_allocator_indirection& ) : EIGEN_ALIGNED_ALLOCATOR<T>() {}
+    aligned_allocator_indirection(const EIGEN_ALIGNED_ALLOCATOR<T>& ) {}
+    template<class U>
+    aligned_allocator_indirection(const aligned_allocator_indirection<U>& ) {}
+    template<class U>
+    aligned_allocator_indirection(const EIGEN_ALIGNED_ALLOCATOR<U>& ) {}
+    ~aligned_allocator_indirection() {}
+  };
+
+#ifdef _MSC_VER
+
+  // sometimes, MSVC detects, at compile time, that the argument x
+  // in std::vector::resize(size_t s,T x) won't be aligned and generate an error
+  // even if this function is never called. Whence this little wrapper.
+#define EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T) \
+  typename Eigen::internal::conditional< \
+    Eigen::internal::is_arithmetic<T>::value, \
+    T, \
+    Eigen::internal::workaround_msvc_stl_support<T> \
+  >::type
+
+  namespace internal {
+  template<typename T> struct workaround_msvc_stl_support : public T
+  {
+    inline workaround_msvc_stl_support() : T() {}
+    inline workaround_msvc_stl_support(const T& other) : T(other) {}
+    inline operator T& () { return *static_cast<T*>(this); }
+    inline operator const T& () const { return *static_cast<const T*>(this); }
+    template<typename OtherT>
+    inline T& operator=(const OtherT& other)
+    { T::operator=(other); return *this; }
+    inline workaround_msvc_stl_support& operator=(const workaround_msvc_stl_support& other)
+    { T::operator=(other); return *this; }
+  };
+  }
+
+#else
+
+#define EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T) T
+
+#endif
+
+}
+
+#endif // EIGEN_STL_DETAILS_H
diff --git a/include/eigen3/Eigen/src/SuperLUSupport/SuperLUSupport.h b/include/eigen3/Eigen/src/SuperLUSupport/SuperLUSupport.h
new file mode 100644
index 0000000..bcb3557
--- /dev/null
+++ b/include/eigen3/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -0,0 +1,1026 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SUPERLUSUPPORT_H
+#define EIGEN_SUPERLUSUPPORT_H
+
+namespace Eigen { 
+
+#define DECL_GSSVX(PREFIX,FLOATTYPE,KEYTYPE)		\
+    extern "C" {                                                                                          \
+      typedef struct { FLOATTYPE for_lu; FLOATTYPE total_needed; int expansions; } PREFIX##mem_usage_t;   \
+      extern void PREFIX##gssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,                  \
+                                char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,           \
+                                void *, int, SuperMatrix *, SuperMatrix *,                                \
+                                FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, FLOATTYPE *,                       \
+                                PREFIX##mem_usage_t *, SuperLUStat_t *, int *);                           \
+    }                                                                                                     \
+    inline float SuperLU_gssvx(superlu_options_t *options, SuperMatrix *A,                                \
+         int *perm_c, int *perm_r, int *etree, char *equed,                                               \
+         FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L,                                                      \
+         SuperMatrix *U, void *work, int lwork,                                                           \
+         SuperMatrix *B, SuperMatrix *X,                                                                  \
+         FLOATTYPE *recip_pivot_growth,                                                                   \
+         FLOATTYPE *rcond, FLOATTYPE *ferr, FLOATTYPE *berr,                                              \
+         SuperLUStat_t *stats, int *info, KEYTYPE) {                                                      \
+    PREFIX##mem_usage_t mem_usage;                                                                        \
+    PREFIX##gssvx(options, A, perm_c, perm_r, etree, equed, R, C, L,                                      \
+         U, work, lwork, B, X, recip_pivot_growth, rcond,                                                 \
+         ferr, berr, &mem_usage, stats, info);                                                            \
+    return mem_usage.for_lu; /* bytes used by the factor storage */                                       \
+  }
+
+DECL_GSSVX(s,float,float)
+DECL_GSSVX(c,float,std::complex<float>)
+DECL_GSSVX(d,double,double)
+DECL_GSSVX(z,double,std::complex<double>)
+
+#ifdef MILU_ALPHA
+#define EIGEN_SUPERLU_HAS_ILU
+#endif
+
+#ifdef EIGEN_SUPERLU_HAS_ILU
+
+// similarly for the incomplete factorization using gsisx
+#define DECL_GSISX(PREFIX,FLOATTYPE,KEYTYPE)                                                    \
+    extern "C" {                                                                                \
+      extern void PREFIX##gsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *,        \
+                         char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,        \
+                         void *, int, SuperMatrix *, SuperMatrix *, FLOATTYPE *, FLOATTYPE *,   \
+                         PREFIX##mem_usage_t *, SuperLUStat_t *, int *);                        \
+    }                                                                                           \
+    inline float SuperLU_gsisx(superlu_options_t *options, SuperMatrix *A,                      \
+         int *perm_c, int *perm_r, int *etree, char *equed,                                     \
+         FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L,                                            \
+         SuperMatrix *U, void *work, int lwork,                                                 \
+         SuperMatrix *B, SuperMatrix *X,                                                        \
+         FLOATTYPE *recip_pivot_growth,                                                         \
+         FLOATTYPE *rcond,                                                                      \
+         SuperLUStat_t *stats, int *info, KEYTYPE) {                                            \
+    PREFIX##mem_usage_t mem_usage;                                                              \
+    PREFIX##gsisx(options, A, perm_c, perm_r, etree, equed, R, C, L,                            \
+         U, work, lwork, B, X, recip_pivot_growth, rcond,                                       \
+         &mem_usage, stats, info);                                                              \
+    return mem_usage.for_lu; /* bytes used by the factor storage */                             \
+  }
+
+DECL_GSISX(s,float,float)
+DECL_GSISX(c,float,std::complex<float>)
+DECL_GSISX(d,double,double)
+DECL_GSISX(z,double,std::complex<double>)
+
+#endif
+
+template<typename MatrixType>
+struct SluMatrixMapHelper;
+
+/** \internal
+  *
+  * A wrapper class for SuperLU matrices. It supports only compressed sparse matrices
+  * and dense matrices. Supernodal and other fancy format are not supported by this wrapper.
+  *
+  * This wrapper class mainly aims to avoids the need of dynamic allocation of the storage structure.
+  */
+struct SluMatrix : SuperMatrix
+{
+  SluMatrix()
+  {
+    Store = &storage;
+  }
+
+  SluMatrix(const SluMatrix& other)
+    : SuperMatrix(other)
+  {
+    Store = &storage;
+    storage = other.storage;
+  }
+
+  SluMatrix& operator=(const SluMatrix& other)
+  {
+    SuperMatrix::operator=(static_cast<const SuperMatrix&>(other));
+    Store = &storage;
+    storage = other.storage;
+    return *this;
+  }
+
+  struct
+  {
+    union {int nnz;int lda;};
+    void *values;
+    int *innerInd;
+    int *outerInd;
+  } storage;
+
+  void setStorageType(Stype_t t)
+  {
+    Stype = t;
+    if (t==SLU_NC || t==SLU_NR || t==SLU_DN)
+      Store = &storage;
+    else
+    {
+      eigen_assert(false && "storage type not supported");
+      Store = 0;
+    }
+  }
+
+  template<typename Scalar>
+  void setScalarType()
+  {
+    if (internal::is_same<Scalar,float>::value)
+      Dtype = SLU_S;
+    else if (internal::is_same<Scalar,double>::value)
+      Dtype = SLU_D;
+    else if (internal::is_same<Scalar,std::complex<float> >::value)
+      Dtype = SLU_C;
+    else if (internal::is_same<Scalar,std::complex<double> >::value)
+      Dtype = SLU_Z;
+    else
+    {
+      eigen_assert(false && "Scalar type not supported by SuperLU");
+    }
+  }
+
+  template<typename MatrixType>
+  static SluMatrix Map(MatrixBase<MatrixType>& _mat)
+  {
+    MatrixType& mat(_mat.derived());
+    eigen_assert( ((MatrixType::Flags&RowMajorBit)!=RowMajorBit) && "row-major dense matrices are not supported by SuperLU");
+    SluMatrix res;
+    res.setStorageType(SLU_DN);
+    res.setScalarType<typename MatrixType::Scalar>();
+    res.Mtype     = SLU_GE;
+
+    res.nrow      = mat.rows();
+    res.ncol      = mat.cols();
+
+    res.storage.lda       = MatrixType::IsVectorAtCompileTime ? mat.size() : mat.outerStride();
+    res.storage.values    = (void*)(mat.data());
+    return res;
+  }
+
+  template<typename MatrixType>
+  static SluMatrix Map(SparseMatrixBase<MatrixType>& mat)
+  {
+    SluMatrix res;
+    if ((MatrixType::Flags&RowMajorBit)==RowMajorBit)
+    {
+      res.setStorageType(SLU_NR);
+      res.nrow      = mat.cols();
+      res.ncol      = mat.rows();
+    }
+    else
+    {
+      res.setStorageType(SLU_NC);
+      res.nrow      = mat.rows();
+      res.ncol      = mat.cols();
+    }
+
+    res.Mtype       = SLU_GE;
+
+    res.storage.nnz       = mat.nonZeros();
+    res.storage.values    = mat.derived().valuePtr();
+    res.storage.innerInd  = mat.derived().innerIndexPtr();
+    res.storage.outerInd  = mat.derived().outerIndexPtr();
+
+    res.setScalarType<typename MatrixType::Scalar>();
+
+    // FIXME the following is not very accurate
+    if (MatrixType::Flags & Upper)
+      res.Mtype = SLU_TRU;
+    if (MatrixType::Flags & Lower)
+      res.Mtype = SLU_TRL;
+
+    eigen_assert(((MatrixType::Flags & SelfAdjoint)==0) && "SelfAdjoint matrix shape not supported by SuperLU");
+
+    return res;
+  }
+};
+
+template<typename Scalar, int Rows, int Cols, int Options, int MRows, int MCols>
+struct SluMatrixMapHelper<Matrix<Scalar,Rows,Cols,Options,MRows,MCols> >
+{
+  typedef Matrix<Scalar,Rows,Cols,Options,MRows,MCols> MatrixType;
+  static void run(MatrixType& mat, SluMatrix& res)
+  {
+    eigen_assert( ((Options&RowMajor)!=RowMajor) && "row-major dense matrices is not supported by SuperLU");
+    res.setStorageType(SLU_DN);
+    res.setScalarType<Scalar>();
+    res.Mtype     = SLU_GE;
+
+    res.nrow      = mat.rows();
+    res.ncol      = mat.cols();
+
+    res.storage.lda       = mat.outerStride();
+    res.storage.values    = mat.data();
+  }
+};
+
+template<typename Derived>
+struct SluMatrixMapHelper<SparseMatrixBase<Derived> >
+{
+  typedef Derived MatrixType;
+  static void run(MatrixType& mat, SluMatrix& res)
+  {
+    if ((MatrixType::Flags&RowMajorBit)==RowMajorBit)
+    {
+      res.setStorageType(SLU_NR);
+      res.nrow      = mat.cols();
+      res.ncol      = mat.rows();
+    }
+    else
+    {
+      res.setStorageType(SLU_NC);
+      res.nrow      = mat.rows();
+      res.ncol      = mat.cols();
+    }
+
+    res.Mtype       = SLU_GE;
+
+    res.storage.nnz       = mat.nonZeros();
+    res.storage.values    = mat.valuePtr();
+    res.storage.innerInd  = mat.innerIndexPtr();
+    res.storage.outerInd  = mat.outerIndexPtr();
+
+    res.setScalarType<typename MatrixType::Scalar>();
+
+    // FIXME the following is not very accurate
+    if (MatrixType::Flags & Upper)
+      res.Mtype = SLU_TRU;
+    if (MatrixType::Flags & Lower)
+      res.Mtype = SLU_TRL;
+
+    eigen_assert(((MatrixType::Flags & SelfAdjoint)==0) && "SelfAdjoint matrix shape not supported by SuperLU");
+  }
+};
+
+namespace internal {
+
+template<typename MatrixType>
+SluMatrix asSluMatrix(MatrixType& mat)
+{
+  return SluMatrix::Map(mat);
+}
+
+/** View a Super LU matrix as an Eigen expression */
+template<typename Scalar, int Flags, typename Index>
+MappedSparseMatrix<Scalar,Flags,Index> map_superlu(SluMatrix& sluMat)
+{
+  eigen_assert((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR
+         || (Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC);
+
+  Index outerSize = (Flags&RowMajor)==RowMajor ? sluMat.ncol : sluMat.nrow;
+
+  return MappedSparseMatrix<Scalar,Flags,Index>(
+    sluMat.nrow, sluMat.ncol, sluMat.storage.outerInd[outerSize],
+    sluMat.storage.outerInd, sluMat.storage.innerInd, reinterpret_cast<Scalar*>(sluMat.storage.values) );
+}
+
+} // end namespace internal
+
+/** \ingroup SuperLUSupport_Module
+  * \class SuperLUBase
+  * \brief The base class for the direct and incomplete LU factorization of SuperLU
+  */
+template<typename _MatrixType, typename Derived>
+class SuperLUBase : internal::noncopyable
+{
+  public:
+    typedef _MatrixType MatrixType;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+    typedef Matrix<Scalar,Dynamic,1> Vector;
+    typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+    typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;    
+    typedef SparseMatrix<Scalar> LUMatrixType;
+
+  public:
+
+    SuperLUBase() {}
+
+    ~SuperLUBase()
+    {
+      clearFactors();
+    }
+    
+    Derived& derived() { return *static_cast<Derived*>(this); }
+    const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }
+    
+    /** \returns a reference to the Super LU option object to configure the  Super LU algorithms. */
+    inline superlu_options_t& options() { return m_sluOptions; }
+    
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful,
+      *          \c NumericalIssue if the matrix.appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return m_info;
+    }
+
+    /** Computes the sparse Cholesky decomposition of \a matrix */
+    void compute(const MatrixType& matrix)
+    {
+      derived().analyzePattern(matrix);
+      derived().factorize(matrix);
+    }
+    
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<SuperLUBase, Rhs> solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "SuperLU is not initialized.");
+      eigen_assert(rows()==b.rows()
+                && "SuperLU::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::solve_retval<SuperLUBase, Rhs>(*this, b.derived());
+    }
+    
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::sparse_solve_retval<SuperLUBase, Rhs> solve(const SparseMatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "SuperLU is not initialized.");
+      eigen_assert(rows()==b.rows()
+                && "SuperLU::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::sparse_solve_retval<SuperLUBase, Rhs>(*this, b.derived());
+    }
+    
+    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
+      * 
+      * \sa factorize()
+      */
+    void analyzePattern(const MatrixType& /*matrix*/)
+    {
+      m_isInitialized = true;
+      m_info = Success;
+      m_analysisIsOk = true;
+      m_factorizationIsOk = false;
+    }
+    
+    template<typename Stream>
+    void dumpMemory(Stream& /*s*/)
+    {}
+    
+  protected:
+    
+    void initFactorization(const MatrixType& a)
+    {
+      set_default_options(&this->m_sluOptions);
+      
+      const int size = a.rows();
+      m_matrix = a;
+
+      m_sluA = internal::asSluMatrix(m_matrix);
+      clearFactors();
+
+      m_p.resize(size);
+      m_q.resize(size);
+      m_sluRscale.resize(size);
+      m_sluCscale.resize(size);
+      m_sluEtree.resize(size);
+
+      // set empty B and X
+      m_sluB.setStorageType(SLU_DN);
+      m_sluB.setScalarType<Scalar>();
+      m_sluB.Mtype          = SLU_GE;
+      m_sluB.storage.values = 0;
+      m_sluB.nrow           = 0;
+      m_sluB.ncol           = 0;
+      m_sluB.storage.lda    = size;
+      m_sluX                = m_sluB;
+      
+      m_extractedDataAreDirty = true;
+    }
+    
+    void init()
+    {
+      m_info = InvalidInput;
+      m_isInitialized = false;
+      m_sluL.Store = 0;
+      m_sluU.Store = 0;
+    }
+    
+    void extractData() const;
+
+    void clearFactors()
+    {
+      if(m_sluL.Store)
+        Destroy_SuperNode_Matrix(&m_sluL);
+      if(m_sluU.Store)
+        Destroy_CompCol_Matrix(&m_sluU);
+
+      m_sluL.Store = 0;
+      m_sluU.Store = 0;
+
+      memset(&m_sluL,0,sizeof m_sluL);
+      memset(&m_sluU,0,sizeof m_sluU);
+    }
+
+    // cached data to reduce reallocation, etc.
+    mutable LUMatrixType m_l;
+    mutable LUMatrixType m_u;
+    mutable IntColVectorType m_p;
+    mutable IntRowVectorType m_q;
+
+    mutable LUMatrixType m_matrix;  // copy of the factorized matrix
+    mutable SluMatrix m_sluA;
+    mutable SuperMatrix m_sluL, m_sluU;
+    mutable SluMatrix m_sluB, m_sluX;
+    mutable SuperLUStat_t m_sluStat;
+    mutable superlu_options_t m_sluOptions;
+    mutable std::vector<int> m_sluEtree;
+    mutable Matrix<RealScalar,Dynamic,1> m_sluRscale, m_sluCscale;
+    mutable Matrix<RealScalar,Dynamic,1> m_sluFerr, m_sluBerr;
+    mutable char m_sluEqued;
+
+    mutable ComputationInfo m_info;
+    bool m_isInitialized;
+    int m_factorizationIsOk;
+    int m_analysisIsOk;
+    mutable bool m_extractedDataAreDirty;
+    
+  private:
+    SuperLUBase(SuperLUBase& ) { }
+};
+
+
+/** \ingroup SuperLUSupport_Module
+  * \class SuperLU
+  * \brief A sparse direct LU factorization and solver based on the SuperLU library
+  *
+  * This class allows to solve for A.X = B sparse linear problems via a direct LU factorization
+  * using the SuperLU library. The sparse matrix A must be squared and invertible. The vectors or matrices
+  * X and B can be either dense or sparse.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  *
+  * \sa \ref TutorialSparseDirectSolvers
+  */
+template<typename _MatrixType>
+class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> >
+{
+  public:
+    typedef SuperLUBase<_MatrixType,SuperLU> Base;
+    typedef _MatrixType MatrixType;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::RealScalar RealScalar;
+    typedef typename Base::Index Index;
+    typedef typename Base::IntRowVectorType IntRowVectorType;
+    typedef typename Base::IntColVectorType IntColVectorType;    
+    typedef typename Base::LUMatrixType LUMatrixType;
+    typedef TriangularView<LUMatrixType, Lower|UnitDiag>  LMatrixType;
+    typedef TriangularView<LUMatrixType,  Upper>           UMatrixType;
+
+  public:
+
+    SuperLU() : Base() { init(); }
+
+    SuperLU(const MatrixType& matrix) : Base()
+    {
+      init();
+      Base::compute(matrix);
+    }
+
+    ~SuperLU()
+    {
+    }
+    
+    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
+      * 
+      * \sa factorize()
+      */
+    void analyzePattern(const MatrixType& matrix)
+    {
+      m_info = InvalidInput;
+      m_isInitialized = false;
+      Base::analyzePattern(matrix);
+    }
+    
+    /** Performs a numeric decomposition of \a matrix
+      *
+      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+      *
+      * \sa analyzePattern()
+      */
+    void factorize(const MatrixType& matrix);
+    
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal */
+    template<typename Rhs,typename Dest>
+    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
+    #endif // EIGEN_PARSED_BY_DOXYGEN
+    
+    inline const LMatrixType& matrixL() const
+    {
+      if (m_extractedDataAreDirty) this->extractData();
+      return m_l;
+    }
+
+    inline const UMatrixType& matrixU() const
+    {
+      if (m_extractedDataAreDirty) this->extractData();
+      return m_u;
+    }
+
+    inline const IntColVectorType& permutationP() const
+    {
+      if (m_extractedDataAreDirty) this->extractData();
+      return m_p;
+    }
+
+    inline const IntRowVectorType& permutationQ() const
+    {
+      if (m_extractedDataAreDirty) this->extractData();
+      return m_q;
+    }
+    
+    Scalar determinant() const;
+    
+  protected:
+    
+    using Base::m_matrix;
+    using Base::m_sluOptions;
+    using Base::m_sluA;
+    using Base::m_sluB;
+    using Base::m_sluX;
+    using Base::m_p;
+    using Base::m_q;
+    using Base::m_sluEtree;
+    using Base::m_sluEqued;
+    using Base::m_sluRscale;
+    using Base::m_sluCscale;
+    using Base::m_sluL;
+    using Base::m_sluU;
+    using Base::m_sluStat;
+    using Base::m_sluFerr;
+    using Base::m_sluBerr;
+    using Base::m_l;
+    using Base::m_u;
+    
+    using Base::m_analysisIsOk;
+    using Base::m_factorizationIsOk;
+    using Base::m_extractedDataAreDirty;
+    using Base::m_isInitialized;
+    using Base::m_info;
+    
+    void init()
+    {
+      Base::init();
+      
+      set_default_options(&this->m_sluOptions);
+      m_sluOptions.PrintStat        = NO;
+      m_sluOptions.ConditionNumber  = NO;
+      m_sluOptions.Trans            = NOTRANS;
+      m_sluOptions.ColPerm          = COLAMD;
+    }
+    
+    
+  private:
+    SuperLU(SuperLU& ) { }
+};
+
+template<typename MatrixType>
+void SuperLU<MatrixType>::factorize(const MatrixType& a)
+{
+  eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
+  if(!m_analysisIsOk)
+  {
+    m_info = InvalidInput;
+    return;
+  }
+  
+  this->initFactorization(a);
+  
+  m_sluOptions.ColPerm = COLAMD;
+  int info = 0;
+  RealScalar recip_pivot_growth, rcond;
+  RealScalar ferr, berr;
+
+  StatInit(&m_sluStat);
+  SuperLU_gssvx(&m_sluOptions, &m_sluA, m_q.data(), m_p.data(), &m_sluEtree[0],
+                &m_sluEqued, &m_sluRscale[0], &m_sluCscale[0],
+                &m_sluL, &m_sluU,
+                NULL, 0,
+                &m_sluB, &m_sluX,
+                &recip_pivot_growth, &rcond,
+                &ferr, &berr,
+                &m_sluStat, &info, Scalar());
+  StatFree(&m_sluStat);
+
+  m_extractedDataAreDirty = true;
+
+  // FIXME how to better check for errors ???
+  m_info = info == 0 ? Success : NumericalIssue;
+  m_factorizationIsOk = true;
+}
+
+template<typename MatrixType>
+template<typename Rhs,typename Dest>
+void SuperLU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
+{
+  eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
+
+  const int size = m_matrix.rows();
+  const int rhsCols = b.cols();
+  eigen_assert(size==b.rows());
+
+  m_sluOptions.Trans = NOTRANS;
+  m_sluOptions.Fact = FACTORED;
+  m_sluOptions.IterRefine = NOREFINE;
+  
+
+  m_sluFerr.resize(rhsCols);
+  m_sluBerr.resize(rhsCols);
+  m_sluB = SluMatrix::Map(b.const_cast_derived());
+  m_sluX = SluMatrix::Map(x.derived());
+  
+  typename Rhs::PlainObject b_cpy;
+  if(m_sluEqued!='N')
+  {
+    b_cpy = b;
+    m_sluB = SluMatrix::Map(b_cpy.const_cast_derived());  
+  }
+
+  StatInit(&m_sluStat);
+  int info = 0;
+  RealScalar recip_pivot_growth, rcond;
+  SuperLU_gssvx(&m_sluOptions, &m_sluA,
+                m_q.data(), m_p.data(),
+                &m_sluEtree[0], &m_sluEqued,
+                &m_sluRscale[0], &m_sluCscale[0],
+                &m_sluL, &m_sluU,
+                NULL, 0,
+                &m_sluB, &m_sluX,
+                &recip_pivot_growth, &rcond,
+                &m_sluFerr[0], &m_sluBerr[0],
+                &m_sluStat, &info, Scalar());
+  StatFree(&m_sluStat);
+  m_info = info==0 ? Success : NumericalIssue;
+}
+
+// the code of this extractData() function has been adapted from the SuperLU's Matlab support code,
+//
+//  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+//
+//  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+//  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+//
+template<typename MatrixType, typename Derived>
+void SuperLUBase<MatrixType,Derived>::extractData() const
+{
+  eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for extracting factors, you must first call either compute() or analyzePattern()/factorize()");
+  if (m_extractedDataAreDirty)
+  {
+    int         upper;
+    int         fsupc, istart, nsupr;
+    int         lastl = 0, lastu = 0;
+    SCformat    *Lstore = static_cast<SCformat*>(m_sluL.Store);
+    NCformat    *Ustore = static_cast<NCformat*>(m_sluU.Store);
+    Scalar      *SNptr;
+
+    const int size = m_matrix.rows();
+    m_l.resize(size,size);
+    m_l.resizeNonZeros(Lstore->nnz);
+    m_u.resize(size,size);
+    m_u.resizeNonZeros(Ustore->nnz);
+
+    int* Lcol = m_l.outerIndexPtr();
+    int* Lrow = m_l.innerIndexPtr();
+    Scalar* Lval = m_l.valuePtr();
+
+    int* Ucol = m_u.outerIndexPtr();
+    int* Urow = m_u.innerIndexPtr();
+    Scalar* Uval = m_u.valuePtr();
+
+    Ucol[0] = 0;
+    Ucol[0] = 0;
+
+    /* for each supernode */
+    for (int k = 0; k <= Lstore->nsuper; ++k)
+    {
+      fsupc   = L_FST_SUPC(k);
+      istart  = L_SUB_START(fsupc);
+      nsupr   = L_SUB_START(fsupc+1) - istart;
+      upper   = 1;
+
+      /* for each column in the supernode */
+      for (int j = fsupc; j < L_FST_SUPC(k+1); ++j)
+      {
+        SNptr = &((Scalar*)Lstore->nzval)[L_NZ_START(j)];
+
+        /* Extract U */
+        for (int i = U_NZ_START(j); i < U_NZ_START(j+1); ++i)
+        {
+          Uval[lastu] = ((Scalar*)Ustore->nzval)[i];
+          /* Matlab doesn't like explicit zero. */
+          if (Uval[lastu] != 0.0)
+            Urow[lastu++] = U_SUB(i);
+        }
+        for (int i = 0; i < upper; ++i)
+        {
+          /* upper triangle in the supernode */
+          Uval[lastu] = SNptr[i];
+          /* Matlab doesn't like explicit zero. */
+          if (Uval[lastu] != 0.0)
+            Urow[lastu++] = L_SUB(istart+i);
+        }
+        Ucol[j+1] = lastu;
+
+        /* Extract L */
+        Lval[lastl] = 1.0; /* unit diagonal */
+        Lrow[lastl++] = L_SUB(istart + upper - 1);
+        for (int i = upper; i < nsupr; ++i)
+        {
+          Lval[lastl] = SNptr[i];
+          /* Matlab doesn't like explicit zero. */
+          if (Lval[lastl] != 0.0)
+            Lrow[lastl++] = L_SUB(istart+i);
+        }
+        Lcol[j+1] = lastl;
+
+        ++upper;
+      } /* for j ... */
+
+    } /* for k ... */
+
+    // squeeze the matrices :
+    m_l.resizeNonZeros(lastl);
+    m_u.resizeNonZeros(lastu);
+
+    m_extractedDataAreDirty = false;
+  }
+}
+
+template<typename MatrixType>
+typename SuperLU<MatrixType>::Scalar SuperLU<MatrixType>::determinant() const
+{
+  eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for computing the determinant, you must first call either compute() or analyzePattern()/factorize()");
+  
+  if (m_extractedDataAreDirty)
+    this->extractData();
+
+  Scalar det = Scalar(1);
+  for (int j=0; j<m_u.cols(); ++j)
+  {
+    if (m_u.outerIndexPtr()[j+1]-m_u.outerIndexPtr()[j] > 0)
+    {
+      int lastId = m_u.outerIndexPtr()[j+1]-1;
+      eigen_assert(m_u.innerIndexPtr()[lastId]<=j);
+      if (m_u.innerIndexPtr()[lastId]==j)
+        det *= m_u.valuePtr()[lastId];
+    }
+  }
+  if(m_sluEqued!='N')
+    return det/m_sluRscale.prod()/m_sluCscale.prod();
+  else
+    return det;
+}
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+#define EIGEN_SUPERLU_HAS_ILU
+#endif
+
+#ifdef EIGEN_SUPERLU_HAS_ILU
+
+/** \ingroup SuperLUSupport_Module
+  * \class SuperILU
+  * \brief A sparse direct \b incomplete LU factorization and solver based on the SuperLU library
+  *
+  * This class allows to solve for an approximate solution of A.X = B sparse linear problems via an incomplete LU factorization
+  * using the SuperLU library. This class is aimed to be used as a preconditioner of the iterative linear solvers.
+  *
+  * \warning This class requires SuperLU 4 or later.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  *
+  * \sa \ref TutorialSparseDirectSolvers, class ConjugateGradient, class BiCGSTAB
+  */
+
+template<typename _MatrixType>
+class SuperILU : public SuperLUBase<_MatrixType,SuperILU<_MatrixType> >
+{
+  public:
+    typedef SuperLUBase<_MatrixType,SuperILU> Base;
+    typedef _MatrixType MatrixType;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::RealScalar RealScalar;
+    typedef typename Base::Index Index;
+
+  public:
+
+    SuperILU() : Base() { init(); }
+
+    SuperILU(const MatrixType& matrix) : Base()
+    {
+      init();
+      Base::compute(matrix);
+    }
+
+    ~SuperILU()
+    {
+    }
+    
+    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
+      * 
+      * \sa factorize()
+      */
+    void analyzePattern(const MatrixType& matrix)
+    {
+      Base::analyzePattern(matrix);
+    }
+    
+    /** Performs a numeric decomposition of \a matrix
+      *
+      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+      *
+      * \sa analyzePattern()
+      */
+    void factorize(const MatrixType& matrix);
+    
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal */
+    template<typename Rhs,typename Dest>
+    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
+    #endif // EIGEN_PARSED_BY_DOXYGEN
+    
+  protected:
+    
+    using Base::m_matrix;
+    using Base::m_sluOptions;
+    using Base::m_sluA;
+    using Base::m_sluB;
+    using Base::m_sluX;
+    using Base::m_p;
+    using Base::m_q;
+    using Base::m_sluEtree;
+    using Base::m_sluEqued;
+    using Base::m_sluRscale;
+    using Base::m_sluCscale;
+    using Base::m_sluL;
+    using Base::m_sluU;
+    using Base::m_sluStat;
+    using Base::m_sluFerr;
+    using Base::m_sluBerr;
+    using Base::m_l;
+    using Base::m_u;
+    
+    using Base::m_analysisIsOk;
+    using Base::m_factorizationIsOk;
+    using Base::m_extractedDataAreDirty;
+    using Base::m_isInitialized;
+    using Base::m_info;
+
+    void init()
+    {
+      Base::init();
+      
+      ilu_set_default_options(&m_sluOptions);
+      m_sluOptions.PrintStat        = NO;
+      m_sluOptions.ConditionNumber  = NO;
+      m_sluOptions.Trans            = NOTRANS;
+      m_sluOptions.ColPerm          = MMD_AT_PLUS_A;
+      
+      // no attempt to preserve column sum
+      m_sluOptions.ILU_MILU = SILU;
+      // only basic ILU(k) support -- no direct control over memory consumption
+      // better to use ILU_DropRule = DROP_BASIC | DROP_AREA
+      // and set ILU_FillFactor to max memory growth
+      m_sluOptions.ILU_DropRule = DROP_BASIC;
+      m_sluOptions.ILU_DropTol = NumTraits<Scalar>::dummy_precision()*10;
+    }
+    
+  private:
+    SuperILU(SuperILU& ) { }
+};
+
+template<typename MatrixType>
+void SuperILU<MatrixType>::factorize(const MatrixType& a)
+{
+  eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
+  if(!m_analysisIsOk)
+  {
+    m_info = InvalidInput;
+    return;
+  }
+  
+  this->initFactorization(a);
+
+  int info = 0;
+  RealScalar recip_pivot_growth, rcond;
+
+  StatInit(&m_sluStat);
+  SuperLU_gsisx(&m_sluOptions, &m_sluA, m_q.data(), m_p.data(), &m_sluEtree[0],
+                &m_sluEqued, &m_sluRscale[0], &m_sluCscale[0],
+                &m_sluL, &m_sluU,
+                NULL, 0,
+                &m_sluB, &m_sluX,
+                &recip_pivot_growth, &rcond,
+                &m_sluStat, &info, Scalar());
+  StatFree(&m_sluStat);
+
+  // FIXME how to better check for errors ???
+  m_info = info == 0 ? Success : NumericalIssue;
+  m_factorizationIsOk = true;
+}
+
+template<typename MatrixType>
+template<typename Rhs,typename Dest>
+void SuperILU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
+{
+  eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
+
+  const int size = m_matrix.rows();
+  const int rhsCols = b.cols();
+  eigen_assert(size==b.rows());
+
+  m_sluOptions.Trans = NOTRANS;
+  m_sluOptions.Fact = FACTORED;
+  m_sluOptions.IterRefine = NOREFINE;
+
+  m_sluFerr.resize(rhsCols);
+  m_sluBerr.resize(rhsCols);
+  m_sluB = SluMatrix::Map(b.const_cast_derived());
+  m_sluX = SluMatrix::Map(x.derived());
+
+  typename Rhs::PlainObject b_cpy;
+  if(m_sluEqued!='N')
+  {
+    b_cpy = b;
+    m_sluB = SluMatrix::Map(b_cpy.const_cast_derived());  
+  }
+  
+  int info = 0;
+  RealScalar recip_pivot_growth, rcond;
+
+  StatInit(&m_sluStat);
+  SuperLU_gsisx(&m_sluOptions, &m_sluA,
+                m_q.data(), m_p.data(),
+                &m_sluEtree[0], &m_sluEqued,
+                &m_sluRscale[0], &m_sluCscale[0],
+                &m_sluL, &m_sluU,
+                NULL, 0,
+                &m_sluB, &m_sluX,
+                &recip_pivot_growth, &rcond,
+                &m_sluStat, &info, Scalar());
+  StatFree(&m_sluStat);
+
+  m_info = info==0 ? Success : NumericalIssue;
+}
+#endif
+
+namespace internal {
+  
+template<typename _MatrixType, typename Derived, typename Rhs>
+struct solve_retval<SuperLUBase<_MatrixType,Derived>, Rhs>
+  : solve_retval_base<SuperLUBase<_MatrixType,Derived>, Rhs>
+{
+  typedef SuperLUBase<_MatrixType,Derived> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec().derived()._solve(rhs(),dst);
+  }
+};
+
+template<typename _MatrixType, typename Derived, typename Rhs>
+struct sparse_solve_retval<SuperLUBase<_MatrixType,Derived>, Rhs>
+  : sparse_solve_retval_base<SuperLUBase<_MatrixType,Derived>, Rhs>
+{
+  typedef SuperLUBase<_MatrixType,Derived> Dec;
+  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    this->defaultEvalTo(dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SUPERLUSUPPORT_H
diff --git a/include/eigen3/Eigen/src/UmfPackSupport/UmfPackSupport.h b/include/eigen3/Eigen/src/UmfPackSupport/UmfPackSupport.h
new file mode 100644
index 0000000..29c60c3
--- /dev/null
+++ b/include/eigen3/Eigen/src/UmfPackSupport/UmfPackSupport.h
@@ -0,0 +1,474 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_UMFPACKSUPPORT_H
+#define EIGEN_UMFPACKSUPPORT_H
+
+namespace Eigen { 
+
+/* TODO extract L, extract U, compute det, etc... */
+
+// generic double/complex<double> wrapper functions:
+
+inline void umfpack_free_numeric(void **Numeric, double)
+{ umfpack_di_free_numeric(Numeric); *Numeric = 0; }
+
+inline void umfpack_free_numeric(void **Numeric, std::complex<double>)
+{ umfpack_zi_free_numeric(Numeric); *Numeric = 0; }
+
+inline void umfpack_free_symbolic(void **Symbolic, double)
+{ umfpack_di_free_symbolic(Symbolic); *Symbolic = 0; }
+
+inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>)
+{ umfpack_zi_free_symbolic(Symbolic); *Symbolic = 0; }
+
+inline int umfpack_symbolic(int n_row,int n_col,
+                            const int Ap[], const int Ai[], const double Ax[], void **Symbolic,
+                            const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
+{
+  return umfpack_di_symbolic(n_row,n_col,Ap,Ai,Ax,Symbolic,Control,Info);
+}
+
+inline int umfpack_symbolic(int n_row,int n_col,
+                            const int Ap[], const int Ai[], const std::complex<double> Ax[], void **Symbolic,
+                            const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
+{
+  return umfpack_zi_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info);
+}
+
+inline int umfpack_numeric( const int Ap[], const int Ai[], const double Ax[],
+                            void *Symbolic, void **Numeric,
+                            const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
+{
+  return umfpack_di_numeric(Ap,Ai,Ax,Symbolic,Numeric,Control,Info);
+}
+
+inline int umfpack_numeric( const int Ap[], const int Ai[], const std::complex<double> Ax[],
+                            void *Symbolic, void **Numeric,
+                            const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
+{
+  return umfpack_zi_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info);
+}
+
+inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const double Ax[],
+                          double X[], const double B[], void *Numeric,
+                          const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
+{
+  return umfpack_di_solve(sys,Ap,Ai,Ax,X,B,Numeric,Control,Info);
+}
+
+inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const std::complex<double> Ax[],
+                          std::complex<double> X[], const std::complex<double> B[], void *Numeric,
+                          const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
+{
+  return umfpack_zi_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info);
+}
+
+inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, double)
+{
+  return umfpack_di_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
+}
+
+inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, std::complex<double>)
+{
+  return umfpack_zi_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
+}
+
+inline int umfpack_get_numeric(int Lp[], int Lj[], double Lx[], int Up[], int Ui[], double Ux[],
+                               int P[], int Q[], double Dx[], int *do_recip, double Rs[], void *Numeric)
+{
+  return umfpack_di_get_numeric(Lp,Lj,Lx,Up,Ui,Ux,P,Q,Dx,do_recip,Rs,Numeric);
+}
+
+inline int umfpack_get_numeric(int Lp[], int Lj[], std::complex<double> Lx[], int Up[], int Ui[], std::complex<double> Ux[],
+                               int P[], int Q[], std::complex<double> Dx[], int *do_recip, double Rs[], void *Numeric)
+{
+  double& lx0_real = numext::real_ref(Lx[0]);
+  double& ux0_real = numext::real_ref(Ux[0]);
+  double& dx0_real = numext::real_ref(Dx[0]);
+  return umfpack_zi_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q,
+                                Dx?&dx0_real:0,0,do_recip,Rs,Numeric);
+}
+
+inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO])
+{
+  return umfpack_di_get_determinant(Mx,Ex,NumericHandle,User_Info);
+}
+
+inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO])
+{
+  double& mx_real = numext::real_ref(*Mx);
+  return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
+}
+
+namespace internal {
+  template<typename T> struct umfpack_helper_is_sparse_plain : false_type {};
+  template<typename Scalar, int Options, typename StorageIndex>
+  struct umfpack_helper_is_sparse_plain<SparseMatrix<Scalar,Options,StorageIndex> >
+    : true_type {};
+  template<typename Scalar, int Options, typename StorageIndex>
+  struct umfpack_helper_is_sparse_plain<MappedSparseMatrix<Scalar,Options,StorageIndex> >
+    : true_type {};
+}
+
+/** \ingroup UmfPackSupport_Module
+  * \brief A sparse LU factorization and solver based on UmfPack
+  *
+  * This class allows to solve for A.X = B sparse linear problems via a LU factorization
+  * using the UmfPack library. The sparse matrix A must be squared and full rank.
+  * The vectors or matrices X and B can be either dense or sparse.
+  *
+  * \warning The input matrix A should be in a \b compressed and \b column-major form.
+  * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix.
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  *
+  * \sa \ref TutorialSparseDirectSolvers
+  */
+template<typename _MatrixType>
+class UmfPackLU : internal::noncopyable
+{
+  public:
+    typedef _MatrixType MatrixType;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+    typedef Matrix<Scalar,Dynamic,1> Vector;
+    typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+    typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+    typedef SparseMatrix<Scalar> LUMatrixType;
+    typedef SparseMatrix<Scalar,ColMajor,int> UmfpackMatrixType;
+
+  public:
+
+    UmfPackLU() { init(); }
+
+    UmfPackLU(const MatrixType& matrix)
+    {
+      init();
+      compute(matrix);
+    }
+
+    ~UmfPackLU()
+    {
+      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
+      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
+    }
+
+    inline Index rows() const { return m_copyMatrix.rows(); }
+    inline Index cols() const { return m_copyMatrix.cols(); }
+
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful,
+      *          \c NumericalIssue if the matrix.appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return m_info;
+    }
+
+    inline const LUMatrixType& matrixL() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_l;
+    }
+
+    inline const LUMatrixType& matrixU() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_u;
+    }
+
+    inline const IntColVectorType& permutationP() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_p;
+    }
+
+    inline const IntRowVectorType& permutationQ() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_q;
+    }
+
+    /** Computes the sparse Cholesky decomposition of \a matrix 
+     *  Note that the matrix should be column-major, and in compressed format for best performance.
+     *  \sa SparseMatrix::makeCompressed().
+     */
+    template<typename InputMatrixType>
+    void compute(const InputMatrixType& matrix)
+    {
+      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
+      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
+      grapInput(matrix.derived());
+      analyzePattern_impl();
+      factorize_impl();
+    }
+
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<UmfPackLU, Rhs> solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "UmfPackLU is not initialized.");
+      eigen_assert(rows()==b.rows()
+                && "UmfPackLU::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::solve_retval<UmfPackLU, Rhs>(*this, b.derived());
+    }
+
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::sparse_solve_retval<UmfPackLU, Rhs> solve(const SparseMatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "UmfPackLU is not initialized.");
+      eigen_assert(rows()==b.rows()
+                && "UmfPackLU::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::sparse_solve_retval<UmfPackLU, Rhs>(*this, b.derived());
+    }
+
+    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
+      *
+      * \sa factorize(), compute()
+      */
+    template<typename InputMatrixType>
+    void analyzePattern(const InputMatrixType& matrix)
+    {
+      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
+      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
+      
+      grapInput(matrix.derived());
+
+      analyzePattern_impl();
+    }
+
+    /** Performs a numeric decomposition of \a matrix
+      *
+      * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed.
+      *
+      * \sa analyzePattern(), compute()
+      */
+    template<typename InputMatrixType>
+    void factorize(const InputMatrixType& matrix)
+    {
+      eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
+      if(m_numeric)
+        umfpack_free_numeric(&m_numeric,Scalar());
+
+      grapInput(matrix.derived());
+      
+      factorize_impl();
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal */
+    template<typename BDerived,typename XDerived>
+    bool _solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
+    #endif
+
+    Scalar determinant() const;
+
+    void extractData() const;
+
+  protected:
+
+    void init()
+    {
+      m_info                  = InvalidInput;
+      m_isInitialized         = false;
+      m_numeric               = 0;
+      m_symbolic              = 0;
+      m_outerIndexPtr         = 0;
+      m_innerIndexPtr         = 0;
+      m_valuePtr              = 0;
+      m_extractedDataAreDirty = true;
+    }
+    
+    template<typename InputMatrixType>
+    void grapInput_impl(const InputMatrixType& mat, internal::true_type)
+    {
+      m_copyMatrix.resize(mat.rows(), mat.cols());
+      if( ((MatrixType::Flags&RowMajorBit)==RowMajorBit) || sizeof(typename MatrixType::Index)!=sizeof(int) || !mat.isCompressed() )
+      {
+        // non supported input -> copy
+        m_copyMatrix = mat;
+        m_outerIndexPtr = m_copyMatrix.outerIndexPtr();
+        m_innerIndexPtr = m_copyMatrix.innerIndexPtr();
+        m_valuePtr      = m_copyMatrix.valuePtr();
+      }
+      else
+      {
+        m_outerIndexPtr = mat.outerIndexPtr();
+        m_innerIndexPtr = mat.innerIndexPtr();
+        m_valuePtr      = mat.valuePtr();
+      }
+    }
+    
+    template<typename InputMatrixType>
+    void grapInput_impl(const InputMatrixType& mat, internal::false_type)
+    {
+      m_copyMatrix = mat;
+      m_outerIndexPtr = m_copyMatrix.outerIndexPtr();
+      m_innerIndexPtr = m_copyMatrix.innerIndexPtr();
+      m_valuePtr      = m_copyMatrix.valuePtr();
+    }
+    
+    template<typename InputMatrixType>
+    void grapInput(const InputMatrixType& mat)
+    {
+      grapInput_impl(mat, internal::umfpack_helper_is_sparse_plain<InputMatrixType>());
+    }
+    
+    void analyzePattern_impl()
+    {
+      int errorCode = 0;
+      errorCode = umfpack_symbolic(m_copyMatrix.rows(), m_copyMatrix.cols(), m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
+                                   &m_symbolic, 0, 0);
+
+      m_isInitialized = true;
+      m_info = errorCode ? InvalidInput : Success;
+      m_analysisIsOk = true;
+      m_factorizationIsOk = false;
+      m_extractedDataAreDirty = true;
+    }
+    
+    void factorize_impl()
+    {
+      int errorCode;
+      errorCode = umfpack_numeric(m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
+                                  m_symbolic, &m_numeric, 0, 0);
+
+      m_info = errorCode ? NumericalIssue : Success;
+      m_factorizationIsOk = true;
+      m_extractedDataAreDirty = true;
+    }
+
+    // cached data to reduce reallocation, etc.
+    mutable LUMatrixType m_l;
+    mutable LUMatrixType m_u;
+    mutable IntColVectorType m_p;
+    mutable IntRowVectorType m_q;
+
+    UmfpackMatrixType m_copyMatrix;
+    const Scalar* m_valuePtr;
+    const int* m_outerIndexPtr;
+    const int* m_innerIndexPtr;
+    void* m_numeric;
+    void* m_symbolic;
+
+    mutable ComputationInfo m_info;
+    bool m_isInitialized;
+    int m_factorizationIsOk;
+    int m_analysisIsOk;
+    mutable bool m_extractedDataAreDirty;
+    
+  private:
+    UmfPackLU(UmfPackLU& ) { }
+};
+
+
+template<typename MatrixType>
+void UmfPackLU<MatrixType>::extractData() const
+{
+  if (m_extractedDataAreDirty)
+  {
+    // get size of the data
+    int lnz, unz, rows, cols, nz_udiag;
+    umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar());
+
+    // allocate data
+    m_l.resize(rows,(std::min)(rows,cols));
+    m_l.resizeNonZeros(lnz);
+
+    m_u.resize((std::min)(rows,cols),cols);
+    m_u.resizeNonZeros(unz);
+
+    m_p.resize(rows);
+    m_q.resize(cols);
+
+    // extract
+    umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(),
+                        m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(),
+                        m_p.data(), m_q.data(), 0, 0, 0, m_numeric);
+
+    m_extractedDataAreDirty = false;
+  }
+}
+
+template<typename MatrixType>
+typename UmfPackLU<MatrixType>::Scalar UmfPackLU<MatrixType>::determinant() const
+{
+  Scalar det;
+  umfpack_get_determinant(&det, 0, m_numeric, 0);
+  return det;
+}
+
+template<typename MatrixType>
+template<typename BDerived,typename XDerived>
+bool UmfPackLU<MatrixType>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const
+{
+  const int rhsCols = b.cols();
+  eigen_assert((BDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major rhs yet");
+  eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet");
+  eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve");
+  
+  int errorCode;
+  for (int j=0; j<rhsCols; ++j)
+  {
+    errorCode = umfpack_solve(UMFPACK_A,
+        m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
+        &x.col(j).coeffRef(0), &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0);
+    if (errorCode!=0)
+      return false;
+  }
+
+  return true;
+}
+
+
+namespace internal {
+
+template<typename _MatrixType, typename Rhs>
+struct solve_retval<UmfPackLU<_MatrixType>, Rhs>
+  : solve_retval_base<UmfPackLU<_MatrixType>, Rhs>
+{
+  typedef UmfPackLU<_MatrixType> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+template<typename _MatrixType, typename Rhs>
+struct sparse_solve_retval<UmfPackLU<_MatrixType>, Rhs>
+  : sparse_solve_retval_base<UmfPackLU<_MatrixType>, Rhs>
+{
+  typedef UmfPackLU<_MatrixType> Dec;
+  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    this->defaultEvalTo(dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_UMFPACKSUPPORT_H
diff --git a/include/eigen3/Eigen/src/misc/Image.h b/include/eigen3/Eigen/src/misc/Image.h
new file mode 100644
index 0000000..75c5f43
--- /dev/null
+++ b/include/eigen3/Eigen/src/misc/Image.h
@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MISC_IMAGE_H
+#define EIGEN_MISC_IMAGE_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \class image_retval_base
+  *
+  */
+template<typename DecompositionType>
+struct traits<image_retval_base<DecompositionType> >
+{
+  typedef typename DecompositionType::MatrixType MatrixType;
+  typedef Matrix<
+    typename MatrixType::Scalar,
+    MatrixType::RowsAtCompileTime, // the image is a subspace of the destination space, whose
+                                   // dimension is the number of rows of the original matrix
+    Dynamic,                       // we don't know at compile time the dimension of the image (the rank)
+    MatrixType::Options,
+    MatrixType::MaxRowsAtCompileTime, // the image matrix will consist of columns from the original matrix,
+    MatrixType::MaxColsAtCompileTime  // so it has the same number of rows and at most as many columns.
+  > ReturnType;
+};
+
+template<typename _DecompositionType> struct image_retval_base
+ : public ReturnByValue<image_retval_base<_DecompositionType> >
+{
+  typedef _DecompositionType DecompositionType;
+  typedef typename DecompositionType::MatrixType MatrixType;
+  typedef ReturnByValue<image_retval_base> Base;
+  typedef typename Base::Index Index;
+
+  image_retval_base(const DecompositionType& dec, const MatrixType& originalMatrix)
+    : m_dec(dec), m_rank(dec.rank()),
+      m_cols(m_rank == 0 ? 1 : m_rank),
+      m_originalMatrix(originalMatrix)
+  {}
+
+  inline Index rows() const { return m_dec.rows(); }
+  inline Index cols() const { return m_cols; }
+  inline Index rank() const { return m_rank; }
+  inline const DecompositionType& dec() const { return m_dec; }
+  inline const MatrixType& originalMatrix() const { return m_originalMatrix; }
+
+  template<typename Dest> inline void evalTo(Dest& dst) const
+  {
+    static_cast<const image_retval<DecompositionType>*>(this)->evalTo(dst);
+  }
+
+  protected:
+    const DecompositionType& m_dec;
+    Index m_rank, m_cols;
+    const MatrixType& m_originalMatrix;
+};
+
+} // end namespace internal
+
+#define EIGEN_MAKE_IMAGE_HELPERS(DecompositionType) \
+  typedef typename DecompositionType::MatrixType MatrixType; \
+  typedef typename MatrixType::Scalar Scalar; \
+  typedef typename MatrixType::RealScalar RealScalar; \
+  typedef typename MatrixType::Index Index; \
+  typedef Eigen::internal::image_retval_base<DecompositionType> Base; \
+  using Base::dec; \
+  using Base::originalMatrix; \
+  using Base::rank; \
+  using Base::rows; \
+  using Base::cols; \
+  image_retval(const DecompositionType& dec, const MatrixType& originalMatrix) \
+    : Base(dec, originalMatrix) {}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MISC_IMAGE_H
diff --git a/include/eigen3/Eigen/src/misc/Kernel.h b/include/eigen3/Eigen/src/misc/Kernel.h
new file mode 100644
index 0000000..b9e1518
--- /dev/null
+++ b/include/eigen3/Eigen/src/misc/Kernel.h
@@ -0,0 +1,81 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MISC_KERNEL_H
+#define EIGEN_MISC_KERNEL_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \class kernel_retval_base
+  *
+  */
+template<typename DecompositionType>
+struct traits<kernel_retval_base<DecompositionType> >
+{
+  typedef typename DecompositionType::MatrixType MatrixType;
+  typedef Matrix<
+    typename MatrixType::Scalar,
+    MatrixType::ColsAtCompileTime, // the number of rows in the "kernel matrix"
+                                   // is the number of cols of the original matrix
+                                   // so that the product "matrix * kernel = zero" makes sense
+    Dynamic,                       // we don't know at compile-time the dimension of the kernel
+    MatrixType::Options,
+    MatrixType::MaxColsAtCompileTime, // see explanation for 2nd template parameter
+    MatrixType::MaxColsAtCompileTime // the kernel is a subspace of the domain space,
+                                     // whose dimension is the number of columns of the original matrix
+  > ReturnType;
+};
+
+template<typename _DecompositionType> struct kernel_retval_base
+ : public ReturnByValue<kernel_retval_base<_DecompositionType> >
+{
+  typedef _DecompositionType DecompositionType;
+  typedef ReturnByValue<kernel_retval_base> Base;
+  typedef typename Base::Index Index;
+
+  kernel_retval_base(const DecompositionType& dec)
+    : m_dec(dec),
+      m_rank(dec.rank()),
+      m_cols(m_rank==dec.cols() ? 1 : dec.cols() - m_rank)
+  {}
+
+  inline Index rows() const { return m_dec.cols(); }
+  inline Index cols() const { return m_cols; }
+  inline Index rank() const { return m_rank; }
+  inline const DecompositionType& dec() const { return m_dec; }
+
+  template<typename Dest> inline void evalTo(Dest& dst) const
+  {
+    static_cast<const kernel_retval<DecompositionType>*>(this)->evalTo(dst);
+  }
+
+  protected:
+    const DecompositionType& m_dec;
+    Index m_rank, m_cols;
+};
+
+} // end namespace internal
+
+#define EIGEN_MAKE_KERNEL_HELPERS(DecompositionType) \
+  typedef typename DecompositionType::MatrixType MatrixType; \
+  typedef typename MatrixType::Scalar Scalar; \
+  typedef typename MatrixType::RealScalar RealScalar; \
+  typedef typename MatrixType::Index Index; \
+  typedef Eigen::internal::kernel_retval_base<DecompositionType> Base; \
+  using Base::dec; \
+  using Base::rank; \
+  using Base::rows; \
+  using Base::cols; \
+  kernel_retval(const DecompositionType& dec) : Base(dec) {}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MISC_KERNEL_H
diff --git a/include/eigen3/Eigen/src/misc/Solve.h b/include/eigen3/Eigen/src/misc/Solve.h
new file mode 100644
index 0000000..7f70d60
--- /dev/null
+++ b/include/eigen3/Eigen/src/misc/Solve.h
@@ -0,0 +1,76 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MISC_SOLVE_H
+#define EIGEN_MISC_SOLVE_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \class solve_retval_base
+  *
+  */
+template<typename DecompositionType, typename Rhs>
+struct traits<solve_retval_base<DecompositionType, Rhs> >
+{
+  typedef typename DecompositionType::MatrixType MatrixType;
+  typedef Matrix<typename Rhs::Scalar,
+                 MatrixType::ColsAtCompileTime,
+                 Rhs::ColsAtCompileTime,
+                 Rhs::PlainObject::Options,
+                 MatrixType::MaxColsAtCompileTime,
+                 Rhs::MaxColsAtCompileTime> ReturnType;
+};
+
+template<typename _DecompositionType, typename Rhs> struct solve_retval_base
+ : public ReturnByValue<solve_retval_base<_DecompositionType, Rhs> >
+{
+  typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
+  typedef _DecompositionType DecompositionType;
+  typedef ReturnByValue<solve_retval_base> Base;
+  typedef typename Base::Index Index;
+
+  solve_retval_base(const DecompositionType& dec, const Rhs& rhs)
+    : m_dec(dec), m_rhs(rhs)
+  {}
+
+  inline Index rows() const { return m_dec.cols(); }
+  inline Index cols() const { return m_rhs.cols(); }
+  inline const DecompositionType& dec() const { return m_dec; }
+  inline const RhsNestedCleaned& rhs() const { return m_rhs; }
+
+  template<typename Dest> inline void evalTo(Dest& dst) const
+  {
+    static_cast<const solve_retval<DecompositionType,Rhs>*>(this)->evalTo(dst);
+  }
+
+  protected:
+    const DecompositionType& m_dec;
+    typename Rhs::Nested m_rhs;
+};
+
+} // end namespace internal
+
+#define EIGEN_MAKE_SOLVE_HELPERS(DecompositionType,Rhs) \
+  typedef typename DecompositionType::MatrixType MatrixType; \
+  typedef typename MatrixType::Scalar Scalar; \
+  typedef typename MatrixType::RealScalar RealScalar; \
+  typedef typename MatrixType::Index Index; \
+  typedef Eigen::internal::solve_retval_base<DecompositionType,Rhs> Base; \
+  using Base::dec; \
+  using Base::rhs; \
+  using Base::rows; \
+  using Base::cols; \
+  solve_retval(const DecompositionType& dec, const Rhs& rhs) \
+    : Base(dec, rhs) {}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MISC_SOLVE_H
diff --git a/include/eigen3/Eigen/src/misc/SparseSolve.h b/include/eigen3/Eigen/src/misc/SparseSolve.h
new file mode 100644
index 0000000..244bb8e
--- /dev/null
+++ b/include/eigen3/Eigen/src/misc/SparseSolve.h
@@ -0,0 +1,128 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_SOLVE_H
+#define EIGEN_SPARSE_SOLVE_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename _DecompositionType, typename Rhs> struct sparse_solve_retval_base;
+template<typename _DecompositionType, typename Rhs> struct sparse_solve_retval;
+  
+template<typename DecompositionType, typename Rhs>
+struct traits<sparse_solve_retval_base<DecompositionType, Rhs> >
+{
+  typedef typename DecompositionType::MatrixType MatrixType;
+  typedef SparseMatrix<typename Rhs::Scalar, Rhs::Options, typename Rhs::Index> ReturnType;
+};
+
+template<typename _DecompositionType, typename Rhs> struct sparse_solve_retval_base
+ : public ReturnByValue<sparse_solve_retval_base<_DecompositionType, Rhs> >
+{
+  typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
+  typedef _DecompositionType DecompositionType;
+  typedef ReturnByValue<sparse_solve_retval_base> Base;
+  typedef typename Base::Index Index;
+
+  sparse_solve_retval_base(const DecompositionType& dec, const Rhs& rhs)
+    : m_dec(dec), m_rhs(rhs)
+  {}
+
+  inline Index rows() const { return m_dec.cols(); }
+  inline Index cols() const { return m_rhs.cols(); }
+  inline const DecompositionType& dec() const { return m_dec; }
+  inline const RhsNestedCleaned& rhs() const { return m_rhs; }
+
+  template<typename Dest> inline void evalTo(Dest& dst) const
+  {
+    static_cast<const sparse_solve_retval<DecompositionType,Rhs>*>(this)->evalTo(dst);
+  }
+
+  protected:
+    template<typename DestScalar, int DestOptions, typename DestIndex>
+    inline void defaultEvalTo(SparseMatrix<DestScalar,DestOptions,DestIndex>& dst) const
+    {
+      // we process the sparse rhs per block of NbColsAtOnce columns temporarily stored into a dense matrix.
+      static const int NbColsAtOnce = 4;
+      int rhsCols = m_rhs.cols();
+      int size = m_rhs.rows();
+      Eigen::Matrix<DestScalar,Dynamic,Dynamic> tmp(size,rhsCols);
+      Eigen::Matrix<DestScalar,Dynamic,Dynamic> tmpX(size,rhsCols);
+      for(int k=0; k<rhsCols; k+=NbColsAtOnce)
+      {
+        int actualCols = std::min<int>(rhsCols-k, NbColsAtOnce);
+        tmp.leftCols(actualCols) = m_rhs.middleCols(k,actualCols);
+        tmpX.leftCols(actualCols) = m_dec.solve(tmp.leftCols(actualCols));
+        dst.middleCols(k,actualCols) = tmpX.leftCols(actualCols).sparseView();
+      }
+    }
+    const DecompositionType& m_dec;
+    typename Rhs::Nested m_rhs;
+};
+
+#define EIGEN_MAKE_SPARSE_SOLVE_HELPERS(DecompositionType,Rhs) \
+  typedef typename DecompositionType::MatrixType MatrixType; \
+  typedef typename MatrixType::Scalar Scalar; \
+  typedef typename MatrixType::RealScalar RealScalar; \
+  typedef typename MatrixType::Index Index; \
+  typedef Eigen::internal::sparse_solve_retval_base<DecompositionType,Rhs> Base; \
+  using Base::dec; \
+  using Base::rhs; \
+  using Base::rows; \
+  using Base::cols; \
+  sparse_solve_retval(const DecompositionType& dec, const Rhs& rhs) \
+    : Base(dec, rhs) {}
+
+
+
+template<typename DecompositionType, typename Rhs, typename Guess> struct solve_retval_with_guess;
+
+template<typename DecompositionType, typename Rhs, typename Guess>
+struct traits<solve_retval_with_guess<DecompositionType, Rhs, Guess> >
+{
+  typedef typename DecompositionType::MatrixType MatrixType;
+  typedef Matrix<typename Rhs::Scalar,
+                 MatrixType::ColsAtCompileTime,
+                 Rhs::ColsAtCompileTime,
+                 Rhs::PlainObject::Options,
+                 MatrixType::MaxColsAtCompileTime,
+                 Rhs::MaxColsAtCompileTime> ReturnType;
+};
+
+template<typename DecompositionType, typename Rhs, typename Guess> struct solve_retval_with_guess
+ : public ReturnByValue<solve_retval_with_guess<DecompositionType, Rhs, Guess> >
+{
+  typedef typename DecompositionType::Index Index;
+
+  solve_retval_with_guess(const DecompositionType& dec, const Rhs& rhs, const Guess& guess)
+    : m_dec(dec), m_rhs(rhs), m_guess(guess)
+  {}
+
+  inline Index rows() const { return m_dec.cols(); }
+  inline Index cols() const { return m_rhs.cols(); }
+
+  template<typename Dest> inline void evalTo(Dest& dst) const
+  {
+    dst = m_guess;
+    m_dec._solveWithGuess(m_rhs,dst);
+  }
+
+  protected:
+    const DecompositionType& m_dec;
+    const typename Rhs::Nested m_rhs;
+    const typename Guess::Nested m_guess;
+};
+
+} // namepsace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_SOLVE_H
diff --git a/include/eigen3/Eigen/src/misc/blas.h b/include/eigen3/Eigen/src/misc/blas.h
new file mode 100644
index 0000000..6fce99e
--- /dev/null
+++ b/include/eigen3/Eigen/src/misc/blas.h
@@ -0,0 +1,658 @@
+#ifndef BLAS_H
+#define BLAS_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#define BLASFUNC(FUNC) FUNC##_
+
+#ifdef __WIN64__
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+int    BLASFUNC(xerbla)(const char *, int *info, int);
+
+float  BLASFUNC(sdot)  (int *, float  *, int *, float  *, int *);
+float  BLASFUNC(sdsdot)(int *, float  *,        float  *, int *, float  *, int *);
+
+double BLASFUNC(dsdot) (int *, float  *, int *, float  *, int *);
+double BLASFUNC(ddot)  (int *, double *, int *, double *, int *);
+double BLASFUNC(qdot)  (int *, double *, int *, double *, int *);
+
+int  BLASFUNC(cdotuw)  (int *, float  *, int *, float  *, int *, float*);
+int  BLASFUNC(cdotcw)  (int *, float  *, int *, float  *, int *, float*);
+int  BLASFUNC(zdotuw)  (int *, double  *, int *, double  *, int *, double*);
+int  BLASFUNC(zdotcw)  (int *, double  *, int *, double  *, int *, double*);
+
+int    BLASFUNC(saxpy) (int *, float  *, float  *, int *, float  *, int *);
+int    BLASFUNC(daxpy) (int *, double *, double *, int *, double *, int *);
+int    BLASFUNC(qaxpy) (int *, double *, double *, int *, double *, int *);
+int    BLASFUNC(caxpy) (int *, float  *, float  *, int *, float  *, int *);
+int    BLASFUNC(zaxpy) (int *, double *, double *, int *, double *, int *);
+int    BLASFUNC(xaxpy) (int *, double *, double *, int *, double *, int *);
+int    BLASFUNC(caxpyc)(int *, float  *, float  *, int *, float  *, int *);
+int    BLASFUNC(zaxpyc)(int *, double *, double *, int *, double *, int *);
+int    BLASFUNC(xaxpyc)(int *, double *, double *, int *, double *, int *);
+
+int    BLASFUNC(scopy) (int *, float  *, int *, float  *, int *);
+int    BLASFUNC(dcopy) (int *, double *, int *, double *, int *);
+int    BLASFUNC(qcopy) (int *, double *, int *, double *, int *);
+int    BLASFUNC(ccopy) (int *, float  *, int *, float  *, int *);
+int    BLASFUNC(zcopy) (int *, double *, int *, double *, int *);
+int    BLASFUNC(xcopy) (int *, double *, int *, double *, int *);
+
+int    BLASFUNC(sswap) (int *, float  *, int *, float  *, int *);
+int    BLASFUNC(dswap) (int *, double *, int *, double *, int *);
+int    BLASFUNC(qswap) (int *, double *, int *, double *, int *);
+int    BLASFUNC(cswap) (int *, float  *, int *, float  *, int *);
+int    BLASFUNC(zswap) (int *, double *, int *, double *, int *);
+int    BLASFUNC(xswap) (int *, double *, int *, double *, int *);
+
+float  BLASFUNC(sasum) (int *, float  *, int *);
+float  BLASFUNC(scasum)(int *, float  *, int *);
+double BLASFUNC(dasum) (int *, double *, int *);
+double BLASFUNC(qasum) (int *, double *, int *);
+double BLASFUNC(dzasum)(int *, double *, int *);
+double BLASFUNC(qxasum)(int *, double *, int *);
+
+int    BLASFUNC(isamax)(int *, float  *, int *);
+int    BLASFUNC(idamax)(int *, double *, int *);
+int    BLASFUNC(iqamax)(int *, double *, int *);
+int    BLASFUNC(icamax)(int *, float  *, int *);
+int    BLASFUNC(izamax)(int *, double *, int *);
+int    BLASFUNC(ixamax)(int *, double *, int *);
+
+int    BLASFUNC(ismax) (int *, float  *, int *);
+int    BLASFUNC(idmax) (int *, double *, int *);
+int    BLASFUNC(iqmax) (int *, double *, int *);
+int    BLASFUNC(icmax) (int *, float  *, int *);
+int    BLASFUNC(izmax) (int *, double *, int *);
+int    BLASFUNC(ixmax) (int *, double *, int *);
+
+int    BLASFUNC(isamin)(int *, float  *, int *);
+int    BLASFUNC(idamin)(int *, double *, int *);
+int    BLASFUNC(iqamin)(int *, double *, int *);
+int    BLASFUNC(icamin)(int *, float  *, int *);
+int    BLASFUNC(izamin)(int *, double *, int *);
+int    BLASFUNC(ixamin)(int *, double *, int *);
+
+int    BLASFUNC(ismin)(int *, float  *, int *);
+int    BLASFUNC(idmin)(int *, double *, int *);
+int    BLASFUNC(iqmin)(int *, double *, int *);
+int    BLASFUNC(icmin)(int *, float  *, int *);
+int    BLASFUNC(izmin)(int *, double *, int *);
+int    BLASFUNC(ixmin)(int *, double *, int *);
+
+float  BLASFUNC(samax) (int *, float  *, int *);
+double BLASFUNC(damax) (int *, double *, int *);
+double BLASFUNC(qamax) (int *, double *, int *);
+float  BLASFUNC(scamax)(int *, float  *, int *);
+double BLASFUNC(dzamax)(int *, double *, int *);
+double BLASFUNC(qxamax)(int *, double *, int *);
+
+float  BLASFUNC(samin) (int *, float  *, int *);
+double BLASFUNC(damin) (int *, double *, int *);
+double BLASFUNC(qamin) (int *, double *, int *);
+float  BLASFUNC(scamin)(int *, float  *, int *);
+double BLASFUNC(dzamin)(int *, double *, int *);
+double BLASFUNC(qxamin)(int *, double *, int *);
+
+float  BLASFUNC(smax)  (int *, float  *, int *);
+double BLASFUNC(dmax)  (int *, double *, int *);
+double BLASFUNC(qmax)  (int *, double *, int *);
+float  BLASFUNC(scmax) (int *, float  *, int *);
+double BLASFUNC(dzmax) (int *, double *, int *);
+double BLASFUNC(qxmax) (int *, double *, int *);
+
+float  BLASFUNC(smin)  (int *, float  *, int *);
+double BLASFUNC(dmin)  (int *, double *, int *);
+double BLASFUNC(qmin)  (int *, double *, int *);
+float  BLASFUNC(scmin) (int *, float  *, int *);
+double BLASFUNC(dzmin) (int *, double *, int *);
+double BLASFUNC(qxmin) (int *, double *, int *);
+
+int    BLASFUNC(sscal) (int *,  float  *, float  *, int *);
+int    BLASFUNC(dscal) (int *,  double *, double *, int *);
+int    BLASFUNC(qscal) (int *,  double *, double *, int *);
+int    BLASFUNC(cscal) (int *,  float  *, float  *, int *);
+int    BLASFUNC(zscal) (int *,  double *, double *, int *);
+int    BLASFUNC(xscal) (int *,  double *, double *, int *);
+int    BLASFUNC(csscal)(int *,  float  *, float  *, int *);
+int    BLASFUNC(zdscal)(int *,  double *, double *, int *);
+int    BLASFUNC(xqscal)(int *,  double *, double *, int *);
+
+float  BLASFUNC(snrm2) (int *, float  *, int *);
+float  BLASFUNC(scnrm2)(int *, float  *, int *);
+
+double BLASFUNC(dnrm2) (int *, double *, int *);
+double BLASFUNC(qnrm2) (int *, double *, int *);
+double BLASFUNC(dznrm2)(int *, double *, int *);
+double BLASFUNC(qxnrm2)(int *, double *, int *);
+
+int    BLASFUNC(srot)  (int *, float  *, int *, float  *, int *, float  *, float  *);
+int    BLASFUNC(drot)  (int *, double *, int *, double *, int *, double *, double *);
+int    BLASFUNC(qrot)  (int *, double *, int *, double *, int *, double *, double *);
+int    BLASFUNC(csrot) (int *, float  *, int *, float  *, int *, float  *, float  *);
+int    BLASFUNC(zdrot) (int *, double *, int *, double *, int *, double *, double *);
+int    BLASFUNC(xqrot) (int *, double *, int *, double *, int *, double *, double *);
+
+int    BLASFUNC(srotg) (float  *, float  *, float  *, float  *);
+int    BLASFUNC(drotg) (double *, double *, double *, double *);
+int    BLASFUNC(qrotg) (double *, double *, double *, double *);
+int    BLASFUNC(crotg) (float  *, float  *, float  *, float  *);
+int    BLASFUNC(zrotg) (double *, double *, double *, double *);
+int    BLASFUNC(xrotg) (double *, double *, double *, double *);
+
+int    BLASFUNC(srotmg)(float  *, float  *, float  *, float  *, float  *);
+int    BLASFUNC(drotmg)(double *, double *, double *, double *, double *);
+
+int    BLASFUNC(srotm) (int *, float  *, int *, float  *, int *, float  *);
+int    BLASFUNC(drotm) (int *, double *, int *, double *, int *, double *);
+int    BLASFUNC(qrotm) (int *, double *, int *, double *, int *, double *);
+
+/* Level 2 routines */
+
+int BLASFUNC(sger)(int *,    int *, float *,  float *, int *,
+		   float *,  int *, float *,  int *);
+int BLASFUNC(dger)(int *,    int *, double *, double *, int *,
+		   double *, int *, double *, int *);
+int BLASFUNC(qger)(int *,    int *, double *, double *, int *,
+		   double *, int *, double *, int *);
+int BLASFUNC(cgeru)(int *,    int *, float *,  float *, int *,
+		    float *,  int *, float *,  int *);
+int BLASFUNC(cgerc)(int *,    int *, float *,  float *, int *,
+		    float *,  int *, float *,  int *);
+int BLASFUNC(zgeru)(int *,    int *, double *, double *, int *,
+		    double *, int *, double *, int *);
+int BLASFUNC(zgerc)(int *,    int *, double *, double *, int *,
+		    double *, int *, double *, int *);
+int BLASFUNC(xgeru)(int *,    int *, double *, double *, int *,
+		    double *, int *, double *, int *);
+int BLASFUNC(xgerc)(int *,    int *, double *, double *, int *,
+		    double *, int *, double *, int *);
+
+int BLASFUNC(sgemv)(char *, int *, int *, float  *, float  *, int *,
+		    float  *, int *, float  *, float  *, int *);
+int BLASFUNC(dgemv)(char *, int *, int *, double *, double *, int *,
+		    double *, int *, double *, double *, int *);
+int BLASFUNC(qgemv)(char *, int *, int *, double *, double *, int *,
+		    double *, int *, double *, double *, int *);
+int BLASFUNC(cgemv)(char *, int *, int *, float  *, float  *, int *,
+		    float  *, int *, float  *, float  *, int *);
+int BLASFUNC(zgemv)(char *, int *, int *, double *, double *, int *,
+		    double *, int *, double *, double *, int *);
+int BLASFUNC(xgemv)(char *, int *, int *, double *, double *, int *,
+		    double *, int *, double *, double *, int *);
+
+int BLASFUNC(strsv) (char *, char *, char *, int *, float  *, int *,
+		     float  *, int *);
+int BLASFUNC(dtrsv) (char *, char *, char *, int *, double *, int *,
+		     double *, int *);
+int BLASFUNC(qtrsv) (char *, char *, char *, int *, double *, int *,
+		     double *, int *);
+int BLASFUNC(ctrsv) (char *, char *, char *, int *, float  *, int *,
+		     float  *, int *);
+int BLASFUNC(ztrsv) (char *, char *, char *, int *, double *, int *,
+		     double *, int *);
+int BLASFUNC(xtrsv) (char *, char *, char *, int *, double *, int *,
+		     double *, int *);
+
+int BLASFUNC(stpsv) (char *, char *, char *, int *, float  *, float  *, int *);
+int BLASFUNC(dtpsv) (char *, char *, char *, int *, double *, double *, int *);
+int BLASFUNC(qtpsv) (char *, char *, char *, int *, double *, double *, int *);
+int BLASFUNC(ctpsv) (char *, char *, char *, int *, float  *, float  *, int *);
+int BLASFUNC(ztpsv) (char *, char *, char *, int *, double *, double *, int *);
+int BLASFUNC(xtpsv) (char *, char *, char *, int *, double *, double *, int *);
+
+int BLASFUNC(strmv) (char *, char *, char *, int *, float  *, int *,
+		     float  *, int *);
+int BLASFUNC(dtrmv) (char *, char *, char *, int *, double *, int *,
+		     double *, int *);
+int BLASFUNC(qtrmv) (char *, char *, char *, int *, double *, int *,
+		     double *, int *);
+int BLASFUNC(ctrmv) (char *, char *, char *, int *, float  *, int *,
+		     float  *, int *);
+int BLASFUNC(ztrmv) (char *, char *, char *, int *, double *, int *,
+		     double *, int *);
+int BLASFUNC(xtrmv) (char *, char *, char *, int *, double *, int *,
+		     double *, int *);
+
+int BLASFUNC(stpmv) (char *, char *, char *, int *, float  *, float  *, int *);
+int BLASFUNC(dtpmv) (char *, char *, char *, int *, double *, double *, int *);
+int BLASFUNC(qtpmv) (char *, char *, char *, int *, double *, double *, int *);
+int BLASFUNC(ctpmv) (char *, char *, char *, int *, float  *, float  *, int *);
+int BLASFUNC(ztpmv) (char *, char *, char *, int *, double *, double *, int *);
+int BLASFUNC(xtpmv) (char *, char *, char *, int *, double *, double *, int *);
+
+int BLASFUNC(stbmv) (char *, char *, char *, int *, int *, float  *, int *, float  *, int *);
+int BLASFUNC(dtbmv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
+int BLASFUNC(qtbmv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
+int BLASFUNC(ctbmv) (char *, char *, char *, int *, int *, float  *, int *, float  *, int *);
+int BLASFUNC(ztbmv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
+int BLASFUNC(xtbmv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
+
+int BLASFUNC(stbsv) (char *, char *, char *, int *, int *, float  *, int *, float  *, int *);
+int BLASFUNC(dtbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
+int BLASFUNC(qtbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
+int BLASFUNC(ctbsv) (char *, char *, char *, int *, int *, float  *, int *, float  *, int *);
+int BLASFUNC(ztbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
+int BLASFUNC(xtbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
+
+int BLASFUNC(ssymv) (char *, int *, float  *, float *, int *,
+		     float  *, int *, float *, float *, int *);
+int BLASFUNC(dsymv) (char *, int *, double  *, double *, int *,
+		     double  *, int *, double *, double *, int *);
+int BLASFUNC(qsymv) (char *, int *, double  *, double *, int *,
+		     double  *, int *, double *, double *, int *);
+int BLASFUNC(csymv) (char *, int *, float  *, float *, int *,
+		     float  *, int *, float *, float *, int *);
+int BLASFUNC(zsymv) (char *, int *, double  *, double *, int *,
+		     double  *, int *, double *, double *, int *);
+int BLASFUNC(xsymv) (char *, int *, double  *, double *, int *,
+		     double  *, int *, double *, double *, int *);
+
+int BLASFUNC(sspmv) (char *, int *, float  *, float *,
+		     float  *, int *, float *, float *, int *);
+int BLASFUNC(dspmv) (char *, int *, double  *, double *,
+		     double  *, int *, double *, double *, int *);
+int BLASFUNC(qspmv) (char *, int *, double  *, double *,
+		     double  *, int *, double *, double *, int *);
+int BLASFUNC(cspmv) (char *, int *, float  *, float *,
+		     float  *, int *, float *, float *, int *);
+int BLASFUNC(zspmv) (char *, int *, double  *, double *,
+		     double  *, int *, double *, double *, int *);
+int BLASFUNC(xspmv) (char *, int *, double  *, double *,
+		     double  *, int *, double *, double *, int *);
+
+int BLASFUNC(ssyr) (char *, int *, float   *, float  *, int *,
+		    float  *, int *);
+int BLASFUNC(dsyr) (char *, int *, double  *, double *, int *,
+		    double *, int *);
+int BLASFUNC(qsyr) (char *, int *, double  *, double *, int *,
+		    double *, int *);
+int BLASFUNC(csyr) (char *, int *, float   *, float  *, int *,
+		    float  *, int *);
+int BLASFUNC(zsyr) (char *, int *, double  *, double *, int *,
+		    double *, int *);
+int BLASFUNC(xsyr) (char *, int *, double  *, double *, int *,
+		    double *, int *);
+
+int BLASFUNC(ssyr2) (char *, int *, float   *,
+		     float  *, int *, float  *, int *, float  *, int *);
+int BLASFUNC(dsyr2) (char *, int *, double  *,
+		     double *, int *, double *, int *, double *, int *);
+int BLASFUNC(qsyr2) (char *, int *, double  *,
+		     double *, int *, double *, int *, double *, int *);
+int BLASFUNC(csyr2) (char *, int *, float   *,
+		     float  *, int *, float  *, int *, float  *, int *);
+int BLASFUNC(zsyr2) (char *, int *, double  *,
+		     double *, int *, double *, int *, double *, int *);
+int BLASFUNC(xsyr2) (char *, int *, double  *,
+		     double *, int *, double *, int *, double *, int *);
+
+int BLASFUNC(sspr) (char *, int *, float   *, float  *, int *,
+		    float  *);
+int BLASFUNC(dspr) (char *, int *, double  *, double *, int *,
+		    double *);
+int BLASFUNC(qspr) (char *, int *, double  *, double *, int *,
+		    double *);
+int BLASFUNC(cspr) (char *, int *, float   *, float  *, int *,
+		    float  *);
+int BLASFUNC(zspr) (char *, int *, double  *, double *, int *,
+		    double *);
+int BLASFUNC(xspr) (char *, int *, double  *, double *, int *,
+		    double *);
+
+int BLASFUNC(sspr2) (char *, int *, float   *,
+		     float  *, int *, float  *, int *, float  *);
+int BLASFUNC(dspr2) (char *, int *, double  *,
+		     double *, int *, double *, int *, double *);
+int BLASFUNC(qspr2) (char *, int *, double  *,
+		     double *, int *, double *, int *, double *);
+int BLASFUNC(cspr2) (char *, int *, float   *,
+		     float  *, int *, float  *, int *, float  *);
+int BLASFUNC(zspr2) (char *, int *, double  *,
+		     double *, int *, double *, int *, double *);
+int BLASFUNC(xspr2) (char *, int *, double  *,
+		     double *, int *, double *, int *, double *);
+
+int BLASFUNC(cher) (char *, int *, float   *, float  *, int *,
+		    float  *, int *);
+int BLASFUNC(zher) (char *, int *, double  *, double *, int *,
+		    double *, int *);
+int BLASFUNC(xher) (char *, int *, double  *, double *, int *,
+		    double *, int *);
+
+int BLASFUNC(chpr) (char *, int *, float   *, float  *, int *, float  *);
+int BLASFUNC(zhpr) (char *, int *, double  *, double *, int *, double *);
+int BLASFUNC(xhpr) (char *, int *, double  *, double *, int *, double *);
+
+int BLASFUNC(cher2) (char *, int *, float   *,
+		     float  *, int *, float  *, int *, float  *, int *);
+int BLASFUNC(zher2) (char *, int *, double  *,
+		     double *, int *, double *, int *, double *, int *);
+int BLASFUNC(xher2) (char *, int *, double  *,
+		     double *, int *, double *, int *, double *, int *);
+
+int BLASFUNC(chpr2) (char *, int *, float   *,
+		     float  *, int *, float  *, int *, float  *);
+int BLASFUNC(zhpr2) (char *, int *, double  *,
+		     double *, int *, double *, int *, double *);
+int BLASFUNC(xhpr2) (char *, int *, double  *,
+		     double *, int *, double *, int *, double *);
+
+int BLASFUNC(chemv) (char *, int *, float  *, float *, int *,
+		     float  *, int *, float *, float *, int *);
+int BLASFUNC(zhemv) (char *, int *, double  *, double *, int *,
+		     double  *, int *, double *, double *, int *);
+int BLASFUNC(xhemv) (char *, int *, double  *, double *, int *,
+		     double  *, int *, double *, double *, int *);
+
+int BLASFUNC(chpmv) (char *, int *, float  *, float *,
+		     float  *, int *, float *, float *, int *);
+int BLASFUNC(zhpmv) (char *, int *, double  *, double *,
+		     double  *, int *, double *, double *, int *);
+int BLASFUNC(xhpmv) (char *, int *, double  *, double *,
+		     double  *, int *, double *, double *, int *);
+
+int BLASFUNC(snorm)(char *, int *, int *, float  *, int *);
+int BLASFUNC(dnorm)(char *, int *, int *, double *, int *);
+int BLASFUNC(cnorm)(char *, int *, int *, float  *, int *);
+int BLASFUNC(znorm)(char *, int *, int *, double *, int *);
+
+int BLASFUNC(sgbmv)(char *, int *, int *, int *, int *, float  *, float  *, int *,
+		    float  *, int *, float  *, float  *, int *);
+int BLASFUNC(dgbmv)(char *, int *, int *, int *, int *, double *, double *, int *,
+		    double *, int *, double *, double *, int *);
+int BLASFUNC(qgbmv)(char *, int *, int *, int *, int *, double *, double *, int *,
+		    double *, int *, double *, double *, int *);
+int BLASFUNC(cgbmv)(char *, int *, int *, int *, int *, float  *, float  *, int *,
+		    float  *, int *, float  *, float  *, int *);
+int BLASFUNC(zgbmv)(char *, int *, int *, int *, int *, double *, double *, int *,
+		    double *, int *, double *, double *, int *);
+int BLASFUNC(xgbmv)(char *, int *, int *, int *, int *, double *, double *, int *,
+		    double *, int *, double *, double *, int *);
+
+int BLASFUNC(ssbmv)(char *, int *, int *, float  *, float  *, int *,
+		    float  *, int *, float  *, float  *, int *);
+int BLASFUNC(dsbmv)(char *, int *, int *, double *, double *, int *,
+		    double *, int *, double *, double *, int *);
+int BLASFUNC(qsbmv)(char *, int *, int *, double *, double *, int *,
+		    double *, int *, double *, double *, int *);
+int BLASFUNC(csbmv)(char *, int *, int *, float  *, float  *, int *,
+		    float  *, int *, float  *, float  *, int *);
+int BLASFUNC(zsbmv)(char *, int *, int *, double *, double *, int *,
+		    double *, int *, double *, double *, int *);
+int BLASFUNC(xsbmv)(char *, int *, int *, double *, double *, int *,
+		    double *, int *, double *, double *, int *);
+
+int BLASFUNC(chbmv)(char *, int *, int *, float  *, float  *, int *,
+		    float  *, int *, float  *, float  *, int *);
+int BLASFUNC(zhbmv)(char *, int *, int *, double *, double *, int *,
+		    double *, int *, double *, double *, int *);
+int BLASFUNC(xhbmv)(char *, int *, int *, double *, double *, int *,
+		    double *, int *, double *, double *, int *);
+
+/* Level 3 routines */
+
+int BLASFUNC(sgemm)(char *, char *, int *, int *, int *, float *,
+	   float  *, int *, float  *, int *, float  *, float  *, int *);
+int BLASFUNC(dgemm)(char *, char *, int *, int *, int *, double *,
+	   double *, int *, double *, int *, double *, double *, int *);
+int BLASFUNC(qgemm)(char *, char *, int *, int *, int *, double *,
+	   double *, int *, double *, int *, double *, double *, int *);
+int BLASFUNC(cgemm)(char *, char *, int *, int *, int *, float *,
+	   float  *, int *, float  *, int *, float  *, float  *, int *);
+int BLASFUNC(zgemm)(char *, char *, int *, int *, int *, double *,
+	   double *, int *, double *, int *, double *, double *, int *);
+int BLASFUNC(xgemm)(char *, char *, int *, int *, int *, double *,
+	   double *, int *, double *, int *, double *, double *, int *);
+
+int BLASFUNC(cgemm3m)(char *, char *, int *, int *, int *, float *,
+	   float  *, int *, float  *, int *, float  *, float  *, int *);
+int BLASFUNC(zgemm3m)(char *, char *, int *, int *, int *, double *,
+	   double *, int *, double *, int *, double *, double *, int *);
+int BLASFUNC(xgemm3m)(char *, char *, int *, int *, int *, double *,
+	   double *, int *, double *, int *, double *, double *, int *);
+
+int BLASFUNC(sge2mm)(char *, char *, char *, int *, int *,
+		     float *, float  *, int *, float  *, int *,
+		     float *, float  *, int *);
+int BLASFUNC(dge2mm)(char *, char *, char *, int *, int *,
+		     double *, double  *, int *, double  *, int *,
+		     double *, double  *, int *);
+int BLASFUNC(cge2mm)(char *, char *, char *, int *, int *,
+		     float *, float  *, int *, float  *, int *,
+		     float *, float  *, int *);
+int BLASFUNC(zge2mm)(char *, char *, char *, int *, int *,
+		     double *, double  *, int *, double  *, int *,
+		     double *, double  *, int *);
+
+int BLASFUNC(strsm)(char *, char *, char *, char *, int *, int *,
+	   float *,  float *, int *, float *, int *);
+int BLASFUNC(dtrsm)(char *, char *, char *, char *, int *, int *,
+	   double *,  double *, int *, double *, int *);
+int BLASFUNC(qtrsm)(char *, char *, char *, char *, int *, int *,
+	   double *,  double *, int *, double *, int *);
+int BLASFUNC(ctrsm)(char *, char *, char *, char *, int *, int *,
+	   float *,  float *, int *, float *, int *);
+int BLASFUNC(ztrsm)(char *, char *, char *, char *, int *, int *,
+	   double *,  double *, int *, double *, int *);
+int BLASFUNC(xtrsm)(char *, char *, char *, char *, int *, int *,
+	   double *,  double *, int *, double *, int *);
+
+int BLASFUNC(strmm)(char *, char *, char *, char *, int *, int *,
+	   float *,  float *, int *, float *, int *);
+int BLASFUNC(dtrmm)(char *, char *, char *, char *, int *, int *,
+	   double *,  double *, int *, double *, int *);
+int BLASFUNC(qtrmm)(char *, char *, char *, char *, int *, int *,
+	   double *,  double *, int *, double *, int *);
+int BLASFUNC(ctrmm)(char *, char *, char *, char *, int *, int *,
+	   float *,  float *, int *, float *, int *);
+int BLASFUNC(ztrmm)(char *, char *, char *, char *, int *, int *,
+	   double *,  double *, int *, double *, int *);
+int BLASFUNC(xtrmm)(char *, char *, char *, char *, int *, int *,
+	   double *,  double *, int *, double *, int *);
+
+int BLASFUNC(ssymm)(char *, char *, int *, int *, float  *, float  *, int *,
+	   float  *, int *, float  *, float  *, int *);
+int BLASFUNC(dsymm)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, int *, double *, double *, int *);
+int BLASFUNC(qsymm)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, int *, double *, double *, int *);
+int BLASFUNC(csymm)(char *, char *, int *, int *, float  *, float  *, int *,
+	   float  *, int *, float  *, float  *, int *);
+int BLASFUNC(zsymm)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, int *, double *, double *, int *);
+int BLASFUNC(xsymm)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, int *, double *, double *, int *);
+
+int BLASFUNC(csymm3m)(char *, char *, int *, int *, float  *, float  *, int *,
+	   float  *, int *, float  *, float  *, int *);
+int BLASFUNC(zsymm3m)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, int *, double *, double *, int *);
+int BLASFUNC(xsymm3m)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, int *, double *, double *, int *);
+
+int BLASFUNC(ssyrk)(char *, char *, int *, int *, float  *, float  *, int *,
+	   float  *, float  *, int *);
+int BLASFUNC(dsyrk)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, double *, int *);
+int BLASFUNC(qsyrk)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, double *, int *);
+int BLASFUNC(csyrk)(char *, char *, int *, int *, float  *, float  *, int *,
+	   float  *, float  *, int *);
+int BLASFUNC(zsyrk)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, double *, int *);
+int BLASFUNC(xsyrk)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, double *, int *);
+
+int BLASFUNC(ssyr2k)(char *, char *, int *, int *, float  *, float  *, int *,
+	   float *, int *, float  *, float  *, int *);
+int BLASFUNC(dsyr2k)(char *, char *, int *, int *, double *, double *, int *,
+	   double*, int *, double *, double *, int *);
+int BLASFUNC(qsyr2k)(char *, char *, int *, int *, double *, double *, int *,
+	   double*, int *, double *, double *, int *);
+int BLASFUNC(csyr2k)(char *, char *, int *, int *, float  *, float  *, int *,
+	   float *, int *, float  *, float  *, int *);
+int BLASFUNC(zsyr2k)(char *, char *, int *, int *, double *, double *, int *,
+	   double*, int *, double *, double *, int *);
+int BLASFUNC(xsyr2k)(char *, char *, int *, int *, double *, double *, int *,
+	   double*, int *, double *, double *, int *);
+
+int BLASFUNC(chemm)(char *, char *, int *, int *, float  *, float  *, int *,
+	   float  *, int *, float  *, float  *, int *);
+int BLASFUNC(zhemm)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, int *, double *, double *, int *);
+int BLASFUNC(xhemm)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, int *, double *, double *, int *);
+
+int BLASFUNC(chemm3m)(char *, char *, int *, int *, float  *, float  *, int *,
+	   float  *, int *, float  *, float  *, int *);
+int BLASFUNC(zhemm3m)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, int *, double *, double *, int *);
+int BLASFUNC(xhemm3m)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, int *, double *, double *, int *);
+
+int BLASFUNC(cherk)(char *, char *, int *, int *, float  *, float  *, int *,
+	   float  *, float  *, int *);
+int BLASFUNC(zherk)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, double *, int *);
+int BLASFUNC(xherk)(char *, char *, int *, int *, double *, double *, int *,
+	   double *, double *, int *);
+
+int BLASFUNC(cher2k)(char *, char *, int *, int *, float  *, float  *, int *,
+	   float *, int *, float  *, float  *, int *);
+int BLASFUNC(zher2k)(char *, char *, int *, int *, double *, double *, int *,
+	   double*, int *, double *, double *, int *);
+int BLASFUNC(xher2k)(char *, char *, int *, int *, double *, double *, int *,
+	   double*, int *, double *, double *, int *);
+int BLASFUNC(cher2m)(char *, char *, char *, int *, int *, float  *, float  *, int *,
+	   float *, int *, float  *, float  *, int *);
+int BLASFUNC(zher2m)(char *, char *, char *, int *, int *, double *, double *, int *,
+	   double*, int *, double *, double *, int *);
+int BLASFUNC(xher2m)(char *, char *, char *, int *, int *, double *, double *, int *,
+	   double*, int *, double *, double *, int *);
+
+int BLASFUNC(sgemt)(char *, int *, int *, float  *, float  *, int *,
+		    float  *, int *);
+int BLASFUNC(dgemt)(char *, int *, int *, double *, double *, int *,
+		    double *, int *);
+int BLASFUNC(cgemt)(char *, int *, int *, float  *, float  *, int *,
+		    float  *, int *);
+int BLASFUNC(zgemt)(char *, int *, int *, double *, double *, int *,
+		    double *, int *);
+
+int BLASFUNC(sgema)(char *, char *, int *, int *, float  *,
+		    float  *, int *, float *, float  *, int *, float *, int *);
+int BLASFUNC(dgema)(char *, char *, int *, int *, double *,
+		    double *, int *, double*, double *, int *, double*, int *);
+int BLASFUNC(cgema)(char *, char *, int *, int *, float  *,
+		    float  *, int *, float *, float  *, int *, float *, int *);
+int BLASFUNC(zgema)(char *, char *, int *, int *, double *,
+		    double *, int *, double*, double *, int *, double*, int *);
+
+int BLASFUNC(sgems)(char *, char *, int *, int *, float  *,
+		    float  *, int *, float *, float  *, int *, float *, int *);
+int BLASFUNC(dgems)(char *, char *, int *, int *, double *,
+		    double *, int *, double*, double *, int *, double*, int *);
+int BLASFUNC(cgems)(char *, char *, int *, int *, float  *,
+		    float  *, int *, float *, float  *, int *, float *, int *);
+int BLASFUNC(zgems)(char *, char *, int *, int *, double *,
+		    double *, int *, double*, double *, int *, double*, int *);
+
+int BLASFUNC(sgetf2)(int *, int *, float  *, int *, int *, int *);
+int BLASFUNC(dgetf2)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(qgetf2)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(cgetf2)(int *, int *, float  *, int *, int *, int *);
+int BLASFUNC(zgetf2)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(xgetf2)(int *, int *, double *, int *, int *, int *);
+
+int BLASFUNC(sgetrf)(int *, int *, float  *, int *, int *, int *);
+int BLASFUNC(dgetrf)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(qgetrf)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(cgetrf)(int *, int *, float  *, int *, int *, int *);
+int BLASFUNC(zgetrf)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(xgetrf)(int *, int *, double *, int *, int *, int *);
+
+int BLASFUNC(slaswp)(int *, float  *, int *, int *, int *, int *, int *);
+int BLASFUNC(dlaswp)(int *, double *, int *, int *, int *, int *, int *);
+int BLASFUNC(qlaswp)(int *, double *, int *, int *, int *, int *, int *);
+int BLASFUNC(claswp)(int *, float  *, int *, int *, int *, int *, int *);
+int BLASFUNC(zlaswp)(int *, double *, int *, int *, int *, int *, int *);
+int BLASFUNC(xlaswp)(int *, double *, int *, int *, int *, int *, int *);
+
+int BLASFUNC(sgetrs)(char *, int *, int *, float  *, int *, int *, float  *, int *, int *);
+int BLASFUNC(dgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+int BLASFUNC(qgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+int BLASFUNC(cgetrs)(char *, int *, int *, float  *, int *, int *, float  *, int *, int *);
+int BLASFUNC(zgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+int BLASFUNC(xgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+
+int BLASFUNC(sgesv)(int *, int *, float  *, int *, int *, float *, int *, int *);
+int BLASFUNC(dgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+int BLASFUNC(qgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+int BLASFUNC(cgesv)(int *, int *, float  *, int *, int *, float *, int *, int *);
+int BLASFUNC(zgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+int BLASFUNC(xgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+
+int BLASFUNC(spotf2)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dpotf2)(char *, int *, double *, int *, int *);
+int BLASFUNC(qpotf2)(char *, int *, double *, int *, int *);
+int BLASFUNC(cpotf2)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zpotf2)(char *, int *, double *, int *, int *);
+int BLASFUNC(xpotf2)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(spotrf)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dpotrf)(char *, int *, double *, int *, int *);
+int BLASFUNC(qpotrf)(char *, int *, double *, int *, int *);
+int BLASFUNC(cpotrf)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zpotrf)(char *, int *, double *, int *, int *);
+int BLASFUNC(xpotrf)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(slauu2)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dlauu2)(char *, int *, double *, int *, int *);
+int BLASFUNC(qlauu2)(char *, int *, double *, int *, int *);
+int BLASFUNC(clauu2)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zlauu2)(char *, int *, double *, int *, int *);
+int BLASFUNC(xlauu2)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(slauum)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dlauum)(char *, int *, double *, int *, int *);
+int BLASFUNC(qlauum)(char *, int *, double *, int *, int *);
+int BLASFUNC(clauum)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zlauum)(char *, int *, double *, int *, int *);
+int BLASFUNC(xlauum)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(strti2)(char *, char *, int *, float  *, int *, int *);
+int BLASFUNC(dtrti2)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(qtrti2)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(ctrti2)(char *, char *, int *, float  *, int *, int *);
+int BLASFUNC(ztrti2)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(xtrti2)(char *, char *, int *, double *, int *, int *);
+
+int BLASFUNC(strtri)(char *, char *, int *, float  *, int *, int *);
+int BLASFUNC(dtrtri)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(qtrtri)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(ctrtri)(char *, char *, int *, float  *, int *, int *);
+int BLASFUNC(ztrtri)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(xtrtri)(char *, char *, int *, double *, int *, int *);
+
+int BLASFUNC(spotri)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dpotri)(char *, int *, double *, int *, int *);
+int BLASFUNC(qpotri)(char *, int *, double *, int *, int *);
+int BLASFUNC(cpotri)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zpotri)(char *, int *, double *, int *, int *);
+int BLASFUNC(xpotri)(char *, int *, double *, int *, int *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/eigen3/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/include/eigen3/Eigen/src/plugins/ArrayCwiseBinaryOps.h
new file mode 100644
index 0000000..5c8c476
--- /dev/null
+++ b/include/eigen3/Eigen/src/plugins/ArrayCwiseBinaryOps.h
@@ -0,0 +1,211 @@
+/** \returns an expression of the coefficient wise product of \c *this and \a other
+  *
+  * \sa MatrixBase::cwiseProduct
+  */
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)
+operator*(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient wise quotient of \c *this and \a other
+  *
+  * \sa MatrixBase::cwiseQuotient
+  */
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
+operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise min of \c *this and \a other
+  *
+  * Example: \include Cwise_min.cpp
+  * Output: \verbinclude Cwise_min.out
+  *
+  * \sa max()
+  */
+EIGEN_MAKE_CWISE_BINARY_OP(min,internal::scalar_min_op)
+
+/** \returns an expression of the coefficient-wise min of \c *this and scalar \a other
+  *
+  * \sa max()
+  */
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived,
+                                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+min
+#else
+(min)
+#endif
+(const Scalar &other) const
+{
+  return (min)(Derived::PlainObject::Constant(rows(), cols(), other));
+}
+
+/** \returns an expression of the coefficient-wise max of \c *this and \a other
+  *
+  * Example: \include Cwise_max.cpp
+  * Output: \verbinclude Cwise_max.out
+  *
+  * \sa min()
+  */
+EIGEN_MAKE_CWISE_BINARY_OP(max,internal::scalar_max_op)
+
+/** \returns an expression of the coefficient-wise max of \c *this and scalar \a other
+  *
+  * \sa min()
+  */
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived,
+                                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+max
+#else
+(max)
+#endif
+(const Scalar &other) const
+{
+  return (max)(Derived::PlainObject::Constant(rows(), cols(), other));
+}
+
+/** \returns an expression of the coefficient-wise \< operator of *this and \a other
+  *
+  * Example: \include Cwise_less.cpp
+  * Output: \verbinclude Cwise_less.out
+  *
+  * \sa all(), any(), operator>(), operator<=()
+  */
+EIGEN_MAKE_CWISE_BINARY_OP(operator<,std::less)
+
+/** \returns an expression of the coefficient-wise \<= operator of *this and \a other
+  *
+  * Example: \include Cwise_less_equal.cpp
+  * Output: \verbinclude Cwise_less_equal.out
+  *
+  * \sa all(), any(), operator>=(), operator<()
+  */
+EIGEN_MAKE_CWISE_BINARY_OP(operator<=,std::less_equal)
+
+/** \returns an expression of the coefficient-wise \> operator of *this and \a other
+  *
+  * Example: \include Cwise_greater.cpp
+  * Output: \verbinclude Cwise_greater.out
+  *
+  * \sa all(), any(), operator>=(), operator<()
+  */
+EIGEN_MAKE_CWISE_BINARY_OP(operator>,std::greater)
+
+/** \returns an expression of the coefficient-wise \>= operator of *this and \a other
+  *
+  * Example: \include Cwise_greater_equal.cpp
+  * Output: \verbinclude Cwise_greater_equal.out
+  *
+  * \sa all(), any(), operator>(), operator<=()
+  */
+EIGEN_MAKE_CWISE_BINARY_OP(operator>=,std::greater_equal)
+
+/** \returns an expression of the coefficient-wise == operator of *this and \a other
+  *
+  * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
+  * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
+  * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
+  * isMuchSmallerThan().
+  *
+  * Example: \include Cwise_equal_equal.cpp
+  * Output: \verbinclude Cwise_equal_equal.out
+  *
+  * \sa all(), any(), isApprox(), isMuchSmallerThan()
+  */
+EIGEN_MAKE_CWISE_BINARY_OP(operator==,std::equal_to)
+
+/** \returns an expression of the coefficient-wise != operator of *this and \a other
+  *
+  * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
+  * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
+  * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
+  * isMuchSmallerThan().
+  *
+  * Example: \include Cwise_not_equal.cpp
+  * Output: \verbinclude Cwise_not_equal.out
+  *
+  * \sa all(), any(), isApprox(), isMuchSmallerThan()
+  */
+EIGEN_MAKE_CWISE_BINARY_OP(operator!=,std::not_equal_to)
+
+// scalar addition
+
+/** \returns an expression of \c *this with each coeff incremented by the constant \a scalar
+  *
+  * Example: \include Cwise_plus.cpp
+  * Output: \verbinclude Cwise_plus.out
+  *
+  * \sa operator+=(), operator-()
+  */
+inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
+operator+(const Scalar& scalar) const
+{
+  return CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>(derived(), internal::scalar_add_op<Scalar>(scalar));
+}
+
+friend inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
+operator+(const Scalar& scalar,const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& other)
+{
+  return other + scalar;
+}
+
+/** \returns an expression of \c *this with each coeff decremented by the constant \a scalar
+  *
+  * Example: \include Cwise_minus.cpp
+  * Output: \verbinclude Cwise_minus.out
+  *
+  * \sa operator+(), operator-=()
+  */
+inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
+operator-(const Scalar& scalar) const
+{
+  return *this + (-scalar);
+}
+
+friend inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> >
+operator-(const Scalar& scalar,const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& other)
+{
+  return (-other) + scalar;
+}
+
+/** \returns an expression of the coefficient-wise && operator of *this and \a other
+  *
+  * \warning this operator is for expression of bool only.
+  *
+  * Example: \include Cwise_boolean_and.cpp
+  * Output: \verbinclude Cwise_boolean_and.out
+  *
+  * \sa operator||(), select()
+  */
+template<typename OtherDerived>
+inline const CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
+operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
+                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
+  return CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>(derived(),other.derived());
+}
+
+/** \returns an expression of the coefficient-wise || operator of *this and \a other
+  *
+  * \warning this operator is for expression of bool only.
+  *
+  * Example: \include Cwise_boolean_or.cpp
+  * Output: \verbinclude Cwise_boolean_or.out
+  *
+  * \sa operator&&(), select()
+  */
+template<typename OtherDerived>
+inline const CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
+operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
+                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
+  return CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>(derived(),other.derived());
+}
diff --git a/include/eigen3/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/include/eigen3/Eigen/src/plugins/ArrayCwiseUnaryOps.h
new file mode 100644
index 0000000..a596367
--- /dev/null
+++ b/include/eigen3/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -0,0 +1,203 @@
+
+
+/** \returns an expression of the coefficient-wise absolute value of \c *this
+  *
+  * Example: \include Cwise_abs.cpp
+  * Output: \verbinclude Cwise_abs.out
+  *
+  * \sa abs2()
+  */
+EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
+abs() const
+{
+  return derived();
+}
+
+/** \returns an expression of the coefficient-wise squared absolute value of \c *this
+  *
+  * Example: \include Cwise_abs2.cpp
+  * Output: \verbinclude Cwise_abs2.out
+  *
+  * \sa abs(), square()
+  */
+EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived>
+abs2() const
+{
+  return derived();
+}
+
+/** \returns an expression of the coefficient-wise exponential of *this.
+  *
+  * Example: \include Cwise_exp.cpp
+  * Output: \verbinclude Cwise_exp.out
+  *
+  * \sa pow(), log(), sin(), cos()
+  */
+inline const CwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived>
+exp() const
+{
+  return derived();
+}
+
+/** \returns an expression of the coefficient-wise logarithm of *this.
+  *
+  * Example: \include Cwise_log.cpp
+  * Output: \verbinclude Cwise_log.out
+  *
+  * \sa exp()
+  */
+inline const CwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
+log() const
+{
+  return derived();
+}
+
+/** \returns an expression of the coefficient-wise square root of *this.
+  *
+  * Example: \include Cwise_sqrt.cpp
+  * Output: \verbinclude Cwise_sqrt.out
+  *
+  * \sa pow(), square()
+  */
+inline const CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
+sqrt() const
+{
+  return derived();
+}
+
+/** \returns an expression of the coefficient-wise cosine of *this.
+  *
+  * Example: \include Cwise_cos.cpp
+  * Output: \verbinclude Cwise_cos.out
+  *
+  * \sa sin(), acos()
+  */
+inline const CwiseUnaryOp<internal::scalar_cos_op<Scalar>, const Derived>
+cos() const
+{
+  return derived();
+}
+
+
+/** \returns an expression of the coefficient-wise sine of *this.
+  *
+  * Example: \include Cwise_sin.cpp
+  * Output: \verbinclude Cwise_sin.out
+  *
+  * \sa cos(), asin()
+  */
+inline const CwiseUnaryOp<internal::scalar_sin_op<Scalar>, const Derived>
+sin() const
+{
+  return derived();
+}
+
+/** \returns an expression of the coefficient-wise arc cosine of *this.
+  *
+  * Example: \include Cwise_acos.cpp
+  * Output: \verbinclude Cwise_acos.out
+  *
+  * \sa cos(), asin()
+  */
+inline const CwiseUnaryOp<internal::scalar_acos_op<Scalar>, const Derived>
+acos() const
+{
+  return derived();
+}
+
+/** \returns an expression of the coefficient-wise arc sine of *this.
+  *
+  * Example: \include Cwise_asin.cpp
+  * Output: \verbinclude Cwise_asin.out
+  *
+  * \sa sin(), acos()
+  */
+inline const CwiseUnaryOp<internal::scalar_asin_op<Scalar>, const Derived>
+asin() const
+{
+  return derived();
+}
+
+/** \returns an expression of the coefficient-wise tan of *this.
+  *
+  * Example: \include Cwise_tan.cpp
+  * Output: \verbinclude Cwise_tan.out
+  *
+  * \sa cos(), sin()
+  */
+inline const CwiseUnaryOp<internal::scalar_tan_op<Scalar>, Derived>
+tan() const
+{
+  return derived();
+}
+
+
+/** \returns an expression of the coefficient-wise power of *this to the given exponent.
+  *
+  * Example: \include Cwise_pow.cpp
+  * Output: \verbinclude Cwise_pow.out
+  *
+  * \sa exp(), log()
+  */
+inline const CwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
+pow(const Scalar& exponent) const
+{
+  return CwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
+          (derived(), internal::scalar_pow_op<Scalar>(exponent));
+}
+
+
+/** \returns an expression of the coefficient-wise inverse of *this.
+  *
+  * Example: \include Cwise_inverse.cpp
+  * Output: \verbinclude Cwise_inverse.out
+  *
+  * \sa operator/(), operator*()
+  */
+inline const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
+inverse() const
+{
+  return derived();
+}
+
+/** \returns an expression of the coefficient-wise square of *this.
+  *
+  * Example: \include Cwise_square.cpp
+  * Output: \verbinclude Cwise_square.out
+  *
+  * \sa operator/(), operator*(), abs2()
+  */
+inline const CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived>
+square() const
+{
+  return derived();
+}
+
+/** \returns an expression of the coefficient-wise cube of *this.
+  *
+  * Example: \include Cwise_cube.cpp
+  * Output: \verbinclude Cwise_cube.out
+  *
+  * \sa square(), pow()
+  */
+inline const CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived>
+cube() const
+{
+  return derived();
+}
+
+#define EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(METHOD_NAME,FUNCTOR) \
+  inline const CwiseUnaryOp<std::binder2nd<FUNCTOR<Scalar> >, const Derived> \
+  METHOD_NAME(const Scalar& s) const { \
+    return CwiseUnaryOp<std::binder2nd<FUNCTOR<Scalar> >, const Derived> \
+            (derived(), std::bind2nd(FUNCTOR<Scalar>(), s)); \
+  }
+
+EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator==,  std::equal_to)
+EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator!=,  std::not_equal_to)
+EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator<,   std::less)
+EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator<=,  std::less_equal)
+EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator>,   std::greater)
+EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator>=,  std::greater_equal)
+
+
diff --git a/include/eigen3/Eigen/src/plugins/BlockMethods.h b/include/eigen3/Eigen/src/plugins/BlockMethods.h
new file mode 100644
index 0000000..2788251
--- /dev/null
+++ b/include/eigen3/Eigen/src/plugins/BlockMethods.h
@@ -0,0 +1,935 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+/** \internal expression type of a column */
+typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, 1, !IsRowMajor> ColXpr;
+typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, 1, !IsRowMajor> ConstColXpr;
+/** \internal expression type of a row */
+typedef Block<Derived, 1, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> RowXpr;
+typedef const Block<const Derived, 1, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> ConstRowXpr;
+/** \internal expression type of a block of whole columns */
+typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, Dynamic, !IsRowMajor> ColsBlockXpr;
+typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, Dynamic, !IsRowMajor> ConstColsBlockXpr;
+/** \internal expression type of a block of whole rows */
+typedef Block<Derived, Dynamic, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> RowsBlockXpr;
+typedef const Block<const Derived, Dynamic, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> ConstRowsBlockXpr;
+/** \internal expression type of a block of whole columns */
+template<int N> struct NColsBlockXpr { typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, N, !IsRowMajor> Type; };
+template<int N> struct ConstNColsBlockXpr { typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, N, !IsRowMajor> Type; };
+/** \internal expression type of a block of whole rows */
+template<int N> struct NRowsBlockXpr { typedef Block<Derived, N, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> Type; };
+template<int N> struct ConstNRowsBlockXpr { typedef const Block<const Derived, N, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> Type; };
+
+typedef VectorBlock<Derived> SegmentReturnType;
+typedef const VectorBlock<const Derived> ConstSegmentReturnType;
+template<int Size> struct FixedSegmentReturnType { typedef VectorBlock<Derived, Size> Type; };
+template<int Size> struct ConstFixedSegmentReturnType { typedef const VectorBlock<const Derived, Size> Type; };
+
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+/** \returns a dynamic-size expression of a block in *this.
+  *
+  * \param startRow the first row in the block
+  * \param startCol the first column in the block
+  * \param blockRows the number of rows in the block
+  * \param blockCols the number of columns in the block
+  *
+  * Example: \include MatrixBase_block_int_int_int_int.cpp
+  * Output: \verbinclude MatrixBase_block_int_int_int_int.out
+  *
+  * \note Even though the returned expression has dynamic size, in the case
+  * when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
+  * which means that evaluating it does not cause a dynamic memory allocation.
+  *
+  * \sa class Block, block(Index,Index)
+  */
+inline Block<Derived> block(Index startRow, Index startCol, Index blockRows, Index blockCols)
+{
+  return Block<Derived>(derived(), startRow, startCol, blockRows, blockCols);
+}
+
+/** This is the const version of block(Index,Index,Index,Index). */
+inline const Block<const Derived> block(Index startRow, Index startCol, Index blockRows, Index blockCols) const
+{
+  return Block<const Derived>(derived(), startRow, startCol, blockRows, blockCols);
+}
+
+
+
+
+/** \returns a dynamic-size expression of a top-right corner of *this.
+  *
+  * \param cRows the number of rows in the corner
+  * \param cCols the number of columns in the corner
+  *
+  * Example: \include MatrixBase_topRightCorner_int_int.cpp
+  * Output: \verbinclude MatrixBase_topRightCorner_int_int.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+inline Block<Derived> topRightCorner(Index cRows, Index cCols)
+{
+  return Block<Derived>(derived(), 0, cols() - cCols, cRows, cCols);
+}
+
+/** This is the const version of topRightCorner(Index, Index).*/
+inline const Block<const Derived> topRightCorner(Index cRows, Index cCols) const
+{
+  return Block<const Derived>(derived(), 0, cols() - cCols, cRows, cCols);
+}
+
+/** \returns an expression of a fixed-size top-right corner of *this.
+  *
+  * \tparam CRows the number of rows in the corner
+  * \tparam CCols the number of columns in the corner
+  *
+  * Example: \include MatrixBase_template_int_int_topRightCorner.cpp
+  * Output: \verbinclude MatrixBase_template_int_int_topRightCorner.out
+  *
+  * \sa class Block, block<int,int>(Index,Index)
+  */
+template<int CRows, int CCols>
+inline Block<Derived, CRows, CCols> topRightCorner()
+{
+  return Block<Derived, CRows, CCols>(derived(), 0, cols() - CCols);
+}
+
+/** This is the const version of topRightCorner<int, int>().*/
+template<int CRows, int CCols>
+inline const Block<const Derived, CRows, CCols> topRightCorner() const
+{
+  return Block<const Derived, CRows, CCols>(derived(), 0, cols() - CCols);
+}
+
+/** \returns an expression of a top-right corner of *this.
+  *
+  * \tparam CRows number of rows in corner as specified at compile-time
+  * \tparam CCols number of columns in corner as specified at compile-time
+  * \param  cRows number of rows in corner as specified at run-time
+  * \param  cCols number of columns in corner as specified at run-time
+  *
+  * This function is mainly useful for corners where the number of rows is specified at compile-time
+  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+  * information should not contradict. In other words, \a cRows should equal \a CRows unless
+  * \a CRows is \a Dynamic, and the same for the number of columns.
+  *
+  * Example: \include MatrixBase_template_int_int_topRightCorner_int_int.cpp
+  * Output: \verbinclude MatrixBase_template_int_int_topRightCorner_int_int.out
+  *
+  * \sa class Block
+  */
+template<int CRows, int CCols>
+inline Block<Derived, CRows, CCols> topRightCorner(Index cRows, Index cCols)
+{
+  return Block<Derived, CRows, CCols>(derived(), 0, cols() - cCols, cRows, cCols);
+}
+
+/** This is the const version of topRightCorner<int, int>(Index, Index).*/
+template<int CRows, int CCols>
+inline const Block<const Derived, CRows, CCols> topRightCorner(Index cRows, Index cCols) const
+{
+  return Block<const Derived, CRows, CCols>(derived(), 0, cols() - cCols, cRows, cCols);
+}
+
+
+
+/** \returns a dynamic-size expression of a top-left corner of *this.
+  *
+  * \param cRows the number of rows in the corner
+  * \param cCols the number of columns in the corner
+  *
+  * Example: \include MatrixBase_topLeftCorner_int_int.cpp
+  * Output: \verbinclude MatrixBase_topLeftCorner_int_int.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+inline Block<Derived> topLeftCorner(Index cRows, Index cCols)
+{
+  return Block<Derived>(derived(), 0, 0, cRows, cCols);
+}
+
+/** This is the const version of topLeftCorner(Index, Index).*/
+inline const Block<const Derived> topLeftCorner(Index cRows, Index cCols) const
+{
+  return Block<const Derived>(derived(), 0, 0, cRows, cCols);
+}
+
+/** \returns an expression of a fixed-size top-left corner of *this.
+  *
+  * The template parameters CRows and CCols are the number of rows and columns in the corner.
+  *
+  * Example: \include MatrixBase_template_int_int_topLeftCorner.cpp
+  * Output: \verbinclude MatrixBase_template_int_int_topLeftCorner.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+template<int CRows, int CCols>
+inline Block<Derived, CRows, CCols> topLeftCorner()
+{
+  return Block<Derived, CRows, CCols>(derived(), 0, 0);
+}
+
+/** This is the const version of topLeftCorner<int, int>().*/
+template<int CRows, int CCols>
+inline const Block<const Derived, CRows, CCols> topLeftCorner() const
+{
+  return Block<const Derived, CRows, CCols>(derived(), 0, 0);
+}
+
+/** \returns an expression of a top-left corner of *this.
+  *
+  * \tparam CRows number of rows in corner as specified at compile-time
+  * \tparam CCols number of columns in corner as specified at compile-time
+  * \param  cRows number of rows in corner as specified at run-time
+  * \param  cCols number of columns in corner as specified at run-time
+  *
+  * This function is mainly useful for corners where the number of rows is specified at compile-time
+  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+  * information should not contradict. In other words, \a cRows should equal \a CRows unless
+  * \a CRows is \a Dynamic, and the same for the number of columns.
+  *
+  * Example: \include MatrixBase_template_int_int_topLeftCorner_int_int.cpp
+  * Output: \verbinclude MatrixBase_template_int_int_topLeftCorner_int_int.out
+  *
+  * \sa class Block
+  */
+template<int CRows, int CCols>
+inline Block<Derived, CRows, CCols> topLeftCorner(Index cRows, Index cCols)
+{
+  return Block<Derived, CRows, CCols>(derived(), 0, 0, cRows, cCols);
+}
+
+/** This is the const version of topLeftCorner<int, int>(Index, Index).*/
+template<int CRows, int CCols>
+inline const Block<const Derived, CRows, CCols> topLeftCorner(Index cRows, Index cCols) const
+{
+  return Block<const Derived, CRows, CCols>(derived(), 0, 0, cRows, cCols);
+}
+
+
+
+/** \returns a dynamic-size expression of a bottom-right corner of *this.
+  *
+  * \param cRows the number of rows in the corner
+  * \param cCols the number of columns in the corner
+  *
+  * Example: \include MatrixBase_bottomRightCorner_int_int.cpp
+  * Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+inline Block<Derived> bottomRightCorner(Index cRows, Index cCols)
+{
+  return Block<Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+}
+
+/** This is the const version of bottomRightCorner(Index, Index).*/
+inline const Block<const Derived> bottomRightCorner(Index cRows, Index cCols) const
+{
+  return Block<const Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+}
+
+/** \returns an expression of a fixed-size bottom-right corner of *this.
+  *
+  * The template parameters CRows and CCols are the number of rows and columns in the corner.
+  *
+  * Example: \include MatrixBase_template_int_int_bottomRightCorner.cpp
+  * Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+template<int CRows, int CCols>
+inline Block<Derived, CRows, CCols> bottomRightCorner()
+{
+  return Block<Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
+}
+
+/** This is the const version of bottomRightCorner<int, int>().*/
+template<int CRows, int CCols>
+inline const Block<const Derived, CRows, CCols> bottomRightCorner() const
+{
+  return Block<const Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
+}
+
+/** \returns an expression of a bottom-right corner of *this.
+  *
+  * \tparam CRows number of rows in corner as specified at compile-time
+  * \tparam CCols number of columns in corner as specified at compile-time
+  * \param  cRows number of rows in corner as specified at run-time
+  * \param  cCols number of columns in corner as specified at run-time
+  *
+  * This function is mainly useful for corners where the number of rows is specified at compile-time
+  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+  * information should not contradict. In other words, \a cRows should equal \a CRows unless
+  * \a CRows is \a Dynamic, and the same for the number of columns.
+  *
+  * Example: \include MatrixBase_template_int_int_bottomRightCorner_int_int.cpp
+  * Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner_int_int.out
+  *
+  * \sa class Block
+  */
+template<int CRows, int CCols>
+inline Block<Derived, CRows, CCols> bottomRightCorner(Index cRows, Index cCols)
+{
+  return Block<Derived, CRows, CCols>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+}
+
+/** This is the const version of bottomRightCorner<int, int>(Index, Index).*/
+template<int CRows, int CCols>
+inline const Block<const Derived, CRows, CCols> bottomRightCorner(Index cRows, Index cCols) const
+{
+  return Block<const Derived, CRows, CCols>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+}
+
+
+
+/** \returns a dynamic-size expression of a bottom-left corner of *this.
+  *
+  * \param cRows the number of rows in the corner
+  * \param cCols the number of columns in the corner
+  *
+  * Example: \include MatrixBase_bottomLeftCorner_int_int.cpp
+  * Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+inline Block<Derived> bottomLeftCorner(Index cRows, Index cCols)
+{
+  return Block<Derived>(derived(), rows() - cRows, 0, cRows, cCols);
+}
+
+/** This is the const version of bottomLeftCorner(Index, Index).*/
+inline const Block<const Derived> bottomLeftCorner(Index cRows, Index cCols) const
+{
+  return Block<const Derived>(derived(), rows() - cRows, 0, cRows, cCols);
+}
+
+/** \returns an expression of a fixed-size bottom-left corner of *this.
+  *
+  * The template parameters CRows and CCols are the number of rows and columns in the corner.
+  *
+  * Example: \include MatrixBase_template_int_int_bottomLeftCorner.cpp
+  * Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+template<int CRows, int CCols>
+inline Block<Derived, CRows, CCols> bottomLeftCorner()
+{
+  return Block<Derived, CRows, CCols>(derived(), rows() - CRows, 0);
+}
+
+/** This is the const version of bottomLeftCorner<int, int>().*/
+template<int CRows, int CCols>
+inline const Block<const Derived, CRows, CCols> bottomLeftCorner() const
+{
+  return Block<const Derived, CRows, CCols>(derived(), rows() - CRows, 0);
+}
+
+/** \returns an expression of a bottom-left corner of *this.
+  *
+  * \tparam CRows number of rows in corner as specified at compile-time
+  * \tparam CCols number of columns in corner as specified at compile-time
+  * \param  cRows number of rows in corner as specified at run-time
+  * \param  cCols number of columns in corner as specified at run-time
+  *
+  * This function is mainly useful for corners where the number of rows is specified at compile-time
+  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+  * information should not contradict. In other words, \a cRows should equal \a CRows unless
+  * \a CRows is \a Dynamic, and the same for the number of columns.
+  *
+  * Example: \include MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp
+  * Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner_int_int.out
+  *
+  * \sa class Block
+  */
+template<int CRows, int CCols>
+inline Block<Derived, CRows, CCols> bottomLeftCorner(Index cRows, Index cCols)
+{
+  return Block<Derived, CRows, CCols>(derived(), rows() - cRows, 0, cRows, cCols);
+}
+
+/** This is the const version of bottomLeftCorner<int, int>(Index, Index).*/
+template<int CRows, int CCols>
+inline const Block<const Derived, CRows, CCols> bottomLeftCorner(Index cRows, Index cCols) const
+{
+  return Block<const Derived, CRows, CCols>(derived(), rows() - cRows, 0, cRows, cCols);
+}
+
+
+
+/** \returns a block consisting of the top rows of *this.
+  *
+  * \param n the number of rows in the block
+  *
+  * Example: \include MatrixBase_topRows_int.cpp
+  * Output: \verbinclude MatrixBase_topRows_int.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+inline RowsBlockXpr topRows(Index n)
+{
+  return RowsBlockXpr(derived(), 0, 0, n, cols());
+}
+
+/** This is the const version of topRows(Index).*/
+inline ConstRowsBlockXpr topRows(Index n) const
+{
+  return ConstRowsBlockXpr(derived(), 0, 0, n, cols());
+}
+
+/** \returns a block consisting of the top rows of *this.
+  *
+  * \tparam N the number of rows in the block as specified at compile-time
+  * \param n the number of rows in the block as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
+  *
+  * Example: \include MatrixBase_template_int_topRows.cpp
+  * Output: \verbinclude MatrixBase_template_int_topRows.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+template<int N>
+inline typename NRowsBlockXpr<N>::Type topRows(Index n = N)
+{
+  return typename NRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
+}
+
+/** This is the const version of topRows<int>().*/
+template<int N>
+inline typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
+{
+  return typename ConstNRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
+}
+
+
+
+/** \returns a block consisting of the bottom rows of *this.
+  *
+  * \param n the number of rows in the block
+  *
+  * Example: \include MatrixBase_bottomRows_int.cpp
+  * Output: \verbinclude MatrixBase_bottomRows_int.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+inline RowsBlockXpr bottomRows(Index n)
+{
+  return RowsBlockXpr(derived(), rows() - n, 0, n, cols());
+}
+
+/** This is the const version of bottomRows(Index).*/
+inline ConstRowsBlockXpr bottomRows(Index n) const
+{
+  return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols());
+}
+
+/** \returns a block consisting of the bottom rows of *this.
+  *
+  * \tparam N the number of rows in the block as specified at compile-time
+  * \param n the number of rows in the block as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
+  *
+  * Example: \include MatrixBase_template_int_bottomRows.cpp
+  * Output: \verbinclude MatrixBase_template_int_bottomRows.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+template<int N>
+inline typename NRowsBlockXpr<N>::Type bottomRows(Index n = N)
+{
+  return typename NRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
+}
+
+/** This is the const version of bottomRows<int>().*/
+template<int N>
+inline typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
+{
+  return typename ConstNRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
+}
+
+
+
+/** \returns a block consisting of a range of rows of *this.
+  *
+  * \param startRow the index of the first row in the block
+  * \param n the number of rows in the block
+  *
+  * Example: \include DenseBase_middleRows_int.cpp
+  * Output: \verbinclude DenseBase_middleRows_int.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+inline RowsBlockXpr middleRows(Index startRow, Index n)
+{
+  return RowsBlockXpr(derived(), startRow, 0, n, cols());
+}
+
+/** This is the const version of middleRows(Index,Index).*/
+inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const
+{
+  return ConstRowsBlockXpr(derived(), startRow, 0, n, cols());
+}
+
+/** \returns a block consisting of a range of rows of *this.
+  *
+  * \tparam N the number of rows in the block as specified at compile-time
+  * \param startRow the index of the first row in the block
+  * \param n the number of rows in the block as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
+  *
+  * Example: \include DenseBase_template_int_middleRows.cpp
+  * Output: \verbinclude DenseBase_template_int_middleRows.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+template<int N>
+inline typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N)
+{
+  return typename NRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
+}
+
+/** This is the const version of middleRows<int>().*/
+template<int N>
+inline typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N) const
+{
+  return typename ConstNRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
+}
+
+
+
+/** \returns a block consisting of the left columns of *this.
+  *
+  * \param n the number of columns in the block
+  *
+  * Example: \include MatrixBase_leftCols_int.cpp
+  * Output: \verbinclude MatrixBase_leftCols_int.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+inline ColsBlockXpr leftCols(Index n)
+{
+  return ColsBlockXpr(derived(), 0, 0, rows(), n);
+}
+
+/** This is the const version of leftCols(Index).*/
+inline ConstColsBlockXpr leftCols(Index n) const
+{
+  return ConstColsBlockXpr(derived(), 0, 0, rows(), n);
+}
+
+/** \returns a block consisting of the left columns of *this.
+  *
+  * \tparam N the number of columns in the block as specified at compile-time
+  * \param n the number of columns in the block as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
+  *
+  * Example: \include MatrixBase_template_int_leftCols.cpp
+  * Output: \verbinclude MatrixBase_template_int_leftCols.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+template<int N>
+inline typename NColsBlockXpr<N>::Type leftCols(Index n = N)
+{
+  return typename NColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
+}
+
+/** This is the const version of leftCols<int>().*/
+template<int N>
+inline typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
+{
+  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
+}
+
+
+
+/** \returns a block consisting of the right columns of *this.
+  *
+  * \param n the number of columns in the block
+  *
+  * Example: \include MatrixBase_rightCols_int.cpp
+  * Output: \verbinclude MatrixBase_rightCols_int.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+inline ColsBlockXpr rightCols(Index n)
+{
+  return ColsBlockXpr(derived(), 0, cols() - n, rows(), n);
+}
+
+/** This is the const version of rightCols(Index).*/
+inline ConstColsBlockXpr rightCols(Index n) const
+{
+  return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n);
+}
+
+/** \returns a block consisting of the right columns of *this.
+  *
+  * \tparam N the number of columns in the block as specified at compile-time
+  * \param n the number of columns in the block as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
+  *
+  * Example: \include MatrixBase_template_int_rightCols.cpp
+  * Output: \verbinclude MatrixBase_template_int_rightCols.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+template<int N>
+inline typename NColsBlockXpr<N>::Type rightCols(Index n = N)
+{
+  return typename NColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
+}
+
+/** This is the const version of rightCols<int>().*/
+template<int N>
+inline typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
+{
+  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
+}
+
+
+
+/** \returns a block consisting of a range of columns of *this.
+  *
+  * \param startCol the index of the first column in the block
+  * \param numCols the number of columns in the block
+  *
+  * Example: \include DenseBase_middleCols_int.cpp
+  * Output: \verbinclude DenseBase_middleCols_int.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+inline ColsBlockXpr middleCols(Index startCol, Index numCols)
+{
+  return ColsBlockXpr(derived(), 0, startCol, rows(), numCols);
+}
+
+/** This is the const version of middleCols(Index,Index).*/
+inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const
+{
+  return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols);
+}
+
+/** \returns a block consisting of a range of columns of *this.
+  *
+  * \tparam N the number of columns in the block as specified at compile-time
+  * \param startCol the index of the first column in the block
+  * \param n the number of columns in the block as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
+  *
+  * Example: \include DenseBase_template_int_middleCols.cpp
+  * Output: \verbinclude DenseBase_template_int_middleCols.out
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+template<int N>
+inline typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N)
+{
+  return typename NColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
+}
+
+/** This is the const version of middleCols<int>().*/
+template<int N>
+inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) const
+{
+  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
+}
+
+
+
+/** \returns a fixed-size expression of a block in *this.
+  *
+  * The template parameters \a BlockRows and \a BlockCols are the number of
+  * rows and columns in the block.
+  *
+  * \param startRow the first row in the block
+  * \param startCol the first column in the block
+  *
+  * Example: \include MatrixBase_block_int_int.cpp
+  * Output: \verbinclude MatrixBase_block_int_int.out
+  *
+  * \note since block is a templated member, the keyword template has to be used
+  * if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+template<int BlockRows, int BlockCols>
+inline Block<Derived, BlockRows, BlockCols> block(Index startRow, Index startCol)
+{
+  return Block<Derived, BlockRows, BlockCols>(derived(), startRow, startCol);
+}
+
+/** This is the const version of block<>(Index, Index). */
+template<int BlockRows, int BlockCols>
+inline const Block<const Derived, BlockRows, BlockCols> block(Index startRow, Index startCol) const
+{
+  return Block<const Derived, BlockRows, BlockCols>(derived(), startRow, startCol);
+}
+
+/** \returns an expression of a block in *this.
+  *
+  * \tparam BlockRows number of rows in block as specified at compile-time
+  * \tparam BlockCols number of columns in block as specified at compile-time
+  * \param  startRow  the first row in the block
+  * \param  startCol  the first column in the block
+  * \param  blockRows number of rows in block as specified at run-time
+  * \param  blockCols number of columns in block as specified at run-time
+  *
+  * This function is mainly useful for blocks where the number of rows is specified at compile-time
+  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+  * information should not contradict. In other words, \a blockRows should equal \a BlockRows unless
+  * \a BlockRows is \a Dynamic, and the same for the number of columns.
+  *
+  * Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
+  * Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp
+  *
+  * \sa class Block, block(Index,Index,Index,Index)
+  */
+template<int BlockRows, int BlockCols>
+inline Block<Derived, BlockRows, BlockCols> block(Index startRow, Index startCol, 
+                                                  Index blockRows, Index blockCols)
+{
+  return Block<Derived, BlockRows, BlockCols>(derived(), startRow, startCol, blockRows, blockCols);
+}
+
+/** This is the const version of block<>(Index, Index, Index, Index). */
+template<int BlockRows, int BlockCols>
+inline const Block<const Derived, BlockRows, BlockCols> block(Index startRow, Index startCol,
+                                                              Index blockRows, Index blockCols) const
+{
+  return Block<const Derived, BlockRows, BlockCols>(derived(), startRow, startCol, blockRows, blockCols);
+}
+
+/** \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0.
+  *
+  * Example: \include MatrixBase_col.cpp
+  * Output: \verbinclude MatrixBase_col.out
+  *
+  * \sa row(), class Block */
+inline ColXpr col(Index i)
+{
+  return ColXpr(derived(), i);
+}
+
+/** This is the const version of col(). */
+inline ConstColXpr col(Index i) const
+{
+  return ConstColXpr(derived(), i);
+}
+
+/** \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0.
+  *
+  * Example: \include MatrixBase_row.cpp
+  * Output: \verbinclude MatrixBase_row.out
+  *
+  * \sa col(), class Block */
+inline RowXpr row(Index i)
+{
+  return RowXpr(derived(), i);
+}
+
+/** This is the const version of row(). */
+inline ConstRowXpr row(Index i) const
+{
+  return ConstRowXpr(derived(), i);
+}
+
+/** \returns a dynamic-size expression of a segment (i.e. a vector block) in *this.
+  *
+  * \only_for_vectors
+  *
+  * \param start the first coefficient in the segment
+  * \param n the number of coefficients in the segment
+  *
+  * Example: \include MatrixBase_segment_int_int.cpp
+  * Output: \verbinclude MatrixBase_segment_int_int.out
+  *
+  * \note Even though the returned expression has dynamic size, in the case
+  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+  * which means that evaluating it does not cause a dynamic memory allocation.
+  *
+  * \sa class Block, segment(Index)
+  */
+inline SegmentReturnType segment(Index start, Index n)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return SegmentReturnType(derived(), start, n);
+}
+
+
+/** This is the const version of segment(Index,Index).*/
+inline ConstSegmentReturnType segment(Index start, Index n) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return ConstSegmentReturnType(derived(), start, n);
+}
+
+/** \returns a dynamic-size expression of the first coefficients of *this.
+  *
+  * \only_for_vectors
+  *
+  * \param n the number of coefficients in the segment
+  *
+  * Example: \include MatrixBase_start_int.cpp
+  * Output: \verbinclude MatrixBase_start_int.out
+  *
+  * \note Even though the returned expression has dynamic size, in the case
+  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+  * which means that evaluating it does not cause a dynamic memory allocation.
+  *
+  * \sa class Block, block(Index,Index)
+  */
+inline SegmentReturnType head(Index n)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return SegmentReturnType(derived(), 0, n);
+}
+
+/** This is the const version of head(Index).*/
+inline ConstSegmentReturnType head(Index n) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return ConstSegmentReturnType(derived(), 0, n);
+}
+
+/** \returns a dynamic-size expression of the last coefficients of *this.
+  *
+  * \only_for_vectors
+  *
+  * \param n the number of coefficients in the segment
+  *
+  * Example: \include MatrixBase_end_int.cpp
+  * Output: \verbinclude MatrixBase_end_int.out
+  *
+  * \note Even though the returned expression has dynamic size, in the case
+  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+  * which means that evaluating it does not cause a dynamic memory allocation.
+  *
+  * \sa class Block, block(Index,Index)
+  */
+inline SegmentReturnType tail(Index n)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return SegmentReturnType(derived(), this->size() - n, n);
+}
+
+/** This is the const version of tail(Index).*/
+inline ConstSegmentReturnType tail(Index n) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return ConstSegmentReturnType(derived(), this->size() - n, n);
+}
+
+/** \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this
+  *
+  * \only_for_vectors
+  *
+  * \tparam N the number of coefficients in the segment as specified at compile-time
+  * \param start the index of the first element in the segment
+  * \param n the number of coefficients in the segment as specified at compile-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
+  *
+  * Example: \include MatrixBase_template_int_segment.cpp
+  * Output: \verbinclude MatrixBase_template_int_segment.out
+  *
+  * \sa class Block
+  */
+template<int N>
+inline typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename FixedSegmentReturnType<N>::Type(derived(), start, n);
+}
+
+/** This is the const version of segment<int>(Index).*/
+template<int N>
+inline typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index n = N) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename ConstFixedSegmentReturnType<N>::Type(derived(), start, n);
+}
+
+/** \returns a fixed-size expression of the first coefficients of *this.
+  *
+  * \only_for_vectors
+  *
+  * \tparam N the number of coefficients in the segment as specified at compile-time
+  * \param  n the number of coefficients in the segment as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
+  *
+  * Example: \include MatrixBase_template_int_start.cpp
+  * Output: \verbinclude MatrixBase_template_int_start.out
+  *
+  * \sa class Block
+  */
+template<int N>
+inline typename FixedSegmentReturnType<N>::Type head(Index n = N)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename FixedSegmentReturnType<N>::Type(derived(), 0, n);
+}
+
+/** This is the const version of head<int>().*/
+template<int N>
+inline typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename ConstFixedSegmentReturnType<N>::Type(derived(), 0, n);
+}
+
+/** \returns a fixed-size expression of the last coefficients of *this.
+  *
+  * \only_for_vectors
+  *
+  * \tparam N the number of coefficients in the segment as specified at compile-time
+  * \param  n the number of coefficients in the segment as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
+  *
+  * Example: \include MatrixBase_template_int_end.cpp
+  * Output: \verbinclude MatrixBase_template_int_end.out
+  *
+  * \sa class Block
+  */
+template<int N>
+inline typename FixedSegmentReturnType<N>::Type tail(Index n = N)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename FixedSegmentReturnType<N>::Type(derived(), size() - n);
+}
+
+/** This is the const version of tail<int>.*/
+template<int N>
+inline typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename ConstFixedSegmentReturnType<N>::Type(derived(), size() - n);
+}
diff --git a/include/eigen3/Eigen/src/plugins/CommonCwiseBinaryOps.h b/include/eigen3/Eigen/src/plugins/CommonCwiseBinaryOps.h
new file mode 100644
index 0000000..688d224
--- /dev/null
+++ b/include/eigen3/Eigen/src/plugins/CommonCwiseBinaryOps.h
@@ -0,0 +1,46 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This file is a base class plugin containing common coefficient wise functions.
+
+/** \returns an expression of the difference of \c *this and \a other
+  *
+  * \note If you want to substract a given scalar from all coefficients, see Cwise::operator-().
+  *
+  * \sa class CwiseBinaryOp, operator-=()
+  */
+EIGEN_MAKE_CWISE_BINARY_OP(operator-,internal::scalar_difference_op)
+
+/** \returns an expression of the sum of \c *this and \a other
+  *
+  * \note If you want to add a given scalar to all coefficients, see Cwise::operator+().
+  *
+  * \sa class CwiseBinaryOp, operator+=()
+  */
+EIGEN_MAKE_CWISE_BINARY_OP(operator+,internal::scalar_sum_op)
+
+/** \returns an expression of a custom coefficient-wise operator \a func of *this and \a other
+  *
+  * The template parameter \a CustomBinaryOp is the type of the functor
+  * of the custom operator (see class CwiseBinaryOp for an example)
+  *
+  * Here is an example illustrating the use of custom functors:
+  * \include class_CwiseBinaryOp.cpp
+  * Output: \verbinclude class_CwiseBinaryOp.out
+  *
+  * \sa class CwiseBinaryOp, operator+(), operator-(), cwiseProduct()
+  */
+template<typename CustomBinaryOp, typename OtherDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
+binaryExpr(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other, const CustomBinaryOp& func = CustomBinaryOp()) const
+{
+  return CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other.derived(), func);
+}
+
diff --git a/include/eigen3/Eigen/src/plugins/CommonCwiseUnaryOps.h b/include/eigen3/Eigen/src/plugins/CommonCwiseUnaryOps.h
new file mode 100644
index 0000000..08e931a
--- /dev/null
+++ b/include/eigen3/Eigen/src/plugins/CommonCwiseUnaryOps.h
@@ -0,0 +1,172 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This file is a base class plugin containing common coefficient wise functions.
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+/** \internal Represents a scalar multiple of an expression */
+typedef CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived> ScalarMultipleReturnType;
+/** \internal Represents a quotient of an expression by a scalar*/
+typedef CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived> ScalarQuotient1ReturnType;
+/** \internal the return type of conjugate() */
+typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+                    const CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
+                    const Derived&
+                  >::type ConjugateReturnType;
+/** \internal the return type of real() const */
+typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+                    const CwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>,
+                    const Derived&
+                  >::type RealReturnType;
+/** \internal the return type of real() */
+typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+                    CwiseUnaryView<internal::scalar_real_ref_op<Scalar>, Derived>,
+                    Derived&
+                  >::type NonConstRealReturnType;
+/** \internal the return type of imag() const */
+typedef CwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived> ImagReturnType;
+/** \internal the return type of imag() */
+typedef CwiseUnaryView<internal::scalar_imag_ref_op<Scalar>, Derived> NonConstImagReturnType;
+
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+/** \returns an expression of the opposite of \c *this
+  */
+inline const CwiseUnaryOp<internal::scalar_opposite_op<typename internal::traits<Derived>::Scalar>, const Derived>
+operator-() const { return derived(); }
+
+
+/** \returns an expression of \c *this scaled by the scalar factor \a scalar */
+inline const ScalarMultipleReturnType
+operator*(const Scalar& scalar) const
+{
+  return CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
+    (derived(), internal::scalar_multiple_op<Scalar>(scalar));
+}
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+const ScalarMultipleReturnType operator*(const RealScalar& scalar) const;
+#endif
+
+/** \returns an expression of \c *this divided by the scalar value \a scalar */
+inline const CwiseUnaryOp<internal::scalar_quotient1_op<typename internal::traits<Derived>::Scalar>, const Derived>
+operator/(const Scalar& scalar) const
+{
+  return CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
+    (derived(), internal::scalar_quotient1_op<Scalar>(scalar));
+}
+
+/** Overloaded for efficient real matrix times complex scalar value */
+inline const CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
+operator*(const std::complex<Scalar>& scalar) const
+{
+  return CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
+    (*static_cast<const Derived*>(this), internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >(scalar));
+}
+
+inline friend const ScalarMultipleReturnType
+operator*(const Scalar& scalar, const StorageBaseType& matrix)
+{ return matrix*scalar; }
+
+inline friend const CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
+operator*(const std::complex<Scalar>& scalar, const StorageBaseType& matrix)
+{ return matrix*scalar; }
+
+/** \returns an expression of *this with the \a Scalar type casted to
+  * \a NewScalar.
+  *
+  * The template parameter \a NewScalar is the type we are casting the scalars to.
+  *
+  * \sa class CwiseUnaryOp
+  */
+template<typename NewType>
+typename internal::cast_return_type<Derived,const CwiseUnaryOp<internal::scalar_cast_op<typename internal::traits<Derived>::Scalar, NewType>, const Derived> >::type
+cast() const
+{
+  return derived();
+}
+
+/** \returns an expression of the complex conjugate of \c *this.
+  *
+  * \sa adjoint() */
+inline ConjugateReturnType
+conjugate() const
+{
+  return ConjugateReturnType(derived());
+}
+
+/** \returns a read-only expression of the real part of \c *this.
+  *
+  * \sa imag() */
+inline RealReturnType
+real() const { return derived(); }
+
+/** \returns an read-only expression of the imaginary part of \c *this.
+  *
+  * \sa real() */
+inline const ImagReturnType
+imag() const { return derived(); }
+
+/** \brief Apply a unary operator coefficient-wise
+  * \param[in]  func  Functor implementing the unary operator
+  * \tparam  CustomUnaryOp Type of \a func  
+  * \returns An expression of a custom coefficient-wise unary operator \a func of *this
+  *
+  * The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
+  *
+  * Example:
+  * \include class_CwiseUnaryOp_ptrfun.cpp
+  * Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
+  *
+  * Genuine functors allow for more possibilities, for instance it may contain a state.
+  *
+  * Example:
+  * \include class_CwiseUnaryOp.cpp
+  * Output: \verbinclude class_CwiseUnaryOp.out
+  *
+  * \sa class CwiseUnaryOp, class CwiseBinaryOp
+  */
+template<typename CustomUnaryOp>
+inline const CwiseUnaryOp<CustomUnaryOp, const Derived>
+unaryExpr(const CustomUnaryOp& func = CustomUnaryOp()) const
+{
+  return CwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
+}
+
+/** \returns an expression of a custom coefficient-wise unary operator \a func of *this
+  *
+  * The template parameter \a CustomUnaryOp is the type of the functor
+  * of the custom unary operator.
+  *
+  * Example:
+  * \include class_CwiseUnaryOp.cpp
+  * Output: \verbinclude class_CwiseUnaryOp.out
+  *
+  * \sa class CwiseUnaryOp, class CwiseBinaryOp
+  */
+template<typename CustomViewOp>
+inline const CwiseUnaryView<CustomViewOp, const Derived>
+unaryViewExpr(const CustomViewOp& func = CustomViewOp()) const
+{
+  return CwiseUnaryView<CustomViewOp, const Derived>(derived(), func);
+}
+
+/** \returns a non const expression of the real part of \c *this.
+  *
+  * \sa imag() */
+inline NonConstRealReturnType
+real() { return derived(); }
+
+/** \returns a non const expression of the imaginary part of \c *this.
+  *
+  * \sa real() */
+inline NonConstImagReturnType
+imag() { return derived(); }
diff --git a/include/eigen3/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/include/eigen3/Eigen/src/plugins/MatrixCwiseBinaryOps.h
new file mode 100644
index 0000000..7f62149
--- /dev/null
+++ b/include/eigen3/Eigen/src/plugins/MatrixCwiseBinaryOps.h
@@ -0,0 +1,126 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This file is a base class plugin containing matrix specifics coefficient wise functions.
+
+/** \returns an expression of the Schur product (coefficient wise product) of *this and \a other
+  *
+  * Example: \include MatrixBase_cwiseProduct.cpp
+  * Output: \verbinclude MatrixBase_cwiseProduct.out
+  *
+  * \sa class CwiseBinaryOp, cwiseAbs2
+  */
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)
+cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise == operator of *this and \a other
+  *
+  * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
+  * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
+  * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
+  * isMuchSmallerThan().
+  *
+  * Example: \include MatrixBase_cwiseEqual.cpp
+  * Output: \verbinclude MatrixBase_cwiseEqual.out
+  *
+  * \sa cwiseNotEqual(), isApprox(), isMuchSmallerThan()
+  */
+template<typename OtherDerived>
+inline const CwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived>
+cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return CwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise != operator of *this and \a other
+  *
+  * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
+  * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
+  * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
+  * isMuchSmallerThan().
+  *
+  * Example: \include MatrixBase_cwiseNotEqual.cpp
+  * Output: \verbinclude MatrixBase_cwiseNotEqual.out
+  *
+  * \sa cwiseEqual(), isApprox(), isMuchSmallerThan()
+  */
+template<typename OtherDerived>
+inline const CwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived>
+cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return CwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise min of *this and \a other
+  *
+  * Example: \include MatrixBase_cwiseMin.cpp
+  * Output: \verbinclude MatrixBase_cwiseMin.out
+  *
+  * \sa class CwiseBinaryOp, max()
+  */
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>
+cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise min of *this and scalar \a other
+  *
+  * \sa class CwiseBinaryOp, min()
+  */
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const ConstantReturnType>
+cwiseMin(const Scalar &other) const
+{
+  return cwiseMin(Derived::Constant(rows(), cols(), other));
+}
+
+/** \returns an expression of the coefficient-wise max of *this and \a other
+  *
+  * Example: \include MatrixBase_cwiseMax.cpp
+  * Output: \verbinclude MatrixBase_cwiseMax.out
+  *
+  * \sa class CwiseBinaryOp, min()
+  */
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>
+cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise max of *this and scalar \a other
+  *
+  * \sa class CwiseBinaryOp, min()
+  */
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const ConstantReturnType>
+cwiseMax(const Scalar &other) const
+{
+  return cwiseMax(Derived::Constant(rows(), cols(), other));
+}
+
+
+/** \returns an expression of the coefficient-wise quotient of *this and \a other
+  *
+  * Example: \include MatrixBase_cwiseQuotient.cpp
+  * Output: \verbinclude MatrixBase_cwiseQuotient.out
+  *
+  * \sa class CwiseBinaryOp, cwiseProduct(), cwiseInverse()
+  */
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
+cwiseQuotient(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+}
diff --git a/include/eigen3/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/include/eigen3/Eigen/src/plugins/MatrixCwiseUnaryOps.h
new file mode 100644
index 0000000..0cf0640
--- /dev/null
+++ b/include/eigen3/Eigen/src/plugins/MatrixCwiseUnaryOps.h
@@ -0,0 +1,67 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This file is a base class plugin containing matrix specifics coefficient wise functions.
+
+/** \returns an expression of the coefficient-wise absolute value of \c *this
+  *
+  * Example: \include MatrixBase_cwiseAbs.cpp
+  * Output: \verbinclude MatrixBase_cwiseAbs.out
+  *
+  * \sa cwiseAbs2()
+  */
+EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
+cwiseAbs() const { return derived(); }
+
+/** \returns an expression of the coefficient-wise squared absolute value of \c *this
+  *
+  * Example: \include MatrixBase_cwiseAbs2.cpp
+  * Output: \verbinclude MatrixBase_cwiseAbs2.out
+  *
+  * \sa cwiseAbs()
+  */
+EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived>
+cwiseAbs2() const { return derived(); }
+
+/** \returns an expression of the coefficient-wise square root of *this.
+  *
+  * Example: \include MatrixBase_cwiseSqrt.cpp
+  * Output: \verbinclude MatrixBase_cwiseSqrt.out
+  *
+  * \sa cwisePow(), cwiseSquare()
+  */
+inline const CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
+cwiseSqrt() const { return derived(); }
+
+/** \returns an expression of the coefficient-wise inverse of *this.
+  *
+  * Example: \include MatrixBase_cwiseInverse.cpp
+  * Output: \verbinclude MatrixBase_cwiseInverse.out
+  *
+  * \sa cwiseProduct()
+  */
+inline const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
+cwiseInverse() const { return derived(); }
+
+/** \returns an expression of the coefficient-wise == operator of \c *this and a scalar \a s
+  *
+  * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
+  * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
+  * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
+  * isMuchSmallerThan().
+  *
+  * \sa cwiseEqual(const MatrixBase<OtherDerived> &) const
+  */
+inline const CwiseUnaryOp<std::binder1st<std::equal_to<Scalar> >, const Derived>
+cwiseEqual(const Scalar& s) const
+{
+  return CwiseUnaryOp<std::binder1st<std::equal_to<Scalar> >,const Derived>
+          (derived(), std::bind1st(std::equal_to<Scalar>(), s));
+}
diff --git a/include/eigen3/signature_of_eigen3_matrix_library b/include/eigen3/signature_of_eigen3_matrix_library
new file mode 100644
index 0000000..80aaf46
--- /dev/null
+++ b/include/eigen3/signature_of_eigen3_matrix_library
@@ -0,0 +1 @@
+This file is just there as a signature to help identify directories containing Eigen3. When writing a script looking for Eigen3, just look for this file. This is especially useful to help disambiguate with Eigen2...
diff --git a/include/eigen3/unsupported/Eigen/AdolcForward b/include/eigen3/unsupported/Eigen/AdolcForward
new file mode 100644
index 0000000..2627dec
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/AdolcForward
@@ -0,0 +1,156 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael at free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ADLOC_FORWARD
+#define EIGEN_ADLOC_FORWARD
+
+//--------------------------------------------------------------------------------
+//
+// This file provides support for adolc's adouble type in forward mode.
+// ADOL-C is a C++ automatic differentiation library,
+// see https://projects.coin-or.org/ADOL-C for more information.
+//
+// Note that the maximal number of directions is controlled by
+// the preprocessor token NUMBER_DIRECTIONS. The default is 2.
+//
+//--------------------------------------------------------------------------------
+
+#define ADOLC_TAPELESS
+#ifndef NUMBER_DIRECTIONS
+# define NUMBER_DIRECTIONS 2
+#endif
+#include <adolc/adouble.h>
+
+// adolc defines some very stupid macros:
+#if defined(malloc)
+# undef malloc
+#endif
+
+#if defined(calloc)
+# undef calloc
+#endif
+
+#if defined(realloc)
+# undef realloc
+#endif
+
+#include <Eigen/Core>
+
+namespace Eigen {
+
+/**
+  * \defgroup AdolcForward_Module Adolc forward module
+  * This module provides support for adolc's adouble type in forward mode.
+  * ADOL-C is a C++ automatic differentiation library,
+  * see https://projects.coin-or.org/ADOL-C for more information.
+  * It mainly consists in:
+  *  - a struct Eigen::NumTraits<adtl::adouble> specialization
+  *  - overloads of internal::* math function for adtl::adouble type.
+  *
+  * Note that the maximal number of directions is controlled by
+  * the preprocessor token NUMBER_DIRECTIONS. The default is 2.
+  *
+  * \code
+  * #include <unsupported/Eigen/AdolcSupport>
+  * \endcode
+  */
+  //@{
+
+} // namespace Eigen
+
+// Eigen's require a few additional functions which must be defined in the same namespace
+// than the custom scalar type own namespace
+namespace adtl {
+
+inline const adouble& conj(const adouble& x)  { return x; }
+inline const adouble& real(const adouble& x)  { return x; }
+inline adouble imag(const adouble&)    { return 0.; }
+inline adouble abs(const adouble&  x)  { return fabs(x); }
+inline adouble abs2(const adouble& x)  { return x*x; }
+
+}
+
+namespace Eigen {
+
+template<> struct NumTraits<adtl::adouble>
+    : NumTraits<double>
+{
+  typedef adtl::adouble Real;
+  typedef adtl::adouble NonInteger;
+  typedef adtl::adouble Nested;
+  enum {
+    IsComplex = 0,
+    IsInteger = 0,
+    IsSigned = 1,
+    RequireInitialization = 1,
+    ReadCost = 1,
+    AddCost = 1,
+    MulCost = 1
+  };
+};
+
+template<typename Functor> class AdolcForwardJacobian : public Functor
+{
+  typedef adtl::adouble ActiveScalar;
+public:
+
+  AdolcForwardJacobian() : Functor() {}
+  AdolcForwardJacobian(const Functor& f) : Functor(f) {}
+
+  // forward constructors
+  template<typename T0>
+  AdolcForwardJacobian(const T0& a0) : Functor(a0) {}
+  template<typename T0, typename T1>
+  AdolcForwardJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
+  template<typename T0, typename T1, typename T2>
+  AdolcForwardJacobian(const T0& a0, const T1& a1, const T1& a2) : Functor(a0, a1, a2) {}
+
+  typedef typename Functor::InputType InputType;
+  typedef typename Functor::ValueType ValueType;
+  typedef typename Functor::JacobianType JacobianType;
+
+  typedef Matrix<ActiveScalar, InputType::SizeAtCompileTime, 1> ActiveInput;
+  typedef Matrix<ActiveScalar, ValueType::SizeAtCompileTime, 1> ActiveValue;
+
+  void operator() (const InputType& x, ValueType* v, JacobianType* _jac) const
+  {
+    eigen_assert(v!=0);
+    if (!_jac)
+    {
+      Functor::operator()(x, v);
+      return;
+    }
+
+    JacobianType& jac = *_jac;
+
+    ActiveInput ax = x.template cast<ActiveScalar>();
+    ActiveValue av(jac.rows());
+
+    for (int j=0; j<jac.cols(); j++)
+      for (int i=0; i<jac.cols(); i++)
+        ax[i].setADValue(j, i==j ? 1 : 0);
+
+    Functor::operator()(ax, &av);
+
+    for (int i=0; i<jac.rows(); i++)
+    {
+      (*v)[i] = av[i].getValue();
+      for (int j=0; j<jac.cols(); j++)
+        jac.coeffRef(i,j) = av[i].getADValue(j);
+    }
+  }
+protected:
+
+};
+
+//@}
+
+}
+
+#endif // EIGEN_ADLOC_FORWARD
diff --git a/include/eigen3/unsupported/Eigen/AlignedVector3 b/include/eigen3/unsupported/Eigen/AlignedVector3
new file mode 100644
index 0000000..7b45e6c
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/AlignedVector3
@@ -0,0 +1,190 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael at free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ALIGNED_VECTOR3
+#define EIGEN_ALIGNED_VECTOR3
+
+#include <Eigen/Geometry>
+
+namespace Eigen {
+
+/**
+  * \defgroup AlignedVector3_Module Aligned vector3 module
+  *
+  * \code
+  * #include <unsupported/Eigen/AlignedVector3>
+  * \endcode
+  */
+  //@{
+
+
+/** \class AlignedVector3
+  *
+  * \brief A vectorization friendly 3D vector
+  *
+  * This class represents a 3D vector internally using a 4D vector
+  * such that vectorization can be seamlessly enabled. Of course,
+  * the same result can be achieved by directly using a 4D vector.
+  * This class makes this process simpler.
+  *
+  */
+// TODO specialize Cwise
+template<typename _Scalar> class AlignedVector3;
+
+namespace internal {
+template<typename _Scalar> struct traits<AlignedVector3<_Scalar> >
+  : traits<Matrix<_Scalar,3,1,0,4,1> >
+{
+};
+}
+
+template<typename _Scalar> class AlignedVector3
+  : public MatrixBase<AlignedVector3<_Scalar> >
+{
+    typedef Matrix<_Scalar,4,1> CoeffType;
+    CoeffType m_coeffs;
+  public:
+
+    typedef MatrixBase<AlignedVector3<_Scalar> > Base;	
+    EIGEN_DENSE_PUBLIC_INTERFACE(AlignedVector3)
+    using Base::operator*;
+
+    inline Index rows() const { return 3; }
+    inline Index cols() const { return 1; }
+
+    inline const Scalar& coeff(Index row, Index col) const
+    { return m_coeffs.coeff(row, col); }
+
+    inline Scalar& coeffRef(Index row, Index col)
+    { return m_coeffs.coeffRef(row, col); }
+
+    inline const Scalar& coeff(Index index) const
+    { return m_coeffs.coeff(index); }
+
+    inline Scalar& coeffRef(Index index)
+    { return m_coeffs.coeffRef(index);}
+
+
+    inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z)
+      : m_coeffs(x, y, z, Scalar(0))
+    {}
+
+    inline AlignedVector3(const AlignedVector3& other)
+      : Base(), m_coeffs(other.m_coeffs)
+    {}
+
+    template<typename XprType, int Size=XprType::SizeAtCompileTime>
+    struct generic_assign_selector {};
+
+    template<typename XprType> struct generic_assign_selector<XprType,4>
+    {
+      inline static void run(AlignedVector3& dest, const XprType& src)
+      {
+        dest.m_coeffs = src;
+      }
+    };
+
+    template<typename XprType> struct generic_assign_selector<XprType,3>
+    {
+      inline static void run(AlignedVector3& dest, const XprType& src)
+      {
+        dest.m_coeffs.template head<3>() = src;
+        dest.m_coeffs.w() = Scalar(0);
+      }
+    };
+
+    template<typename Derived>
+    inline explicit AlignedVector3(const MatrixBase<Derived>& other)
+    {
+      generic_assign_selector<Derived>::run(*this,other.derived());
+    }
+
+    inline AlignedVector3& operator=(const AlignedVector3& other)
+    { m_coeffs = other.m_coeffs; return *this; }
+
+
+    inline AlignedVector3 operator+(const AlignedVector3& other) const
+    { return AlignedVector3(m_coeffs + other.m_coeffs); }
+
+    inline AlignedVector3& operator+=(const AlignedVector3& other)
+    { m_coeffs += other.m_coeffs; return *this; }
+
+    inline AlignedVector3 operator-(const AlignedVector3& other) const
+    { return AlignedVector3(m_coeffs - other.m_coeffs); }
+
+    inline AlignedVector3 operator-=(const AlignedVector3& other)
+    { m_coeffs -= other.m_coeffs; return *this; }
+
+    inline AlignedVector3 operator*(const Scalar& s) const
+    { return AlignedVector3(m_coeffs * s); }
+
+    inline friend AlignedVector3 operator*(const Scalar& s,const AlignedVector3& vec)
+    { return AlignedVector3(s * vec.m_coeffs); }
+
+    inline AlignedVector3& operator*=(const Scalar& s)
+    { m_coeffs *= s; return *this; }
+
+    inline AlignedVector3 operator/(const Scalar& s) const
+    { return AlignedVector3(m_coeffs / s); }
+
+    inline AlignedVector3& operator/=(const Scalar& s)
+    { m_coeffs /= s; return *this; }
+
+    inline Scalar dot(const AlignedVector3& other) const
+    {
+      eigen_assert(m_coeffs.w()==Scalar(0));
+      eigen_assert(other.m_coeffs.w()==Scalar(0));
+      return m_coeffs.dot(other.m_coeffs);
+    }
+
+    inline void normalize()
+    {
+      m_coeffs /= norm();
+    }
+
+    inline AlignedVector3 normalized()
+    {
+      return AlignedVector3(m_coeffs / norm());
+    }
+
+    inline Scalar sum() const
+    {
+      eigen_assert(m_coeffs.w()==Scalar(0));
+      return m_coeffs.sum();
+    }
+
+    inline Scalar squaredNorm() const
+    {
+      eigen_assert(m_coeffs.w()==Scalar(0));
+      return m_coeffs.squaredNorm();
+    }
+
+    inline Scalar norm() const
+    {
+      using std::sqrt;
+      return sqrt(squaredNorm());
+    }
+
+    inline AlignedVector3 cross(const AlignedVector3& other) const
+    {
+      return AlignedVector3(m_coeffs.cross3(other.m_coeffs));
+    }
+
+    template<typename Derived>
+    inline bool isApprox(const MatrixBase<Derived>& other, RealScalar eps=NumTraits<Scalar>::dummy_precision()) const
+    {
+      return m_coeffs.template head<3>().isApprox(other,eps);
+    }
+};
+
+//@}
+
+}
+
+#endif // EIGEN_ALIGNED_VECTOR3
diff --git a/include/eigen3/unsupported/Eigen/AutoDiff b/include/eigen3/unsupported/Eigen/AutoDiff
new file mode 100644
index 0000000..abf5b7d
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/AutoDiff
@@ -0,0 +1,40 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael at free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_AUTODIFF_MODULE
+#define EIGEN_AUTODIFF_MODULE
+
+namespace Eigen {
+
+/**
+  * \defgroup AutoDiff_Module Auto Diff module
+  *
+  * This module features forward automatic differentation via a simple
+  * templated scalar type wrapper AutoDiffScalar.
+  *
+  * Warning : this should NOT be confused with numerical differentiation, which
+  * is a different method and has its own module in Eigen : \ref NumericalDiff_Module.
+  *
+  * \code
+  * #include <unsupported/Eigen/AutoDiff>
+  * \endcode
+  */
+//@{
+
+}
+
+#include "src/AutoDiff/AutoDiffScalar.h"
+// #include "src/AutoDiff/AutoDiffVector.h"
+#include "src/AutoDiff/AutoDiffJacobian.h"
+
+namespace Eigen {
+//@}
+}
+
+#endif // EIGEN_AUTODIFF_MODULE
diff --git a/include/eigen3/unsupported/Eigen/BVH b/include/eigen3/unsupported/Eigen/BVH
new file mode 100644
index 0000000..0161a54
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/BVH
@@ -0,0 +1,95 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Ilya Baran <ibaran at mit.edu>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BVH_MODULE_H
+#define EIGEN_BVH_MODULE_H
+
+#include <Eigen/Core>
+#include <Eigen/Geometry>
+#include <Eigen/StdVector>
+#include <algorithm>
+#include <queue>
+
+namespace Eigen {
+
+/**
+  * \defgroup BVH_Module BVH module
+  * \brief This module provides generic bounding volume hierarchy algorithms
+  * and reference tree implementations.
+  *
+  *
+  * \code
+  * #include <unsupported/Eigen/BVH>
+  * \endcode
+  *
+  * A bounding volume hierarchy (BVH) can accelerate many geometric queries.  This module provides a generic implementation
+  * of the two basic algorithms over a BVH: intersection of a query object against all objects in the hierarchy and minimization
+  * of a function over the objects in the hierarchy.  It also provides intersection and minimization over a cartesian product of
+  * two BVH's.  A BVH accelerates intersection by using the fact that if a query object does not intersect a volume, then it cannot
+  * intersect any object contained in that volume.  Similarly, a BVH accelerates minimization because the minimum of a function
+  * over a volume is no greater than the minimum of a function over any object contained in it.
+  *
+  * Some sample queries that can be written in terms of intersection are:
+  *   - Determine all points where a ray intersects a triangle mesh
+  *   - Given a set of points, determine which are contained in a query sphere
+  *   - Given a set of spheres, determine which contain the query point
+  *   - Given a set of disks, determine if any is completely contained in a query rectangle (represent each 2D disk as a point \f$(x,y,r)\f$
+  *     in 3D and represent the rectangle as a pyramid based on the original rectangle and shrinking in the \f$r\f$ direction)
+  *   - Given a set of points, count how many pairs are \f$d\pm\epsilon\f$ apart (done by looking at the cartesian product of the set
+  *     of points with itself)
+  *
+  * Some sample queries that can be written in terms of function minimization over a set of objects are:
+  *   - Find the intersection between a ray and a triangle mesh closest to the ray origin (function is infinite off the ray)
+  *   - Given a polyline and a query point, determine the closest point on the polyline to the query
+  *   - Find the diameter of a point cloud (done by looking at the cartesian product and using negative distance as the function)
+  *   - Determine how far two meshes are from colliding (this is also a cartesian product query)
+  *
+  * This implementation decouples the basic algorithms both from the type of hierarchy (and the types of the bounding volumes) and
+  * from the particulars of the query.  To enable abstraction from the BVH, the BVH is required to implement a generic mechanism
+  * for traversal.  To abstract from the query, the query is responsible for keeping track of results.
+  *
+  * To be used in the algorithms, a hierarchy must implement the following traversal mechanism (see KdBVH for a sample implementation): \code
+      typedef Volume  //the type of bounding volume
+      typedef Object  //the type of object in the hierarchy
+      typedef Index   //a reference to a node in the hierarchy--typically an int or a pointer
+      typedef VolumeIterator //an iterator type over node children--returns Index
+      typedef ObjectIterator //an iterator over object (leaf) children--returns const Object &
+      Index getRootIndex() const //returns the index of the hierarchy root
+      const Volume &getVolume(Index index) const //returns the bounding volume of the node at given index
+      void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd,
+                      ObjectIterator &outOBegin, ObjectIterator &outOEnd) const
+      //getChildren takes a node index and makes [outVBegin, outVEnd) range over its node children
+      //and [outOBegin, outOEnd) range over its object children
+    \endcode
+  *
+  * To use the hierarchy, call BVIntersect or BVMinimize, passing it a BVH (or two, for cartesian product) and a minimizer or intersector.
+  * For an intersection query on a single BVH, the intersector encapsulates the query and must provide two functions:
+  * \code
+      bool intersectVolume(const Volume &volume) //returns true if the query intersects the volume
+      bool intersectObject(const Object &object) //returns true if the intersection search should terminate immediately
+    \endcode
+  * The guarantee that BVIntersect provides is that intersectObject will be called on every object whose bounding volume
+  * intersects the query (but possibly on other objects too) unless the search is terminated prematurely.  It is the
+  * responsibility of the intersectObject function to keep track of the results in whatever manner is appropriate.
+  * The cartesian product intersection and the BVMinimize queries are similar--see their individual documentation.
+  *
+  * The following is a simple but complete example for how to use the BVH to accelerate the search for a closest red-blue point pair:
+  * \include BVH_Example.cpp
+  * Output: \verbinclude BVH_Example.out
+  */
+}
+
+//@{
+
+#include "src/BVH/BVAlgorithms.h"
+#include "src/BVH/KdBVH.h"
+
+//@}
+
+#endif // EIGEN_BVH_MODULE_H
diff --git a/include/eigen3/unsupported/Eigen/FFT b/include/eigen3/unsupported/Eigen/FFT
new file mode 100644
index 0000000..2c45b39
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/FFT
@@ -0,0 +1,418 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. 
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FFT_H
+#define EIGEN_FFT_H
+
+#include <complex>
+#include <vector>
+#include <map>
+#include <Eigen/Core>
+
+
+/**
+  * \defgroup FFT_Module Fast Fourier Transform module
+  *
+  * \code
+  * #include <unsupported/Eigen/FFT>
+  * \endcode
+  *
+  * This module provides Fast Fourier transformation, with a configurable backend
+  * implementation.
+  *
+  * The default implementation is based on kissfft. It is a small, free, and
+  * reasonably efficient default.
+  *
+  * There are currently two implementation backend:
+  *
+  * - fftw (http://www.fftw.org) : faster, GPL -- incompatible with Eigen in LGPL form, bigger code size.
+  * - MKL (http://en.wikipedia.org/wiki/Math_Kernel_Library) : fastest, commercial -- may be incompatible with Eigen in GPL form.
+  *
+  * \section FFTDesign Design
+  *
+  * The following design decisions were made concerning scaling and
+  * half-spectrum for real FFT.
+  *
+  * The intent is to facilitate generic programming and ease migrating code
+  * from  Matlab/octave.
+  * We think the default behavior of Eigen/FFT should favor correctness and
+  * generality over speed. Of course, the caller should be able to "opt-out" from this
+  * behavior and get the speed increase if they want it.
+  *
+  * 1) %Scaling:
+  * Other libraries (FFTW,IMKL,KISSFFT)  do not perform scaling, so there
+  * is a constant gain incurred after the forward&inverse transforms , so 
+  * IFFT(FFT(x)) = Kx;  this is done to avoid a vector-by-value multiply.  
+  * The downside is that algorithms that worked correctly in Matlab/octave 
+  * don't behave the same way once implemented in C++.
+  *
+  * How Eigen/FFT differs: invertible scaling is performed so IFFT( FFT(x) ) = x. 
+  *
+  * 2) Real FFT half-spectrum
+  * Other libraries use only half the frequency spectrum (plus one extra 
+  * sample for the Nyquist bin) for a real FFT, the other half is the 
+  * conjugate-symmetric of the first half.  This saves them a copy and some 
+  * memory.  The downside is the caller needs to have special logic for the 
+  * number of bins in complex vs real.
+  *
+  * How Eigen/FFT differs: The full spectrum is returned from the forward 
+  * transform.  This facilitates generic template programming by obviating 
+  * separate specializations for real vs complex.  On the inverse
+  * transform, only half the spectrum is actually used if the output type is real.
+  */
+ 
+
+#ifdef EIGEN_FFTW_DEFAULT
+// FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
+#  include <fftw3.h>
+#  include "src/FFT/ei_fftw_impl.h"
+   namespace Eigen {
+     //template <typename T> typedef struct internal::fftw_impl  default_fft_impl; this does not work
+     template <typename T> struct default_fft_impl : public internal::fftw_impl<T> {};
+   }
+#elif defined EIGEN_MKL_DEFAULT
+// TODO 
+// intel Math Kernel Library: fastest, commercial -- may be incompatible with Eigen in GPL form
+#  include "src/FFT/ei_imklfft_impl.h"
+   namespace Eigen {
+     template <typename T> struct default_fft_impl : public internal::imklfft_impl {};
+   }
+#else
+// internal::kissfft_impl:  small, free, reasonably efficient default, derived from kissfft
+//
+# include "src/FFT/ei_kissfft_impl.h"
+  namespace Eigen {
+     template <typename T> 
+       struct default_fft_impl : public internal::kissfft_impl<T> {};
+  }
+#endif
+
+namespace Eigen {
+
+ 
+// 
+template<typename T_SrcMat,typename T_FftIfc> struct fft_fwd_proxy;
+template<typename T_SrcMat,typename T_FftIfc> struct fft_inv_proxy;
+
+namespace internal {
+template<typename T_SrcMat,typename T_FftIfc>
+struct traits< fft_fwd_proxy<T_SrcMat,T_FftIfc> >
+{
+  typedef typename T_SrcMat::PlainObject ReturnType;
+};
+template<typename T_SrcMat,typename T_FftIfc>
+struct traits< fft_inv_proxy<T_SrcMat,T_FftIfc> >
+{
+  typedef typename T_SrcMat::PlainObject ReturnType;
+};
+}
+
+template<typename T_SrcMat,typename T_FftIfc> 
+struct fft_fwd_proxy
+ : public ReturnByValue<fft_fwd_proxy<T_SrcMat,T_FftIfc> >
+{
+  typedef DenseIndex Index;
+
+  fft_fwd_proxy(const T_SrcMat& src,T_FftIfc & fft, Index nfft) : m_src(src),m_ifc(fft), m_nfft(nfft) {}
+
+  template<typename T_DestMat> void evalTo(T_DestMat& dst) const;
+
+  Index rows() const { return m_src.rows(); }
+  Index cols() const { return m_src.cols(); }
+protected:
+  const T_SrcMat & m_src;
+  T_FftIfc & m_ifc;
+  Index m_nfft;
+private:
+  fft_fwd_proxy& operator=(const fft_fwd_proxy&);
+};
+
+template<typename T_SrcMat,typename T_FftIfc> 
+struct fft_inv_proxy
+ : public ReturnByValue<fft_inv_proxy<T_SrcMat,T_FftIfc> >
+{
+  typedef DenseIndex Index;
+
+  fft_inv_proxy(const T_SrcMat& src,T_FftIfc & fft, Index nfft) : m_src(src),m_ifc(fft), m_nfft(nfft) {}
+
+  template<typename T_DestMat> void evalTo(T_DestMat& dst) const;
+
+  Index rows() const { return m_src.rows(); }
+  Index cols() const { return m_src.cols(); }
+protected:
+  const T_SrcMat & m_src;
+  T_FftIfc & m_ifc;
+  Index m_nfft;
+private:
+  fft_inv_proxy& operator=(const fft_inv_proxy&);
+};
+
+
+template <typename T_Scalar,
+         typename T_Impl=default_fft_impl<T_Scalar> >
+class FFT
+{
+  public:
+    typedef T_Impl impl_type;
+    typedef DenseIndex Index;
+    typedef typename impl_type::Scalar Scalar;
+    typedef typename impl_type::Complex Complex;
+
+    enum Flag {
+      Default=0, // goof proof
+      Unscaled=1,
+      HalfSpectrum=2,
+      // SomeOtherSpeedOptimization=4
+      Speedy=32767
+    };
+
+    FFT( const impl_type & impl=impl_type() , Flag flags=Default ) :m_impl(impl),m_flag(flags) { }
+
+    inline
+    bool HasFlag(Flag f) const { return (m_flag & (int)f) == f;}
+
+    inline
+    void SetFlag(Flag f) { m_flag |= (int)f;}
+
+    inline
+    void ClearFlag(Flag f) { m_flag &= (~(int)f);}
+
+    inline
+    void fwd( Complex * dst, const Scalar * src, Index nfft)
+    {
+        m_impl.fwd(dst,src,static_cast<int>(nfft));
+        if ( HasFlag(HalfSpectrum) == false)
+          ReflectSpectrum(dst,nfft);
+    }
+
+    inline
+    void fwd( Complex * dst, const Complex * src, Index nfft)
+    {
+        m_impl.fwd(dst,src,static_cast<int>(nfft));
+    }
+
+    /*
+    inline 
+    void fwd2(Complex * dst, const Complex * src, int n0,int n1)
+    {
+      m_impl.fwd2(dst,src,n0,n1);
+    }
+    */
+
+    template <typename _Input>
+    inline
+    void fwd( std::vector<Complex> & dst, const std::vector<_Input> & src) 
+    {
+      if ( NumTraits<_Input>::IsComplex == 0 && HasFlag(HalfSpectrum) )
+        dst.resize( (src.size()>>1)+1); // half the bins + Nyquist bin
+      else
+        dst.resize(src.size());
+      fwd(&dst[0],&src[0],src.size());
+    }
+
+    template<typename InputDerived, typename ComplexDerived>
+    inline
+    void fwd( MatrixBase<ComplexDerived> & dst, const MatrixBase<InputDerived> & src, Index nfft=-1)
+    {
+      typedef typename ComplexDerived::Scalar dst_type;
+      typedef typename InputDerived::Scalar src_type;
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(InputDerived)
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(ComplexDerived)
+      EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(ComplexDerived,InputDerived) // size at compile-time
+      EIGEN_STATIC_ASSERT((internal::is_same<dst_type, Complex>::value),
+            YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+      EIGEN_STATIC_ASSERT(int(InputDerived::Flags)&int(ComplexDerived::Flags)&DirectAccessBit,
+            THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES)
+
+      if (nfft<1)
+        nfft = src.size();
+
+      if ( NumTraits< src_type >::IsComplex == 0 && HasFlag(HalfSpectrum) )
+        dst.derived().resize( (nfft>>1)+1);
+      else
+        dst.derived().resize(nfft);
+
+      if ( src.innerStride() != 1 || src.size() < nfft ) {
+        Matrix<src_type,1,Dynamic> tmp;
+        if (src.size()<nfft) {
+          tmp.setZero(nfft);
+          tmp.block(0,0,src.size(),1 ) = src;
+        }else{
+          tmp = src;
+        }
+        fwd( &dst[0],&tmp[0],nfft );
+      }else{
+        fwd( &dst[0],&src[0],nfft );
+      }
+    }
+ 
+    template<typename InputDerived>
+    inline
+    fft_fwd_proxy< MatrixBase<InputDerived>, FFT<T_Scalar,T_Impl> >
+    fwd( const MatrixBase<InputDerived> & src, Index nfft=-1)
+    {
+      return fft_fwd_proxy< MatrixBase<InputDerived> ,FFT<T_Scalar,T_Impl> >( src, *this,nfft );
+    }
+
+    template<typename InputDerived>
+    inline
+    fft_inv_proxy< MatrixBase<InputDerived>, FFT<T_Scalar,T_Impl> >
+    inv( const MatrixBase<InputDerived> & src, Index nfft=-1)
+    {
+      return  fft_inv_proxy< MatrixBase<InputDerived> ,FFT<T_Scalar,T_Impl> >( src, *this,nfft );
+    }
+
+    inline
+    void inv( Complex * dst, const Complex * src, Index nfft)
+    {
+      m_impl.inv( dst,src,static_cast<int>(nfft) );
+      if ( HasFlag( Unscaled ) == false)
+        scale(dst,Scalar(1./nfft),nfft); // scale the time series
+    }
+
+    inline
+    void inv( Scalar * dst, const Complex * src, Index nfft)
+    {
+      m_impl.inv( dst,src,static_cast<int>(nfft) );
+      if ( HasFlag( Unscaled ) == false)
+        scale(dst,Scalar(1./nfft),nfft); // scale the time series
+    }
+
+    template<typename OutputDerived, typename ComplexDerived>
+    inline
+    void inv( MatrixBase<OutputDerived> & dst, const MatrixBase<ComplexDerived> & src, Index nfft=-1)
+    {
+      typedef typename ComplexDerived::Scalar src_type;
+      typedef typename OutputDerived::Scalar dst_type;
+      const bool realfft= (NumTraits<dst_type>::IsComplex == 0);
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OutputDerived)
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(ComplexDerived)
+      EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(ComplexDerived,OutputDerived) // size at compile-time
+      EIGEN_STATIC_ASSERT((internal::is_same<src_type, Complex>::value),
+            YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+      EIGEN_STATIC_ASSERT(int(OutputDerived::Flags)&int(ComplexDerived::Flags)&DirectAccessBit,
+            THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES)
+
+      if (nfft<1) { //automatic FFT size determination
+        if ( realfft && HasFlag(HalfSpectrum) ) 
+          nfft = 2*(src.size()-1); //assume even fft size
+        else
+          nfft = src.size();
+      }
+      dst.derived().resize( nfft );
+
+      // check for nfft that does not fit the input data size
+      Index resize_input= ( realfft && HasFlag(HalfSpectrum) )
+        ? ( (nfft/2+1) - src.size() )
+        : ( nfft - src.size() );
+
+      if ( src.innerStride() != 1 || resize_input ) {
+        // if the vector is strided, then we need to copy it to a packed temporary
+        Matrix<src_type,1,Dynamic> tmp;
+        if ( resize_input ) {
+          size_t ncopy = (std::min)(src.size(),src.size() + resize_input);
+          tmp.setZero(src.size() + resize_input);
+          if ( realfft && HasFlag(HalfSpectrum) ) {
+            // pad at the Nyquist bin
+            tmp.head(ncopy) = src.head(ncopy);
+            tmp(ncopy-1) = real(tmp(ncopy-1)); // enforce real-only Nyquist bin
+          }else{
+            size_t nhead,ntail;
+            nhead = 1+ncopy/2-1; // range  [0:pi)
+            ntail = ncopy/2-1;   // range (-pi:0)
+            tmp.head(nhead) = src.head(nhead);
+            tmp.tail(ntail) = src.tail(ntail);
+            if (resize_input<0) { //shrinking -- create the Nyquist bin as the average of the two bins that fold into it
+              tmp(nhead) = ( src(nfft/2) + src( src.size() - nfft/2 ) )*src_type(.5);
+            }else{ // expanding -- split the old Nyquist bin into two halves
+              tmp(nhead) = src(nhead) * src_type(.5);
+              tmp(tmp.size()-nhead) = tmp(nhead);
+            }
+          }
+        }else{
+          tmp = src;
+        }
+        inv( &dst[0],&tmp[0], nfft);
+      }else{
+        inv( &dst[0],&src[0], nfft);
+      }
+    }
+
+    template <typename _Output>
+    inline
+    void inv( std::vector<_Output> & dst, const std::vector<Complex> & src,Index nfft=-1)
+    {
+      if (nfft<1)
+        nfft = ( NumTraits<_Output>::IsComplex == 0 && HasFlag(HalfSpectrum) ) ? 2*(src.size()-1) : src.size();
+      dst.resize( nfft );
+      inv( &dst[0],&src[0],nfft);
+    }
+
+
+    /*
+    // TODO: multi-dimensional FFTs
+    inline 
+    void inv2(Complex * dst, const Complex * src, int n0,int n1)
+    {
+      m_impl.inv2(dst,src,n0,n1);
+      if ( HasFlag( Unscaled ) == false)
+          scale(dst,1./(n0*n1),n0*n1);
+    }
+  */
+
+    inline
+    impl_type & impl() {return m_impl;}
+  private:
+
+    template <typename T_Data>
+    inline
+    void scale(T_Data * x,Scalar s,Index nx)
+    {
+#if 1
+      for (int k=0;k<nx;++k)
+        *x++ *= s;
+#else
+      if ( ((ptrdiff_t)x) & 15 )
+        Matrix<T_Data, Dynamic, 1>::Map(x,nx) *= s;
+      else
+        Matrix<T_Data, Dynamic, 1>::MapAligned(x,nx) *= s;
+         //Matrix<T_Data, Dynamic, Dynamic>::Map(x,nx) * s;
+#endif  
+    }
+
+    inline
+    void ReflectSpectrum(Complex * freq, Index nfft)
+    {
+      // create the implicit right-half spectrum (conjugate-mirror of the left-half)
+      Index nhbins=(nfft>>1)+1;
+      for (Index k=nhbins;k < nfft; ++k )
+        freq[k] = conj(freq[nfft-k]);
+    }
+
+    impl_type m_impl;
+    int m_flag;
+};
+
+template<typename T_SrcMat,typename T_FftIfc> 
+template<typename T_DestMat> inline 
+void fft_fwd_proxy<T_SrcMat,T_FftIfc>::evalTo(T_DestMat& dst) const
+{
+    m_ifc.fwd( dst, m_src, m_nfft);
+}
+
+template<typename T_SrcMat,typename T_FftIfc> 
+template<typename T_DestMat> inline 
+void fft_inv_proxy<T_SrcMat,T_FftIfc>::evalTo(T_DestMat& dst) const
+{
+    m_ifc.inv( dst, m_src, m_nfft);
+}
+
+}
+#endif
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/include/eigen3/unsupported/Eigen/IterativeSolvers b/include/eigen3/unsupported/Eigen/IterativeSolvers
new file mode 100644
index 0000000..aa15403
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/IterativeSolvers
@@ -0,0 +1,45 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael at free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ITERATIVE_SOLVERS_MODULE_H
+#define EIGEN_ITERATIVE_SOLVERS_MODULE_H
+
+#include <Eigen/Sparse>
+
+/**
+  * \defgroup IterativeSolvers_Module Iterative solvers module
+  * This module aims to provide various iterative linear and non linear solver algorithms.
+  * It currently provides:
+  *  - a constrained conjugate gradient
+  *  - a Householder GMRES implementation
+  * \code
+  * #include <unsupported/Eigen/IterativeSolvers>
+  * \endcode
+  */
+//@{
+
+#include "../../Eigen/src/misc/Solve.h"
+#include "../../Eigen/src/misc/SparseSolve.h"
+
+#ifndef EIGEN_MPL2_ONLY
+#include "src/IterativeSolvers/IterationController.h"
+#include "src/IterativeSolvers/ConstrainedConjGrad.h"
+#endif
+
+#include "src/IterativeSolvers/IncompleteLU.h"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/Householder"
+#include "src/IterativeSolvers/GMRES.h"
+#include "src/IterativeSolvers/IncompleteCholesky.h"
+//#include "src/IterativeSolvers/SSORPreconditioner.h"
+#include "src/IterativeSolvers/MINRES.h"
+
+//@}
+
+#endif // EIGEN_ITERATIVE_SOLVERS_MODULE_H
diff --git a/include/eigen3/unsupported/Eigen/KroneckerProduct b/include/eigen3/unsupported/Eigen/KroneckerProduct
new file mode 100644
index 0000000..c932c06
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/KroneckerProduct
@@ -0,0 +1,34 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_KRONECKER_PRODUCT_MODULE_H
+#define EIGEN_KRONECKER_PRODUCT_MODULE_H
+
+#include "../../Eigen/Core"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+  * \defgroup KroneckerProduct_Module KroneckerProduct module
+  *
+  * This module contains an experimental Kronecker product implementation.
+  *
+  * \code
+  * #include <Eigen/KroneckerProduct>
+  * \endcode
+  */
+
+} // namespace Eigen
+
+#include "src/KroneckerProduct/KroneckerTensorProduct.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_KRONECKER_PRODUCT_MODULE_H
diff --git a/include/eigen3/unsupported/Eigen/LevenbergMarquardt b/include/eigen3/unsupported/Eigen/LevenbergMarquardt
new file mode 100644
index 0000000..0fe2680
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/LevenbergMarquardt
@@ -0,0 +1,45 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel at freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LEVENBERGMARQUARDT_MODULE
+#define EIGEN_LEVENBERGMARQUARDT_MODULE
+
+// #include <vector>
+
+#include <Eigen/Core>
+#include <Eigen/Jacobi>
+#include <Eigen/QR>
+#include <unsupported/Eigen/NumericalDiff> 
+
+#include <Eigen/SparseQR>
+
+/**
+  * \defgroup LevenbergMarquardt_Module Levenberg-Marquardt module
+  *
+  * \code
+  * #include </Eigen/LevenbergMarquardt>
+  * \endcode
+  *
+  * 
+  */
+
+#include "Eigen/SparseCore"
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+#include "src/LevenbergMarquardt/LMqrsolv.h"
+#include "src/LevenbergMarquardt/LMcovar.h"
+#include "src/LevenbergMarquardt/LMpar.h"
+
+#endif
+
+#include "src/LevenbergMarquardt/LevenbergMarquardt.h"
+#include "src/LevenbergMarquardt/LMonestep.h"
+
+
+#endif // EIGEN_LEVENBERGMARQUARDT_MODULE
diff --git a/include/eigen3/unsupported/Eigen/MPRealSupport b/include/eigen3/unsupported/Eigen/MPRealSupport
new file mode 100644
index 0000000..d4b0364
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/MPRealSupport
@@ -0,0 +1,203 @@
+// This file is part of a joint effort between Eigen, a lightweight C++ template library
+// for linear algebra, and MPFR C++, a C++ interface to MPFR library (http://www.holoborodko.com/pavel/)
+//
+// Copyright (C) 2010-2012 Pavel Holoborodko <pavel at holoborodko.com>
+// Copyright (C) 2010 Konstantin Holoborodko <konstantin at holoborodko.com>
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MPREALSUPPORT_MODULE_H
+#define EIGEN_MPREALSUPPORT_MODULE_H
+
+#include <Eigen/Core>
+#include <mpreal.h>
+
+namespace Eigen {
+  
+/**
+  * \defgroup MPRealSupport_Module MPFRC++ Support module
+  * \code
+  * #include <Eigen/MPRealSupport>
+  * \endcode
+  *
+  * This module provides support for multi precision floating point numbers
+  * via the <a href="http://www.holoborodko.com/pavel/mpfr">MPFR C++</a>
+  * library which itself is built upon <a href="http://www.mpfr.org/">MPFR</a>/<a href="http://gmplib.org/">GMP</a>.
+  *
+  * You can find a copy of MPFR C++ that is known to be compatible in the unsupported/test/mpreal folder.
+  *
+  * Here is an example:
+  *
+\code
+#include <iostream>
+#include <Eigen/MPRealSupport>
+#include <Eigen/LU>
+using namespace mpfr;
+using namespace Eigen;
+int main()
+{
+  // set precision to 256 bits (double has only 53 bits)
+  mpreal::set_default_prec(256);
+  // Declare matrix and vector types with multi-precision scalar type
+  typedef Matrix<mpreal,Dynamic,Dynamic>  MatrixXmp;
+  typedef Matrix<mpreal,Dynamic,1>        VectorXmp;
+
+  MatrixXmp A = MatrixXmp::Random(100,100);
+  VectorXmp b = VectorXmp::Random(100);
+
+  // Solve Ax=b using LU
+  VectorXmp x = A.lu().solve(b);
+  std::cout << "relative error: " << (A*x - b).norm() / b.norm() << std::endl;
+  return 0;
+}
+\endcode
+  *
+  */
+	
+  template<> struct NumTraits<mpfr::mpreal>
+    : GenericNumTraits<mpfr::mpreal>
+  {
+    enum {
+      IsInteger = 0,
+      IsSigned = 1,
+      IsComplex = 0,
+      RequireInitialization = 1,
+      ReadCost = 10,
+      AddCost = 10,
+      MulCost = 40
+    };
+
+    typedef mpfr::mpreal Real;
+    typedef mpfr::mpreal NonInteger;
+    
+    inline static Real highest   (long Precision = mpfr::mpreal::get_default_prec())  { return  mpfr::maxval(Precision); }
+    inline static Real lowest    (long Precision = mpfr::mpreal::get_default_prec())  { return -mpfr::maxval(Precision); }
+
+    // Constants
+    inline static Real Pi       (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_pi(Precision);        }
+    inline static Real Euler    (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_euler(Precision);     }
+    inline static Real Log2     (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_log2(Precision);      }
+    inline static Real Catalan  (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_catalan(Precision);   }
+
+    inline static Real epsilon  (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::machine_epsilon(Precision); }
+    inline static Real epsilon  (const Real& x)                                         {    return mpfr::machine_epsilon(x); }
+
+    inline static Real dummy_precision()   
+    { 
+        unsigned int weak_prec = ((mpfr::mpreal::get_default_prec()-1) * 90) / 100;
+        return mpfr::machine_epsilon(weak_prec);
+    }
+  };
+
+  namespace internal {
+
+  template<> inline mpfr::mpreal random<mpfr::mpreal>()
+  {
+    return mpfr::random();
+  }
+
+  template<> inline mpfr::mpreal random<mpfr::mpreal>(const mpfr::mpreal& a, const mpfr::mpreal& b)
+  {
+    return a + (b-a) * random<mpfr::mpreal>();
+  }
+
+  inline bool isMuchSmallerThan(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps)
+  {
+    return mpfr::abs(a) <= mpfr::abs(b) * eps;
+  }
+
+  inline bool isApprox(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps)
+  {
+    return mpfr::isEqualFuzzy(a,b,eps);
+  }
+
+  inline bool isApproxOrLessThan(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps)
+  {
+    return a <= b || mpfr::isEqualFuzzy(a,b,eps);
+  }
+
+  template<> inline long double cast<mpfr::mpreal,long double>(const mpfr::mpreal& x)
+  { return x.toLDouble(); }
+
+  template<> inline double cast<mpfr::mpreal,double>(const mpfr::mpreal& x)
+  { return x.toDouble(); }
+
+  template<> inline long cast<mpfr::mpreal,long>(const mpfr::mpreal& x)
+  { return x.toLong(); }
+
+  template<> inline int cast<mpfr::mpreal,int>(const mpfr::mpreal& x)
+  { return int(x.toLong()); }
+
+  // Specialize GEBP kernel and traits for mpreal (no need for peeling, nor complicated stuff)
+  // This also permits to directly call mpfr's routines and avoid many temporaries produced by mpreal
+    template<>
+    class gebp_traits<mpfr::mpreal, mpfr::mpreal, false, false>
+    {
+    public:
+      typedef mpfr::mpreal ResScalar;
+      enum {
+        nr = 2, // must be 2 for proper packing...
+        mr = 1,
+        WorkSpaceFactor = nr,
+        LhsProgress = 1,
+        RhsProgress = 1
+      };
+    };
+
+    template<typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+    struct gebp_kernel<mpfr::mpreal,mpfr::mpreal,Index,mr,nr,ConjugateLhs,ConjugateRhs>
+    {
+      typedef mpfr::mpreal mpreal;
+
+      EIGEN_DONT_INLINE
+      void operator()(mpreal* res, Index resStride, const mpreal* blockA, const mpreal* blockB, Index rows, Index depth, Index cols, mpreal alpha,
+                      Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, mpreal* /*unpackedB*/ = 0)
+      {
+        mpreal acc1, acc2, tmp;
+        
+        if(strideA==-1) strideA = depth;
+        if(strideB==-1) strideB = depth;
+
+        for(Index j=0; j<cols; j+=nr)
+        {
+          Index actual_nr = (std::min<Index>)(nr,cols-j);
+          mpreal *C1 = res + j*resStride;
+          mpreal *C2 = res + (j+1)*resStride;
+          for(Index i=0; i<rows; i++)
+          {
+            mpreal *B = const_cast<mpreal*>(blockB) + j*strideB + offsetB*actual_nr;
+            mpreal *A = const_cast<mpreal*>(blockA) + i*strideA + offsetA;
+            acc1 = 0;
+            acc2 = 0;
+            for(Index k=0; k<depth; k++)
+            {
+              mpfr_mul(tmp.mpfr_ptr(), A[k].mpfr_ptr(), B[0].mpfr_ptr(), mpreal::get_default_rnd());
+              mpfr_add(acc1.mpfr_ptr(), acc1.mpfr_ptr(), tmp.mpfr_ptr(),  mpreal::get_default_rnd());
+              
+              if(actual_nr==2) {
+                mpfr_mul(tmp.mpfr_ptr(), A[k].mpfr_ptr(), B[1].mpfr_ptr(), mpreal::get_default_rnd());
+                mpfr_add(acc2.mpfr_ptr(), acc2.mpfr_ptr(), tmp.mpfr_ptr(),  mpreal::get_default_rnd());
+              }
+              
+              B+=actual_nr;
+            }
+            
+            mpfr_mul(acc1.mpfr_ptr(), acc1.mpfr_ptr(), alpha.mpfr_ptr(), mpreal::get_default_rnd());
+            mpfr_add(C1[i].mpfr_ptr(), C1[i].mpfr_ptr(), acc1.mpfr_ptr(),  mpreal::get_default_rnd());
+            
+            if(actual_nr==2) {
+              mpfr_mul(acc2.mpfr_ptr(), acc2.mpfr_ptr(), alpha.mpfr_ptr(), mpreal::get_default_rnd());
+              mpfr_add(C2[i].mpfr_ptr(), C2[i].mpfr_ptr(), acc2.mpfr_ptr(),  mpreal::get_default_rnd());
+            }
+          }
+        }
+      }
+    };
+
+  } // end namespace internal
+}
+
+#endif // EIGEN_MPREALSUPPORT_MODULE_H
diff --git a/include/eigen3/unsupported/Eigen/MatrixFunctions b/include/eigen3/unsupported/Eigen/MatrixFunctions
new file mode 100644
index 0000000..0991817
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/MatrixFunctions
@@ -0,0 +1,447 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Jitse Niesen <jitse at maths.leeds.ac.uk>
+// Copyright (C) 2012 Chen-Pang He <jdh8 at ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_FUNCTIONS
+#define EIGEN_MATRIX_FUNCTIONS
+
+#include <cfloat>
+#include <list>
+#include <functional>
+#include <iterator>
+
+#include <Eigen/Core>
+#include <Eigen/LU>
+#include <Eigen/Eigenvalues>
+
+/**
+  * \defgroup MatrixFunctions_Module Matrix functions module
+  * \brief This module aims to provide various methods for the computation of
+  * matrix functions. 
+  *
+  * To use this module, add 
+  * \code
+  * #include <unsupported/Eigen/MatrixFunctions>
+  * \endcode
+  * at the start of your source file.
+  *
+  * This module defines the following MatrixBase methods.
+  *  - \ref matrixbase_cos "MatrixBase::cos()", for computing the matrix cosine
+  *  - \ref matrixbase_cosh "MatrixBase::cosh()", for computing the matrix hyperbolic cosine
+  *  - \ref matrixbase_exp "MatrixBase::exp()", for computing the matrix exponential
+  *  - \ref matrixbase_log "MatrixBase::log()", for computing the matrix logarithm
+  *  - \ref matrixbase_pow "MatrixBase::pow()", for computing the matrix power
+  *  - \ref matrixbase_matrixfunction "MatrixBase::matrixFunction()", for computing general matrix functions
+  *  - \ref matrixbase_sin "MatrixBase::sin()", for computing the matrix sine
+  *  - \ref matrixbase_sinh "MatrixBase::sinh()", for computing the matrix hyperbolic sine
+  *  - \ref matrixbase_sqrt "MatrixBase::sqrt()", for computing the matrix square root
+  *
+  * These methods are the main entry points to this module. 
+  *
+  * %Matrix functions are defined as follows.  Suppose that \f$ f \f$
+  * is an entire function (that is, a function on the complex plane
+  * that is everywhere complex differentiable).  Then its Taylor
+  * series
+  * \f[ f(0) + f'(0) x + \frac{f''(0)}{2} x^2 + \frac{f'''(0)}{3!} x^3 + \cdots \f]
+  * converges to \f$ f(x) \f$. In this case, we can define the matrix
+  * function by the same series:
+  * \f[ f(M) = f(0) + f'(0) M + \frac{f''(0)}{2} M^2 + \frac{f'''(0)}{3!} M^3 + \cdots \f]
+  *
+  */
+
+#include "src/MatrixFunctions/MatrixExponential.h"
+#include "src/MatrixFunctions/MatrixFunction.h"
+#include "src/MatrixFunctions/MatrixSquareRoot.h"
+#include "src/MatrixFunctions/MatrixLogarithm.h"
+#include "src/MatrixFunctions/MatrixPower.h"
+
+
+/** 
+\page matrixbaseextra_page
+\ingroup MatrixFunctions_Module
+
+\section matrixbaseextra MatrixBase methods defined in the MatrixFunctions module
+
+The remainder of the page documents the following MatrixBase methods
+which are defined in the MatrixFunctions module.
+
+
+
+\subsection matrixbase_cos MatrixBase::cos()
+
+Compute the matrix cosine.
+
+\code
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cos() const
+\endcode
+
+\param[in]  M  a square matrix.
+\returns  expression representing \f$ \cos(M) \f$.
+
+This function calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::cos().
+
+\sa \ref matrixbase_sin "sin()" for an example.
+
+
+
+\subsection matrixbase_cosh MatrixBase::cosh()
+
+Compute the matrix hyberbolic cosine.
+
+\code
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cosh() const
+\endcode
+
+\param[in]  M  a square matrix.
+\returns  expression representing \f$ \cosh(M) \f$
+
+This function calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::cosh().
+
+\sa \ref matrixbase_sinh "sinh()" for an example.
+
+
+
+\subsection matrixbase_exp MatrixBase::exp()
+
+Compute the matrix exponential.
+
+\code
+const MatrixExponentialReturnValue<Derived> MatrixBase<Derived>::exp() const
+\endcode
+
+\param[in]  M  matrix whose exponential is to be computed.
+\returns    expression representing the matrix exponential of \p M.
+
+The matrix exponential of \f$ M \f$ is defined by
+\f[ \exp(M) = \sum_{k=0}^\infty \frac{M^k}{k!}. \f]
+The matrix exponential can be used to solve linear ordinary
+differential equations: the solution of \f$ y' = My \f$ with the
+initial condition \f$ y(0) = y_0 \f$ is given by
+\f$ y(t) = \exp(M) y_0 \f$.
+
+The cost of the computation is approximately \f$ 20 n^3 \f$ for
+matrices of size \f$ n \f$. The number 20 depends weakly on the
+norm of the matrix.
+
+The matrix exponential is computed using the scaling-and-squaring
+method combined with Padé approximation. The matrix is first
+rescaled, then the exponential of the reduced matrix is computed
+approximant, and then the rescaling is undone by repeated
+squaring. The degree of the Padé approximant is chosen such
+that the approximation error is less than the round-off
+error. However, errors may accumulate during the squaring phase.
+
+Details of the algorithm can be found in: Nicholas J. Higham, "The
+scaling and squaring method for the matrix exponential revisited,"
+<em>SIAM J. %Matrix Anal. Applic.</em>, <b>26</b>:1179–1193,
+2005.
+
+Example: The following program checks that
+\f[ \exp \left[ \begin{array}{ccc}
+      0 & \frac14\pi & 0 \\
+      -\frac14\pi & 0 & 0 \\
+      0 & 0 & 0
+    \end{array} \right] = \left[ \begin{array}{ccc}
+      \frac12\sqrt2 & -\frac12\sqrt2 & 0 \\
+      \frac12\sqrt2 & \frac12\sqrt2 & 0 \\
+      0 & 0 & 1
+    \end{array} \right]. \f]
+This corresponds to a rotation of \f$ \frac14\pi \f$ radians around
+the z-axis.
+
+\include MatrixExponential.cpp
+Output: \verbinclude MatrixExponential.out
+
+\note \p M has to be a matrix of \c float, \c double, \c long double
+\c complex<float>, \c complex<double>, or \c complex<long double> .
+
+
+\subsection matrixbase_log MatrixBase::log()
+
+Compute the matrix logarithm.
+
+\code
+const MatrixLogarithmReturnValue<Derived> MatrixBase<Derived>::log() const
+\endcode
+
+\param[in]  M  invertible matrix whose logarithm is to be computed.
+\returns    expression representing the matrix logarithm root of \p M.
+
+The matrix logarithm of \f$ M \f$ is a matrix \f$ X \f$ such that 
+\f$ \exp(X) = M \f$ where exp denotes the matrix exponential. As for
+the scalar logarithm, the equation \f$ \exp(X) = M \f$ may have
+multiple solutions; this function returns a matrix whose eigenvalues
+have imaginary part in the interval \f$ (-\pi,\pi] \f$.
+
+In the real case, the matrix \f$ M \f$ should be invertible and
+it should have no eigenvalues which are real and negative (pairs of
+complex conjugate eigenvalues are allowed). In the complex case, it
+only needs to be invertible.
+
+This function computes the matrix logarithm using the Schur-Parlett
+algorithm as implemented by MatrixBase::matrixFunction(). The
+logarithm of an atomic block is computed by MatrixLogarithmAtomic,
+which uses direct computation for 1-by-1 and 2-by-2 blocks and an
+inverse scaling-and-squaring algorithm for bigger blocks, with the
+square roots computed by MatrixBase::sqrt().
+
+Details of the algorithm can be found in Section 11.6.2 of:
+Nicholas J. Higham,
+<em>Functions of Matrices: Theory and Computation</em>,
+SIAM 2008. ISBN 978-0-898716-46-7.
+
+Example: The following program checks that
+\f[ \log \left[ \begin{array}{ccc} 
+      \frac12\sqrt2 & -\frac12\sqrt2 & 0 \\
+      \frac12\sqrt2 & \frac12\sqrt2 & 0 \\
+      0 & 0 & 1
+    \end{array} \right] = \left[ \begin{array}{ccc}
+      0 & \frac14\pi & 0 \\ 
+      -\frac14\pi & 0 & 0 \\
+      0 & 0 & 0 
+    \end{array} \right]. \f]
+This corresponds to a rotation of \f$ \frac14\pi \f$ radians around
+the z-axis. This is the inverse of the example used in the
+documentation of \ref matrixbase_exp "exp()".
+
+\include MatrixLogarithm.cpp
+Output: \verbinclude MatrixLogarithm.out
+
+\note \p M has to be a matrix of \c float, \c double, <tt>long
+double</tt>, \c complex<float>, \c complex<double>, or \c complex<long
+double> .
+
+\sa MatrixBase::exp(), MatrixBase::matrixFunction(), 
+    class MatrixLogarithmAtomic, MatrixBase::sqrt().
+
+
+\subsection matrixbase_pow MatrixBase::pow()
+
+Compute the matrix raised to arbitrary real power.
+
+\code
+const MatrixPowerReturnValue<Derived> MatrixBase<Derived>::pow(RealScalar p) const
+\endcode
+
+\param[in]  M  base of the matrix power, should be a square matrix.
+\param[in]  p  exponent of the matrix power, should be real.
+
+The matrix power \f$ M^p \f$ is defined as \f$ \exp(p \log(M)) \f$,
+where exp denotes the matrix exponential, and log denotes the matrix
+logarithm.
+
+The matrix \f$ M \f$ should meet the conditions to be an argument of
+matrix logarithm. If \p p is not of the real scalar type of \p M, it
+is casted into the real scalar type of \p M.
+
+This function computes the matrix power using the Schur-Padé
+algorithm as implemented by class MatrixPower. The exponent is split
+into integral part and fractional part, where the fractional part is
+in the interval \f$ (-1, 1) \f$. The main diagonal and the first
+super-diagonal is directly computed.
+
+Details of the algorithm can be found in: Nicholas J. Higham and
+Lijing Lin, "A Schur-Padé algorithm for fractional powers of a
+matrix," <em>SIAM J. %Matrix Anal. Applic.</em>,
+<b>32(3)</b>:1056–1078, 2011.
+
+Example: The following program checks that
+\f[ \left[ \begin{array}{ccc}
+      \cos1 & -\sin1 & 0 \\
+      \sin1 & \cos1 & 0 \\
+      0 & 0 & 1
+    \end{array} \right]^{\frac14\pi} = \left[ \begin{array}{ccc}
+      \frac12\sqrt2 & -\frac12\sqrt2 & 0 \\
+      \frac12\sqrt2 & \frac12\sqrt2 & 0 \\
+      0 & 0 & 1
+    \end{array} \right]. \f]
+This corresponds to \f$ \frac14\pi \f$ rotations of 1 radian around
+the z-axis.
+
+\include MatrixPower.cpp
+Output: \verbinclude MatrixPower.out
+
+MatrixBase::pow() is user-friendly. However, there are some
+circumstances under which you should use class MatrixPower directly.
+MatrixPower can save the result of Schur decomposition, so it's
+better for computing various powers for the same matrix.
+
+Example:
+\include MatrixPower_optimal.cpp
+Output: \verbinclude MatrixPower_optimal.out
+
+\note \p M has to be a matrix of \c float, \c double, <tt>long
+double</tt>, \c complex<float>, \c complex<double>, or \c complex<long
+double> .
+
+\sa MatrixBase::exp(), MatrixBase::log(), class MatrixPower.
+
+
+\subsection matrixbase_matrixfunction MatrixBase::matrixFunction()
+
+Compute a matrix function.
+
+\code
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::matrixFunction(typename internal::stem_function<typename internal::traits<Derived>::Scalar>::type f) const
+\endcode
+
+\param[in]  M  argument of matrix function, should be a square matrix.
+\param[in]  f  an entire function; \c f(x,n) should compute the n-th
+derivative of f at x.
+\returns  expression representing \p f applied to \p M.
+
+Suppose that \p M is a matrix whose entries have type \c Scalar. 
+Then, the second argument, \p f, should be a function with prototype
+\code 
+ComplexScalar f(ComplexScalar, int) 
+\endcode
+where \c ComplexScalar = \c std::complex<Scalar> if \c Scalar is
+real (e.g., \c float or \c double) and \c ComplexScalar =
+\c Scalar if \c Scalar is complex. The return value of \c f(x,n)
+should be \f$ f^{(n)}(x) \f$, the n-th derivative of f at x.
+
+This routine uses the algorithm described in:
+Philip Davies and Nicholas J. Higham, 
+"A Schur-Parlett algorithm for computing matrix functions", 
+<em>SIAM J. %Matrix Anal. Applic.</em>, <b>25</b>:464–485, 2003.
+
+The actual work is done by the MatrixFunction class.
+
+Example: The following program checks that
+\f[ \exp \left[ \begin{array}{ccc} 
+      0 & \frac14\pi & 0 \\ 
+      -\frac14\pi & 0 & 0 \\
+      0 & 0 & 0 
+    \end{array} \right] = \left[ \begin{array}{ccc}
+      \frac12\sqrt2 & -\frac12\sqrt2 & 0 \\
+      \frac12\sqrt2 & \frac12\sqrt2 & 0 \\
+      0 & 0 & 1
+    \end{array} \right]. \f]
+This corresponds to a rotation of \f$ \frac14\pi \f$ radians around
+the z-axis. This is the same example as used in the documentation
+of \ref matrixbase_exp "exp()".
+
+\include MatrixFunction.cpp
+Output: \verbinclude MatrixFunction.out
+
+Note that the function \c expfn is defined for complex numbers 
+\c x, even though the matrix \c A is over the reals. Instead of
+\c expfn, we could also have used StdStemFunctions::exp:
+\code
+A.matrixFunction(StdStemFunctions<std::complex<double> >::exp, &B);
+\endcode
+
+
+
+\subsection matrixbase_sin MatrixBase::sin()
+
+Compute the matrix sine.
+
+\code
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sin() const
+\endcode
+
+\param[in]  M  a square matrix.
+\returns  expression representing \f$ \sin(M) \f$.
+
+This function calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::sin().
+
+Example: \include MatrixSine.cpp
+Output: \verbinclude MatrixSine.out
+
+
+
+\subsection matrixbase_sinh MatrixBase::sinh()
+
+Compute the matrix hyperbolic sine.
+
+\code
+MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sinh() const
+\endcode
+
+\param[in]  M  a square matrix.
+\returns  expression representing \f$ \sinh(M) \f$
+
+This function calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::sinh().
+
+Example: \include MatrixSinh.cpp
+Output: \verbinclude MatrixSinh.out
+
+
+\subsection matrixbase_sqrt MatrixBase::sqrt()
+
+Compute the matrix square root.
+
+\code
+const MatrixSquareRootReturnValue<Derived> MatrixBase<Derived>::sqrt() const
+\endcode
+
+\param[in]  M  invertible matrix whose square root is to be computed.
+\returns    expression representing the matrix square root of \p M.
+
+The matrix square root of \f$ M \f$ is the matrix \f$ M^{1/2} \f$
+whose square is the original matrix; so if \f$ S = M^{1/2} \f$ then
+\f$ S^2 = M \f$. 
+
+In the <b>real case</b>, the matrix \f$ M \f$ should be invertible and
+it should have no eigenvalues which are real and negative (pairs of
+complex conjugate eigenvalues are allowed). In that case, the matrix
+has a square root which is also real, and this is the square root
+computed by this function. 
+
+The matrix square root is computed by first reducing the matrix to
+quasi-triangular form with the real Schur decomposition. The square
+root of the quasi-triangular matrix can then be computed directly. The
+cost is approximately \f$ 25 n^3 \f$ real flops for the real Schur
+decomposition and \f$ 3\frac13 n^3 \f$ real flops for the remainder
+(though the computation time in practice is likely more than this
+indicates).
+
+Details of the algorithm can be found in: Nicholas J. Highan,
+"Computing real square roots of a real matrix", <em>Linear Algebra
+Appl.</em>, 88/89:405–430, 1987.
+
+If the matrix is <b>positive-definite symmetric</b>, then the square
+root is also positive-definite symmetric. In this case, it is best to
+use SelfAdjointEigenSolver::operatorSqrt() to compute it.
+
+In the <b>complex case</b>, the matrix \f$ M \f$ should be invertible;
+this is a restriction of the algorithm. The square root computed by
+this algorithm is the one whose eigenvalues have an argument in the
+interval \f$ (-\frac12\pi, \frac12\pi] \f$. This is the usual branch
+cut.
+
+The computation is the same as in the real case, except that the
+complex Schur decomposition is used to reduce the matrix to a
+triangular matrix. The theoretical cost is the same. Details are in:
+Åke Björck and Sven Hammarling, "A Schur method for the
+square root of a matrix", <em>Linear Algebra Appl.</em>,
+52/53:127–140, 1983.
+
+Example: The following program checks that the square root of
+\f[ \left[ \begin{array}{cc} 
+              \cos(\frac13\pi) & -\sin(\frac13\pi) \\
+              \sin(\frac13\pi) & \cos(\frac13\pi)
+    \end{array} \right], \f]
+corresponding to a rotation over 60 degrees, is a rotation over 30 degrees:
+\f[ \left[ \begin{array}{cc} 
+              \cos(\frac16\pi) & -\sin(\frac16\pi) \\
+              \sin(\frac16\pi) & \cos(\frac16\pi)
+    \end{array} \right]. \f]
+
+\include MatrixSquareRoot.cpp
+Output: \verbinclude MatrixSquareRoot.out
+
+\sa class RealSchur, class ComplexSchur, class MatrixSquareRoot,
+    SelfAdjointEigenSolver::operatorSqrt().
+
+*/
+
+#endif // EIGEN_MATRIX_FUNCTIONS
+
diff --git a/include/eigen3/unsupported/Eigen/MoreVectorization b/include/eigen3/unsupported/Eigen/MoreVectorization
new file mode 100644
index 0000000..470e724
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/MoreVectorization
@@ -0,0 +1,24 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MOREVECTORIZATION_MODULE_H
+#define EIGEN_MOREVECTORIZATION_MODULE_H
+
+#include <Eigen/Core>
+
+namespace Eigen {
+
+/**
+  * \defgroup MoreVectorization More vectorization module
+  */
+
+}
+
+#include "src/MoreVectorization/MathFunctions.h"
+
+#endif // EIGEN_MOREVECTORIZATION_MODULE_H
diff --git a/include/eigen3/unsupported/Eigen/NonLinearOptimization b/include/eigen3/unsupported/Eigen/NonLinearOptimization
new file mode 100644
index 0000000..600ab4c
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/NonLinearOptimization
@@ -0,0 +1,134 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel at freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NONLINEAROPTIMIZATION_MODULE
+#define EIGEN_NONLINEAROPTIMIZATION_MODULE
+
+#include <vector>
+
+#include <Eigen/Core>
+#include <Eigen/Jacobi>
+#include <Eigen/QR>
+#include <unsupported/Eigen/NumericalDiff>
+
+/**
+  * \defgroup NonLinearOptimization_Module Non linear optimization module
+  *
+  * \code
+  * #include <unsupported/Eigen/NonLinearOptimization>
+  * \endcode
+  *
+  * This module provides implementation of two important algorithms in non linear
+  * optimization. In both cases, we consider a system of non linear functions. Of
+  * course, this should work, and even work very well if those functions are
+  * actually linear. But if this is so, you should probably better use other
+  * methods more fitted to this special case.
+  *
+  * One algorithm allows to find an extremum of such a system (Levenberg
+  * Marquardt algorithm) and the second one is used to find 
+  * a zero for the system (Powell hybrid "dogleg" method).
+  *
+  * This code is a port of minpack (http://en.wikipedia.org/wiki/MINPACK).
+  * Minpack is a very famous, old, robust and well-reknown package, written in 
+  * fortran. Those implementations have been carefully tuned, tested, and used
+  * for several decades.
+  *
+  * The original fortran code was automatically translated using f2c (http://en.wikipedia.org/wiki/F2c) in C,
+  * then c++, and then cleaned by several different authors.
+  * The last one of those cleanings being our starting point : 
+  * http://devernay.free.fr/hacks/cminpack.html
+  * 
+  * Finally, we ported this code to Eigen, creating classes and API
+  * coherent with Eigen. When possible, we switched to Eigen
+  * implementation, such as most linear algebra (vectors, matrices, stable norms).
+  *
+  * Doing so, we were very careful to check the tests we setup at the very
+  * beginning, which ensure that the same results are found.
+  *
+  * \section Tests Tests
+  * 
+  * The tests are placed in the file unsupported/test/NonLinear.cpp.
+  * 
+  * There are two kinds of tests : those that come from examples bundled with cminpack.
+  * They guaranty we get the same results as the original algorithms (value for 'x',
+  * for the number of evaluations of the function, and for the number of evaluations
+  * of the jacobian if ever).
+  * 
+  * Other tests were added by myself at the very beginning of the 
+  * process and check the results for levenberg-marquardt using the reference data 
+  * on http://www.itl.nist.gov/div898/strd/nls/nls_main.shtml. Since then i've 
+  * carefully checked that the same results were obtained when modifiying the 
+  * code. Please note that we do not always get the exact same decimals as they do,
+  * but this is ok : they use 128bits float, and we do the tests using the C type 'double',
+  * which is 64 bits on most platforms (x86 and amd64, at least).
+  * I've performed those tests on several other implementations of levenberg-marquardt, and
+  * (c)minpack performs VERY well compared to those, both in accuracy and speed.
+  * 
+  * The documentation for running the tests is on the wiki
+  * http://eigen.tuxfamily.org/index.php?title=Tests
+  * 
+  * \section API API : overview of methods
+  * 
+  * Both algorithms can use either the jacobian (provided by the user) or compute 
+  * an approximation by themselves (actually using Eigen \ref NumericalDiff_Module).
+  * The part of API referring to the latter use 'NumericalDiff' in the method names
+  * (exemple: LevenbergMarquardt.minimizeNumericalDiff() ) 
+  * 
+  * The methods LevenbergMarquardt.lmder1()/lmdif1()/lmstr1() and 
+  * HybridNonLinearSolver.hybrj1()/hybrd1() are specific methods from the original 
+  * minpack package that you probably should NOT use until you are porting a code that
+  *  was previously using minpack. They just define a 'simple' API with default values 
+  * for some parameters.
+  * 
+  * All algorithms are provided using Two APIs :
+  *     - one where the user inits the algorithm, and uses '*OneStep()' as much as he wants : 
+  * this way the caller have control over the steps
+  *     - one where the user just calls a method (optimize() or solve()) which will 
+  * handle the loop: init + loop until a stop condition is met. Those are provided for
+  *  convenience.
+  * 
+  * As an example, the method LevenbergMarquardt::minimize() is 
+  * implemented as follow : 
+  * \code
+  * Status LevenbergMarquardt<FunctorType,Scalar>::minimize(FVectorType  &x, const int mode)
+  * {
+  *     Status status = minimizeInit(x, mode);
+  *     do {
+  *         status = minimizeOneStep(x, mode);
+  *     } while (status==Running);
+  *     return status;
+  * }
+  * \endcode
+  * 
+  * \section examples Examples
+  * 
+  * The easiest way to understand how to use this module is by looking at the many examples in the file
+  * unsupported/test/NonLinearOptimization.cpp.
+  */
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+#include "src/NonLinearOptimization/qrsolv.h"
+#include "src/NonLinearOptimization/r1updt.h"
+#include "src/NonLinearOptimization/r1mpyq.h"
+#include "src/NonLinearOptimization/rwupdt.h"
+#include "src/NonLinearOptimization/fdjac1.h"
+#include "src/NonLinearOptimization/lmpar.h"
+#include "src/NonLinearOptimization/dogleg.h"
+#include "src/NonLinearOptimization/covar.h"
+
+#include "src/NonLinearOptimization/chkder.h"
+
+#endif
+
+#include "src/NonLinearOptimization/HybridNonLinearSolver.h"
+#include "src/NonLinearOptimization/LevenbergMarquardt.h"
+
+
+#endif // EIGEN_NONLINEAROPTIMIZATION_MODULE
diff --git a/include/eigen3/unsupported/Eigen/NumericalDiff b/include/eigen3/unsupported/Eigen/NumericalDiff
new file mode 100644
index 0000000..433334c
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/NumericalDiff
@@ -0,0 +1,56 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel at freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NUMERICALDIFF_MODULE
+#define EIGEN_NUMERICALDIFF_MODULE
+
+#include <Eigen/Core>
+
+namespace Eigen {
+
+/**
+  * \defgroup NumericalDiff_Module Numerical differentiation module
+  *
+  * \code
+  * #include <unsupported/Eigen/NumericalDiff>
+  * \endcode
+  *
+  * See http://en.wikipedia.org/wiki/Numerical_differentiation
+  *
+  * Warning : this should NOT be confused with automatic differentiation, which
+  * is a different method and has its own module in Eigen : \ref
+  * AutoDiff_Module.
+  *
+  * Currently only "Forward" and "Central" schemes are implemented. Those
+  * are basic methods, and there exist some more elaborated way of
+  * computing such approximates. They are implemented using both
+  * proprietary and free software, and usually requires linking to an
+  * external library. It is very easy for you to write a functor
+  * using such software, and the purpose is quite orthogonal to what we
+  * want to achieve with Eigen.
+  *
+  * This is why we will not provide wrappers for every great numerical
+  * differentiation software that exist, but should rather stick with those
+  * basic ones, that still are useful for testing.
+  *
+  * Also, the \ref NonLinearOptimization_Module needs this in order to
+  * provide full features compatibility with the original (c)minpack
+  * package.
+  *
+  */
+}
+
+//@{
+
+#include "src/NumericalDiff/NumericalDiff.h"
+
+//@}
+
+
+#endif // EIGEN_NUMERICALDIFF_MODULE
diff --git a/include/eigen3/unsupported/Eigen/OpenGLSupport b/include/eigen3/unsupported/Eigen/OpenGLSupport
new file mode 100644
index 0000000..e276944
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/OpenGLSupport
@@ -0,0 +1,322 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_OPENGL_MODULE
+#define EIGEN_OPENGL_MODULE
+
+#include <Eigen/Geometry>
+
+#if defined(__APPLE_CC__)
+  #include <OpenGL/gl.h>
+#else
+  #include <GL/gl.h>
+#endif
+
+namespace Eigen {
+
+/**
+  * \defgroup OpenGLSUpport_Module OpenGL Support module
+  *
+  * This module provides wrapper functions for a couple of OpenGL functions
+  * which simplify the way to pass Eigen's object to openGL.
+  * Here is an exmaple:
+  * 
+  * \code
+  * // You need to add path_to_eigen/unsupported to your include path.
+  * #include <Eigen/OpenGLSupport>
+  * // ...
+  * Vector3f x, y;
+  * Matrix3f rot;
+  * 
+  * glVertex(y + x * rot);
+  * 
+  * Quaternion q;
+  * glRotate(q);
+  * 
+  * // ...
+  * \endcode
+  *
+  */
+//@{
+
+#define EIGEN_GL_FUNC_DECLARATION(FUNC)                                                                             \
+namespace internal {                                                                                                \
+  template< typename XprType,                                                                                       \
+            typename Scalar = typename XprType::Scalar,                                                             \
+            int Rows = XprType::RowsAtCompileTime,                                                                  \
+            int Cols = XprType::ColsAtCompileTime,                                                                  \
+            bool IsGLCompatible = bool(XprType::Flags&LinearAccessBit)                                              \
+                              && bool(XprType::Flags&DirectAccessBit)                                               \
+                              && (XprType::IsVectorAtCompileTime || (XprType::Flags&RowMajorBit)==0)>               \
+  struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl);                                                                      \
+                                                                                                                    \
+  template<typename XprType, typename Scalar, int Rows, int Cols>                                                   \
+  struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType,Scalar,Rows,Cols,false> {                                     \
+    inline static void run(const XprType& p) {                                                                      \
+      EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<typename plain_matrix_type_column_major<XprType>::type>::run(p); }       \
+  };                                                                                                                \
+}                                                                                                                   \
+                                                                                                                    \
+template<typename Derived> inline void FUNC(const Eigen::DenseBase<Derived>& p) {                                   \
+  EIGEN_CAT(EIGEN_CAT(internal::gl_,FUNC),_impl)<Derived>::run(p.derived());                                        \
+}
+
+
+#define EIGEN_GL_FUNC_SPECIALIZATION_MAT(FUNC,SCALAR,ROWS,COLS,SUFFIX)                                              \
+namespace internal {                                                                                                \
+  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, ROWS, COLS, true> {      \
+    inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); }                                            \
+  };                                                                                                                \
+}
+
+  
+#define EIGEN_GL_FUNC_SPECIALIZATION_VEC(FUNC,SCALAR,SIZE,SUFFIX)                                                   \
+namespace internal {                                                                                                \
+  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, SIZE, 1, true> {         \
+    inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); }                                            \
+  };                                                                                                                \
+  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, 1, SIZE, true> {         \
+    inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); }                                            \
+  };                                                                                                                \
+}
+
+  
+EIGEN_GL_FUNC_DECLARATION       (glVertex)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int,    2,2iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short,  2,2sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float,  2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int,    3,3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short,  3,3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float,  3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 3,3dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int,    4,4iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short,  4,4sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float,  4,4fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 4,4dv)
+
+EIGEN_GL_FUNC_DECLARATION       (glTexCoord)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int,    2,2iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short,  2,2sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float,  2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int,    3,3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short,  3,3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float,  3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 3,3dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int,    4,4iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short,  4,4sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float,  4,4fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 4,4dv)
+
+EIGEN_GL_FUNC_DECLARATION       (glColor)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int,    2,2iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short,  2,2sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float,  2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int,    3,3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short,  3,3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float,  3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 3,3dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int,    4,4iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short,  4,4sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float,  4,4fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 4,4dv)
+
+EIGEN_GL_FUNC_DECLARATION       (glNormal)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,int,    3,3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,short,  3,3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,float,  3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,double, 3,3dv)
+
+inline void glScale2fv(const float*  v) { glScalef(v[0], v[1], 1.f);  }
+inline void glScale2dv(const double* v) { glScaled(v[0], v[1], 1.0);  }
+inline void glScale3fv(const float*  v) { glScalef(v[0], v[1], v[2]); }
+inline void glScale3dv(const double* v) { glScaled(v[0], v[1], v[2]); }
+
+EIGEN_GL_FUNC_DECLARATION       (glScale)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,float,  2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,float,  3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,double, 3,3dv)
+
+template<typename Scalar> void glScale(const UniformScaling<Scalar>& s)  { glScale(Matrix<Scalar,3,1>::Constant(s.factor())); }
+
+inline void glTranslate2fv(const float*  v) { glTranslatef(v[0], v[1], 0.f);  }
+inline void glTranslate2dv(const double* v) { glTranslated(v[0], v[1], 0.0);  }
+inline void glTranslate3fv(const float*  v) { glTranslatef(v[0], v[1], v[2]); }
+inline void glTranslate3dv(const double* v) { glTranslated(v[0], v[1], v[2]); }
+
+EIGEN_GL_FUNC_DECLARATION       (glTranslate)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,float,  2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,float,  3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,double, 3,3dv)
+
+template<typename Scalar> void glTranslate(const Translation<Scalar,2>& t)  { glTranslate(t.vector()); }
+template<typename Scalar> void glTranslate(const Translation<Scalar,3>& t)  { glTranslate(t.vector()); }
+
+EIGEN_GL_FUNC_DECLARATION       (glMultMatrix)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glMultMatrix,float,  4,4,f)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glMultMatrix,double, 4,4,d)
+
+template<typename Scalar> void glMultMatrix(const Transform<Scalar,3,Affine>& t)        { glMultMatrix(t.matrix()); }
+template<typename Scalar> void glMultMatrix(const Transform<Scalar,3,Projective>& t)    { glMultMatrix(t.matrix()); }
+template<typename Scalar> void glMultMatrix(const Transform<Scalar,3,AffineCompact>& t) { glMultMatrix(Transform<Scalar,3,Affine>(t).matrix()); }
+
+EIGEN_GL_FUNC_DECLARATION       (glLoadMatrix)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glLoadMatrix,float,  4,4,f)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glLoadMatrix,double, 4,4,d)
+
+template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,Affine>& t)        { glLoadMatrix(t.matrix()); }
+template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,Projective>& t)    { glLoadMatrix(t.matrix()); }
+template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,AffineCompact>& t) { glLoadMatrix(Transform<Scalar,3,Affine>(t).matrix()); }
+
+inline void glRotate(const Rotation2D<float>& rot)
+{
+  glRotatef(rot.angle()*180.f/float(M_PI), 0.f, 0.f, 1.f);
+}
+inline void glRotate(const Rotation2D<double>& rot)
+{
+  glRotated(rot.angle()*180.0/M_PI, 0.0, 0.0, 1.0);
+}
+
+template<typename Derived> void glRotate(const RotationBase<Derived,3>& rot)
+{  
+  Transform<typename Derived::Scalar,3,Projective> tr(rot);
+  glMultMatrix(tr.matrix());
+}
+
+#define EIGEN_GL_MAKE_CONST_const const
+#define EIGEN_GL_MAKE_CONST__ 
+#define EIGEN_GL_EVAL(X) X
+
+#define EIGEN_GL_FUNC1_DECLARATION(FUNC,ARG1,CONST)                                                                             \
+namespace internal {                                                                                                            \
+  template< typename XprType,                                                                                                   \
+            typename Scalar = typename XprType::Scalar,                                                                         \
+            int Rows = XprType::RowsAtCompileTime,                                                                              \
+            int Cols = XprType::ColsAtCompileTime,                                                                              \
+            bool IsGLCompatible = bool(XprType::Flags&LinearAccessBit)                                                          \
+                              && bool(XprType::Flags&DirectAccessBit)                                                           \
+                              && (XprType::IsVectorAtCompileTime || (XprType::Flags&RowMajorBit)==0)>                           \
+  struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl);                                                                                  \
+                                                                                                                                \
+  template<typename XprType, typename Scalar, int Rows, int Cols>                                                               \
+  struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType,Scalar,Rows,Cols,false> {                                                 \
+    inline static void run(ARG1 a,EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) {                                      \
+      EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<typename plain_matrix_type_column_major<XprType>::type>::run(a,p); }                 \
+  };                                                                                                                            \
+}                                                                                                                               \
+                                                                                                                                \
+template<typename Derived> inline void FUNC(ARG1 a,EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) Eigen::DenseBase<Derived>& p) {   \
+  EIGEN_CAT(EIGEN_CAT(internal::gl_,FUNC),_impl)<Derived>::run(a,p.derived());                                                  \
+}
+
+
+#define EIGEN_GL_FUNC1_SPECIALIZATION_MAT(FUNC,ARG1,CONST,SCALAR,ROWS,COLS,SUFFIX)                                              \
+namespace internal {                                                                                                            \
+  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, ROWS, COLS, true> {                  \
+    inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); }         \
+  }; \
+}
+
+  
+#define EIGEN_GL_FUNC1_SPECIALIZATION_VEC(FUNC,ARG1,CONST,SCALAR,SIZE,SUFFIX)                                                   \
+namespace internal {                                                                                                            \
+  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, SIZE, 1, true> {                     \
+    inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); }         \
+  };                                                                                                                            \
+  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, 1, SIZE, true> {                     \
+    inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); }         \
+  };                                                                                                                            \
+}
+
+EIGEN_GL_FUNC1_DECLARATION       (glGet,GLenum,_)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet,GLenum,_,float,  4,4,Floatv)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet,GLenum,_,double, 4,4,Doublev)
+
+// glUniform API
+
+#ifdef GL_VERSION_2_0
+
+inline void glUniform2fv_ei  (GLint loc, const float* v)         { glUniform2fv(loc,1,v); }
+inline void glUniform2iv_ei  (GLint loc, const int* v)           { glUniform2iv(loc,1,v); }
+
+inline void glUniform3fv_ei  (GLint loc, const float* v)         { glUniform3fv(loc,1,v); }
+inline void glUniform3iv_ei  (GLint loc, const int* v)           { glUniform3iv(loc,1,v); }
+
+inline void glUniform4fv_ei  (GLint loc, const float* v)         { glUniform4fv(loc,1,v); }
+inline void glUniform4iv_ei  (GLint loc, const int* v)           { glUniform4iv(loc,1,v); }
+
+inline void glUniformMatrix2fv_ei  (GLint loc, const float* v)         { glUniformMatrix2fv(loc,1,false,v); }
+inline void glUniformMatrix3fv_ei  (GLint loc, const float* v)         { glUniformMatrix3fv(loc,1,false,v); }
+inline void glUniformMatrix4fv_ei  (GLint loc, const float* v)         { glUniformMatrix4fv(loc,1,false,v); }
+
+
+EIGEN_GL_FUNC1_DECLARATION       (glUniform,GLint,const)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float,        2,2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int,          2,2iv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float,        3,3fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int,          3,3iv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float,        4,4fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int,          4,4iv_ei)
+
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        2,2,Matrix2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        3,3,Matrix3fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        4,4,Matrix4fv_ei)
+
+#endif
+
+#ifdef GL_VERSION_2_1
+
+static void glUniformMatrix2x3fv_ei(GLint loc, const float* v)         { glUniformMatrix2x3fv(loc,1,false,v); }
+static void glUniformMatrix3x2fv_ei(GLint loc, const float* v)         { glUniformMatrix3x2fv(loc,1,false,v); }
+static void glUniformMatrix2x4fv_ei(GLint loc, const float* v)         { glUniformMatrix2x4fv(loc,1,false,v); }
+static void glUniformMatrix4x2fv_ei(GLint loc, const float* v)         { glUniformMatrix4x2fv(loc,1,false,v); }
+static void glUniformMatrix3x4fv_ei(GLint loc, const float* v)         { glUniformMatrix3x4fv(loc,1,false,v); }
+static void glUniformMatrix4x3fv_ei(GLint loc, const float* v)         { glUniformMatrix4x3fv(loc,1,false,v); }
+
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        2,3,Matrix2x3fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        3,2,Matrix3x2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        2,4,Matrix2x4fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        4,2,Matrix4x2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        3,4,Matrix3x4fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        4,3,Matrix4x3fv_ei)
+
+#endif
+
+#ifdef GL_VERSION_3_0
+
+inline void glUniform2uiv_ei (GLint loc, const unsigned int* v)  { glUniform2uiv(loc,1,v); }
+inline void glUniform3uiv_ei (GLint loc, const unsigned int* v)  { glUniform3uiv(loc,1,v); }
+inline void glUniform4uiv_ei (GLint loc, const unsigned int* v)  { glUniform4uiv(loc,1,v); }
+
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 2,2uiv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 3,3uiv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 4,4uiv_ei)
+
+#endif
+
+#ifdef GL_ARB_gpu_shader_fp64
+inline void glUniform2dv_ei  (GLint loc, const double* v)        { glUniform2dv(loc,1,v); }
+inline void glUniform3dv_ei  (GLint loc, const double* v)        { glUniform3dv(loc,1,v); }
+inline void glUniform4dv_ei  (GLint loc, const double* v)        { glUniform4dv(loc,1,v); }
+
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double,       2,2dv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double,       3,3dv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double,       4,4dv_ei)
+#endif
+
+
+//@}
+
+}
+
+#endif // EIGEN_OPENGL_MODULE
diff --git a/include/eigen3/unsupported/Eigen/Polynomials b/include/eigen3/unsupported/Eigen/Polynomials
new file mode 100644
index 0000000..cece563
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/Polynomials
@@ -0,0 +1,138 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_POLYNOMIALS_MODULE_H
+#define EIGEN_POLYNOMIALS_MODULE_H
+
+#include <Eigen/Core>
+
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+#include <Eigen/Eigenvalues>
+
+// Note that EIGEN_HIDE_HEAVY_CODE has to be defined per module
+#if (defined EIGEN_EXTERN_INSTANTIATIONS) && (EIGEN_EXTERN_INSTANTIATIONS>=2)
+  #ifndef EIGEN_HIDE_HEAVY_CODE
+  #define EIGEN_HIDE_HEAVY_CODE
+  #endif
+#elif defined EIGEN_HIDE_HEAVY_CODE
+  #undef EIGEN_HIDE_HEAVY_CODE
+#endif
+
+/**
+  * \defgroup Polynomials_Module Polynomials module
+  * \brief This module provides a QR based polynomial solver.
+	*
+  * To use this module, add
+  * \code
+  * #include <unsupported/Eigen/Polynomials>
+  * \endcode
+	* at the start of your source file.
+  */
+
+#include "src/Polynomials/PolynomialUtils.h"
+#include "src/Polynomials/Companion.h"
+#include "src/Polynomials/PolynomialSolver.h"
+
+/**
+	\page polynomials Polynomials defines functions for dealing with polynomials
+	and a QR based polynomial solver.
+	\ingroup Polynomials_Module
+
+	The remainder of the page documents first the functions for evaluating, computing
+	polynomials, computing estimates about polynomials and next the QR based polynomial
+	solver.
+
+	\section polynomialUtils convenient functions to deal with polynomials
+	\subsection roots_to_monicPolynomial
+	The function
+	\code
+	void roots_to_monicPolynomial( const RootVector& rv, Polynomial& poly )
+	\endcode
+	computes the coefficients \f$ a_i \f$ of
+
+	\f$ p(x) = a_0 + a_{1}x + ... + a_{n-1}x^{n-1} + x^n \f$
+
+	where \f$ p \f$ is known through its roots i.e. \f$ p(x) = (x-r_1)(x-r_2)...(x-r_n) \f$.
+
+	\subsection poly_eval
+	The function
+	\code
+	T poly_eval( const Polynomials& poly, const T& x )
+	\endcode
+	evaluates a polynomial at a given point using stabilized Hörner method.
+
+	The following code: first computes the coefficients in the monomial basis of the monic polynomial that has the provided roots;
+	then, it evaluates the computed polynomial, using a stabilized Hörner method.
+
+	\include PolynomialUtils1.cpp
+  Output: \verbinclude PolynomialUtils1.out
+
+	\subsection Cauchy bounds
+	The function
+	\code
+	Real cauchy_max_bound( const Polynomial& poly )
+	\endcode
+	provides a maximum bound (the Cauchy one: \f$C(p)\f$) for the absolute value of a root of the given polynomial i.e.
+	\f$ \forall r_i \f$ root of \f$ p(x) = \sum_{k=0}^d a_k x^k \f$,
+	\f$ |r_i| \le C(p) = \sum_{k=0}^{d} \left | \frac{a_k}{a_d} \right | \f$
+	The leading coefficient \f$ p \f$: should be non zero \f$a_d \neq 0\f$.
+
+
+	The function
+	\code
+	Real cauchy_min_bound( const Polynomial& poly )
+	\endcode
+	provides a minimum bound (the Cauchy one: \f$c(p)\f$) for the absolute value of a non zero root of the given polynomial i.e.
+	\f$ \forall r_i \neq 0 \f$ root of \f$ p(x) = \sum_{k=0}^d a_k x^k \f$,
+	\f$ |r_i| \ge c(p) = \left( \sum_{k=0}^{d} \left | \frac{a_k}{a_0} \right | \right)^{-1} \f$
+
+
+
+
+	\section QR polynomial solver class
+	Computes the complex roots of a polynomial by computing the eigenvalues of the associated companion matrix with the QR algorithm.
+	
+	The roots of \f$ p(x) = a_0 + a_1 x + a_2 x^2 + a_{3} x^3 + x^4 \f$ are the eigenvalues of
+	\f$
+	\left [
+	\begin{array}{cccc}
+	0 & 0 &  0 & a_0 \\
+	1 & 0 &  0 & a_1 \\
+	0 & 1 &  0 & a_2 \\
+	0 & 0 &  1 & a_3
+	\end{array} \right ]
+	\f$
+
+	However, the QR algorithm is not guaranteed to converge when there are several eigenvalues with same modulus.
+
+	Therefore the current polynomial solver is guaranteed to provide a correct result only when the complex roots \f$r_1,r_2,...,r_d\f$ have distinct moduli i.e.
+	
+	\f$ \forall i,j \in [1;d],~ \| r_i \| \neq \| r_j \| \f$.
+
+	With 32bit (float) floating types this problem shows up frequently.
+  However, almost always, correct accuracy is reached even in these cases for 64bit
+  (double) floating types and small polynomial degree (<20).
+
+	\include PolynomialSolver1.cpp
+	
+	In the above example:
+	
+	-# a simple use of the polynomial solver is shown;
+	-# the accuracy problem with the QR algorithm is presented: a polynomial with almost conjugate roots is provided to the solver.
+	Those roots have almost same module therefore the QR algorithm failed to converge: the accuracy
+	of the last root is bad;
+	-# a simple way to circumvent the problem is shown: use doubles instead of floats.
+
+  Output: \verbinclude PolynomialSolver1.out
+*/
+
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+
+#endif // EIGEN_POLYNOMIALS_MODULE_H
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/include/eigen3/unsupported/Eigen/Skyline b/include/eigen3/unsupported/Eigen/Skyline
new file mode 100644
index 0000000..71a68cb
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/Skyline
@@ -0,0 +1,39 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINE_MODULE_H
+#define EIGEN_SKYLINE_MODULE_H
+
+
+#include "Eigen/Core"
+
+#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include <map>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+
+/**
+ *  \defgroup Skyline_Module Skyline module
+ *
+ *
+ *
+ *
+ */
+
+#include "src/Skyline/SkylineUtil.h"
+#include "src/Skyline/SkylineMatrixBase.h"
+#include "src/Skyline/SkylineStorage.h"
+#include "src/Skyline/SkylineMatrix.h"
+#include "src/Skyline/SkylineInplaceLU.h"
+#include "src/Skyline/SkylineProduct.h"
+
+#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SKYLINE_MODULE_H
diff --git a/include/eigen3/unsupported/Eigen/SparseExtra b/include/eigen3/unsupported/Eigen/SparseExtra
new file mode 100644
index 0000000..b559790
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/SparseExtra
@@ -0,0 +1,56 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael at free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_EXTRA_MODULE_H
+#define EIGEN_SPARSE_EXTRA_MODULE_H
+
+#include "../../Eigen/Sparse"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include <vector>
+#include <map>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+#include <fstream>
+#include <sstream>
+
+#ifdef EIGEN_GOOGLEHASH_SUPPORT
+  #include <google/dense_hash_map>
+#endif
+
+/**
+  * \defgroup SparseExtra_Module SparseExtra module
+  *
+  * This module contains some experimental features extending the sparse module.
+  *
+  * \code
+  * #include <Eigen/SparseExtra>
+  * \endcode
+  */
+
+
+#include "../../Eigen/src/misc/Solve.h"
+#include "../../Eigen/src/misc/SparseSolve.h"
+
+#include "src/SparseExtra/DynamicSparseMatrix.h"
+#include "src/SparseExtra/BlockOfDynamicSparseMatrix.h"
+#include "src/SparseExtra/RandomSetter.h"
+
+#include "src/SparseExtra/MarketIO.h"
+
+#if !defined(_WIN32)
+#include <dirent.h>
+#include "src/SparseExtra/MatrixMarketIterator.h"
+#endif
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SPARSE_EXTRA_MODULE_H
diff --git a/include/eigen3/unsupported/Eigen/Splines b/include/eigen3/unsupported/Eigen/Splines
new file mode 100644
index 0000000..322e6b9
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/Splines
@@ -0,0 +1,31 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 20010-2011 Hauke Heibel <hauke.heibel at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPLINES_MODULE_H
+#define EIGEN_SPLINES_MODULE_H
+
+namespace Eigen 
+{
+/**
+  * \defgroup Splines_Module Spline and spline fitting module
+  *
+  * This module provides a simple multi-dimensional spline class while
+  * offering most basic functionality to fit a spline to point sets.
+  *
+  * \code
+  * #include <unsupported/Eigen/Splines>
+  * \endcode
+  */
+}
+
+#include "src/Splines/SplineFwd.h"
+#include "src/Splines/Spline.h"
+#include "src/Splines/SplineFitting.h"
+
+#endif // EIGEN_SPLINES_MODULE_H
diff --git a/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h b/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
new file mode 100644
index 0000000..1a61e33
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
@@ -0,0 +1,83 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_AUTODIFF_JACOBIAN_H
+#define EIGEN_AUTODIFF_JACOBIAN_H
+
+namespace Eigen
+{
+
+template<typename Functor> class AutoDiffJacobian : public Functor
+{
+public:
+  AutoDiffJacobian() : Functor() {}
+  AutoDiffJacobian(const Functor& f) : Functor(f) {}
+
+  // forward constructors
+  template<typename T0>
+  AutoDiffJacobian(const T0& a0) : Functor(a0) {}
+  template<typename T0, typename T1>
+  AutoDiffJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
+  template<typename T0, typename T1, typename T2>
+  AutoDiffJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {}
+
+  enum {
+    InputsAtCompileTime = Functor::InputsAtCompileTime,
+    ValuesAtCompileTime = Functor::ValuesAtCompileTime
+  };
+
+  typedef typename Functor::InputType InputType;
+  typedef typename Functor::ValueType ValueType;
+  typedef typename Functor::JacobianType JacobianType;
+  typedef typename JacobianType::Scalar Scalar;
+  typedef typename JacobianType::Index Index;
+
+  typedef Matrix<Scalar,InputsAtCompileTime,1> DerivativeType;
+  typedef AutoDiffScalar<DerivativeType> ActiveScalar;
+
+
+  typedef Matrix<ActiveScalar, InputsAtCompileTime, 1> ActiveInput;
+  typedef Matrix<ActiveScalar, ValuesAtCompileTime, 1> ActiveValue;
+
+  void operator() (const InputType& x, ValueType* v, JacobianType* _jac=0) const
+  {
+    eigen_assert(v!=0);
+    if (!_jac)
+    {
+      Functor::operator()(x, v);
+      return;
+    }
+
+    JacobianType& jac = *_jac;
+
+    ActiveInput ax = x.template cast<ActiveScalar>();
+    ActiveValue av(jac.rows());
+
+    if(InputsAtCompileTime==Dynamic)
+      for (Index j=0; j<jac.rows(); j++)
+        av[j].derivatives().resize(this->inputs());
+
+    for (Index i=0; i<jac.cols(); i++)
+      ax[i].derivatives() = DerivativeType::Unit(this->inputs(),i);
+
+    Functor::operator()(ax, &av);
+
+    for (Index i=0; i<jac.rows(); i++)
+    {
+      (*v)[i] = av[i].value();
+      jac.row(i) = av[i].derivatives();
+    }
+  }
+protected:
+
+};
+
+}
+
+#endif // EIGEN_AUTODIFF_JACOBIAN_H
diff --git a/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
new file mode 100644
index 0000000..8d42e69
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -0,0 +1,642 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_AUTODIFF_SCALAR_H
+#define EIGEN_AUTODIFF_SCALAR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename A, typename B>
+struct make_coherent_impl {
+  static void run(A&, B&) {}
+};
+
+// resize a to match b is a.size()==0, and conversely.
+template<typename A, typename B>
+void make_coherent(const A& a, const B&b)
+{
+  make_coherent_impl<A,B>::run(a.const_cast_derived(), b.const_cast_derived());
+}
+
+template<typename _DerType, bool Enable> struct auto_diff_special_op;
+
+} // end namespace internal
+
+/** \class AutoDiffScalar
+  * \brief A scalar type replacement with automatic differentation capability
+  *
+  * \param _DerType the vector type used to store/represent the derivatives. The base scalar type
+  *                 as well as the number of derivatives to compute are determined from this type.
+  *                 Typical choices include, e.g., \c Vector4f for 4 derivatives, or \c VectorXf
+  *                 if the number of derivatives is not known at compile time, and/or, the number
+  *                 of derivatives is large.
+  *                 Note that _DerType can also be a reference (e.g., \c VectorXf&) to wrap a
+  *                 existing vector into an AutoDiffScalar.
+  *                 Finally, _DerType can also be any Eigen compatible expression.
+  *
+  * This class represents a scalar value while tracking its respective derivatives using Eigen's expression
+  * template mechanism.
+  *
+  * It supports the following list of global math function:
+  *  - std::abs, std::sqrt, std::pow, std::exp, std::log, std::sin, std::cos,
+  *  - internal::abs, internal::sqrt, numext::pow, internal::exp, internal::log, internal::sin, internal::cos,
+  *  - internal::conj, internal::real, internal::imag, numext::abs2.
+  *
+  * AutoDiffScalar can be used as the scalar type of an Eigen::Matrix object. However,
+  * in that case, the expression template mechanism only occurs at the top Matrix level,
+  * while derivatives are computed right away.
+  *
+  */
+
+template<typename _DerType>
+class AutoDiffScalar
+  : public internal::auto_diff_special_op
+            <_DerType, !internal::is_same<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar,
+                                        typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value>
+{
+  public:
+    typedef internal::auto_diff_special_op
+            <_DerType, !internal::is_same<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar,
+                       typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value> Base;
+    typedef typename internal::remove_all<_DerType>::type DerType;
+    typedef typename internal::traits<DerType>::Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real Real;
+
+    using Base::operator+;
+    using Base::operator*;
+
+    /** Default constructor without any initialization. */
+    AutoDiffScalar() {}
+
+    /** Constructs an active scalar from its \a value,
+        and initializes the \a nbDer derivatives such that it corresponds to the \a derNumber -th variable */
+    AutoDiffScalar(const Scalar& value, int nbDer, int derNumber)
+      : m_value(value), m_derivatives(DerType::Zero(nbDer))
+    {
+      m_derivatives.coeffRef(derNumber) = Scalar(1);
+    }
+
+    /** Conversion from a scalar constant to an active scalar.
+      * The derivatives are set to zero. */
+    /*explicit*/ AutoDiffScalar(const Real& value)
+      : m_value(value)
+    {
+      if(m_derivatives.size()>0)
+        m_derivatives.setZero();
+    }
+
+    /** Constructs an active scalar from its \a value and derivatives \a der */
+    AutoDiffScalar(const Scalar& value, const DerType& der)
+      : m_value(value), m_derivatives(der)
+    {}
+
+    template<typename OtherDerType>
+    AutoDiffScalar(const AutoDiffScalar<OtherDerType>& other)
+      : m_value(other.value()), m_derivatives(other.derivatives())
+    {}
+
+    friend  std::ostream & operator << (std::ostream & s, const AutoDiffScalar& a)
+    {
+      return s << a.value();
+    }
+
+    AutoDiffScalar(const AutoDiffScalar& other)
+      : m_value(other.value()), m_derivatives(other.derivatives())
+    {}
+
+    template<typename OtherDerType>
+    inline AutoDiffScalar& operator=(const AutoDiffScalar<OtherDerType>& other)
+    {
+      m_value = other.value();
+      m_derivatives = other.derivatives();
+      return *this;
+    }
+
+    inline AutoDiffScalar& operator=(const AutoDiffScalar& other)
+    {
+      m_value = other.value();
+      m_derivatives = other.derivatives();
+      return *this;
+    }
+
+//     inline operator const Scalar& () const { return m_value; }
+//     inline operator Scalar& () { return m_value; }
+
+    inline const Scalar& value() const { return m_value; }
+    inline Scalar& value() { return m_value; }
+
+    inline const DerType& derivatives() const { return m_derivatives; }
+    inline DerType& derivatives() { return m_derivatives; }
+
+    inline bool operator< (const Scalar& other) const  { return m_value <  other; }
+    inline bool operator<=(const Scalar& other) const  { return m_value <= other; }
+    inline bool operator> (const Scalar& other) const  { return m_value >  other; }
+    inline bool operator>=(const Scalar& other) const  { return m_value >= other; }
+    inline bool operator==(const Scalar& other) const  { return m_value == other; }
+    inline bool operator!=(const Scalar& other) const  { return m_value != other; }
+
+    friend inline bool operator< (const Scalar& a, const AutoDiffScalar& b) { return a <  b.value(); }
+    friend inline bool operator<=(const Scalar& a, const AutoDiffScalar& b) { return a <= b.value(); }
+    friend inline bool operator> (const Scalar& a, const AutoDiffScalar& b) { return a >  b.value(); }
+    friend inline bool operator>=(const Scalar& a, const AutoDiffScalar& b) { return a >= b.value(); }
+    friend inline bool operator==(const Scalar& a, const AutoDiffScalar& b) { return a == b.value(); }
+    friend inline bool operator!=(const Scalar& a, const AutoDiffScalar& b) { return a != b.value(); }
+
+    template<typename OtherDerType> inline bool operator< (const AutoDiffScalar<OtherDerType>& b) const  { return m_value <  b.value(); }
+    template<typename OtherDerType> inline bool operator<=(const AutoDiffScalar<OtherDerType>& b) const  { return m_value <= b.value(); }
+    template<typename OtherDerType> inline bool operator> (const AutoDiffScalar<OtherDerType>& b) const  { return m_value >  b.value(); }
+    template<typename OtherDerType> inline bool operator>=(const AutoDiffScalar<OtherDerType>& b) const  { return m_value >= b.value(); }
+    template<typename OtherDerType> inline bool operator==(const AutoDiffScalar<OtherDerType>& b) const  { return m_value == b.value(); }
+    template<typename OtherDerType> inline bool operator!=(const AutoDiffScalar<OtherDerType>& b) const  { return m_value != b.value(); }
+
+    inline const AutoDiffScalar<DerType&> operator+(const Scalar& other) const
+    {
+      return AutoDiffScalar<DerType&>(m_value + other, m_derivatives);
+    }
+
+    friend inline const AutoDiffScalar<DerType&> operator+(const Scalar& a, const AutoDiffScalar& b)
+    {
+      return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
+    }
+
+//     inline const AutoDiffScalar<DerType&> operator+(const Real& other) const
+//     {
+//       return AutoDiffScalar<DerType&>(m_value + other, m_derivatives);
+//     }
+
+//     friend inline const AutoDiffScalar<DerType&> operator+(const Real& a, const AutoDiffScalar& b)
+//     {
+//       return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
+//     }
+
+    inline AutoDiffScalar& operator+=(const Scalar& other)
+    {
+      value() += other;
+      return *this;
+    }
+
+    template<typename OtherDerType>
+    inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,const DerType,const typename internal::remove_all<OtherDerType>::type> >
+    operator+(const AutoDiffScalar<OtherDerType>& other) const
+    {
+      internal::make_coherent(m_derivatives, other.derivatives());
+      return AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,const DerType,const typename internal::remove_all<OtherDerType>::type> >(
+        m_value + other.value(),
+        m_derivatives + other.derivatives());
+    }
+
+    template<typename OtherDerType>
+    inline AutoDiffScalar&
+    operator+=(const AutoDiffScalar<OtherDerType>& other)
+    {
+      (*this) = (*this) + other;
+      return *this;
+    }
+
+    inline const AutoDiffScalar<DerType&> operator-(const Scalar& b) const
+    {
+      return AutoDiffScalar<DerType&>(m_value - b, m_derivatives);
+    }
+
+    friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >
+    operator-(const Scalar& a, const AutoDiffScalar& b)
+    {
+      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >
+            (a - b.value(), -b.derivatives());
+    }
+
+    inline AutoDiffScalar& operator-=(const Scalar& other)
+    {
+      value() -= other;
+      return *this;
+    }
+
+    template<typename OtherDerType>
+    inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_difference_op<Scalar>, const DerType,const typename internal::remove_all<OtherDerType>::type> >
+    operator-(const AutoDiffScalar<OtherDerType>& other) const
+    {
+      internal::make_coherent(m_derivatives, other.derivatives());
+      return AutoDiffScalar<CwiseBinaryOp<internal::scalar_difference_op<Scalar>, const DerType,const typename internal::remove_all<OtherDerType>::type> >(
+        m_value - other.value(),
+        m_derivatives - other.derivatives());
+    }
+
+    template<typename OtherDerType>
+    inline AutoDiffScalar&
+    operator-=(const AutoDiffScalar<OtherDerType>& other)
+    {
+      *this = *this - other;
+      return *this;
+    }
+
+    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >
+    operator-() const
+    {
+      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >(
+        -m_value,
+        -m_derivatives);
+    }
+
+    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+    operator*(const Scalar& other) const
+    {
+      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
+        m_value * other,
+        (m_derivatives * other));
+    }
+
+    friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+    operator*(const Scalar& other, const AutoDiffScalar& a)
+    {
+      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
+        a.value() * other,
+        a.derivatives() * other);
+    }
+
+//     inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+//     operator*(const Real& other) const
+//     {
+//       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+//         m_value * other,
+//         (m_derivatives * other));
+//     }
+//
+//     friend inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+//     operator*(const Real& other, const AutoDiffScalar& a)
+//     {
+//       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+//         a.value() * other,
+//         a.derivatives() * other);
+//     }
+
+    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+    operator/(const Scalar& other) const
+    {
+      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
+        m_value / other,
+        (m_derivatives * (Scalar(1)/other)));
+    }
+
+    friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+    operator/(const Scalar& other, const AutoDiffScalar& a)
+    {
+      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
+        other / a.value(),
+        a.derivatives() * (Scalar(-other) / (a.value()*a.value())));
+    }
+
+//     inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+//     operator/(const Real& other) const
+//     {
+//       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+//         m_value / other,
+//         (m_derivatives * (Real(1)/other)));
+//     }
+//
+//     friend inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+//     operator/(const Real& other, const AutoDiffScalar& a)
+//     {
+//       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+//         other / a.value(),
+//         a.derivatives() * (-Real(1)/other));
+//     }
+
+    template<typename OtherDerType>
+    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,
+        const CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
+          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
+          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >
+    operator/(const AutoDiffScalar<OtherDerType>& other) const
+    {
+      internal::make_coherent(m_derivatives, other.derivatives());
+      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,
+        const CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
+          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
+          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >(
+        m_value / other.value(),
+          ((m_derivatives * other.value()) - (m_value * other.derivatives()))
+        * (Scalar(1)/(other.value()*other.value())));
+    }
+
+    template<typename OtherDerType>
+    inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
+        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type> > >
+    operator*(const AutoDiffScalar<OtherDerType>& other) const
+    {
+      internal::make_coherent(m_derivatives, other.derivatives());
+      return AutoDiffScalar<const CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
+        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > >(
+        m_value * other.value(),
+        (m_derivatives * other.value()) + (m_value * other.derivatives()));
+    }
+
+    inline AutoDiffScalar& operator*=(const Scalar& other)
+    {
+      *this = *this * other;
+      return *this;
+    }
+
+    template<typename OtherDerType>
+    inline AutoDiffScalar& operator*=(const AutoDiffScalar<OtherDerType>& other)
+    {
+      *this = *this * other;
+      return *this;
+    }
+
+    inline AutoDiffScalar& operator/=(const Scalar& other)
+    {
+      *this = *this / other;
+      return *this;
+    }
+
+    template<typename OtherDerType>
+    inline AutoDiffScalar& operator/=(const AutoDiffScalar<OtherDerType>& other)
+    {
+      *this = *this / other;
+      return *this;
+    }
+
+  protected:
+    Scalar m_value;
+    DerType m_derivatives;
+
+};
+
+namespace internal {
+
+template<typename _DerType>
+struct auto_diff_special_op<_DerType, true>
+//   : auto_diff_scalar_op<_DerType, typename NumTraits<Scalar>::Real,
+//                            is_same<Scalar,typename NumTraits<Scalar>::Real>::value>
+{
+  typedef typename remove_all<_DerType>::type DerType;
+  typedef typename traits<DerType>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real Real;
+
+//   typedef auto_diff_scalar_op<_DerType, typename NumTraits<Scalar>::Real,
+//                            is_same<Scalar,typename NumTraits<Scalar>::Real>::value> Base;
+
+//   using Base::operator+;
+//   using Base::operator+=;
+//   using Base::operator-;
+//   using Base::operator-=;
+//   using Base::operator*;
+//   using Base::operator*=;
+
+  const AutoDiffScalar<_DerType>& derived() const { return *static_cast<const AutoDiffScalar<_DerType>*>(this); }
+  AutoDiffScalar<_DerType>& derived() { return *static_cast<AutoDiffScalar<_DerType>*>(this); }
+
+
+  inline const AutoDiffScalar<DerType&> operator+(const Real& other) const
+  {
+    return AutoDiffScalar<DerType&>(derived().value() + other, derived().derivatives());
+  }
+
+  friend inline const AutoDiffScalar<DerType&> operator+(const Real& a, const AutoDiffScalar<_DerType>& b)
+  {
+    return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
+  }
+
+  inline AutoDiffScalar<_DerType>& operator+=(const Real& other)
+  {
+    derived().value() += other;
+    return derived();
+  }
+
+
+  inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >
+  operator*(const Real& other) const
+  {
+    return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >(
+      derived().value() * other,
+      derived().derivatives() * other);
+  }
+
+  friend inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >
+  operator*(const Real& other, const AutoDiffScalar<_DerType>& a)
+  {
+    return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >(
+      a.value() * other,
+      a.derivatives() * other);
+  }
+
+  inline AutoDiffScalar<_DerType>& operator*=(const Scalar& other)
+  {
+    *this = *this * other;
+    return derived();
+  }
+};
+
+template<typename _DerType>
+struct auto_diff_special_op<_DerType, false>
+{
+  void operator*() const;
+  void operator-() const;
+  void operator+() const;
+};
+
+template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols, typename B>
+struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>, B> {
+  typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> A;
+  static void run(A& a, B& b) {
+    if((A_Rows==Dynamic || A_Cols==Dynamic) && (a.size()==0))
+    {
+      a.resize(b.size());
+      a.setZero();
+    }
+  }
+};
+
+template<typename A, typename B_Scalar, int B_Rows, int B_Cols, int B_Options, int B_MaxRows, int B_MaxCols>
+struct make_coherent_impl<A, Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> > {
+  typedef Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> B;
+  static void run(A& a, B& b) {
+    if((B_Rows==Dynamic || B_Cols==Dynamic) && (b.size()==0))
+    {
+      b.resize(a.size());
+      b.setZero();
+    }
+  }
+};
+
+template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols,
+         typename B_Scalar, int B_Rows, int B_Cols, int B_Options, int B_MaxRows, int B_MaxCols>
+struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>,
+                             Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> > {
+  typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> A;
+  typedef Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> B;
+  static void run(A& a, B& b) {
+    if((A_Rows==Dynamic || A_Cols==Dynamic) && (a.size()==0))
+    {
+      a.resize(b.size());
+      a.setZero();
+    }
+    else if((B_Rows==Dynamic || B_Cols==Dynamic) && (b.size()==0))
+    {
+      b.resize(a.size());
+      b.setZero();
+    }
+  }
+};
+
+template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols>
+struct scalar_product_traits<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>,A_Scalar>
+{
+  enum { Defined = 1 };
+  typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType;
+};
+
+template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols>
+struct scalar_product_traits<A_Scalar, Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> >
+{
+  enum { Defined = 1 };
+  typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType;
+};
+
+template<typename DerType>
+struct scalar_product_traits<AutoDiffScalar<DerType>,typename DerType::Scalar>
+{
+  enum { Defined = 1 };
+  typedef AutoDiffScalar<DerType> ReturnType;
+};
+
+template<typename DerType>
+struct scalar_product_traits<typename DerType::Scalar,AutoDiffScalar<DerType> >
+{
+  enum { Defined = 1 };
+  typedef AutoDiffScalar<DerType> ReturnType;
+};
+
+} // end namespace internal
+
+#define EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(FUNC,CODE) \
+  template<typename DerType> \
+  inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > \
+  FUNC(const Eigen::AutoDiffScalar<DerType>& x) { \
+    using namespace Eigen; \
+    typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
+    typedef AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > ReturnType; \
+    CODE; \
+  }
+
+template<typename DerType>
+inline const AutoDiffScalar<DerType>& conj(const AutoDiffScalar<DerType>& x)  { return x; }
+template<typename DerType>
+inline const AutoDiffScalar<DerType>& real(const AutoDiffScalar<DerType>& x)  { return x; }
+template<typename DerType>
+inline typename DerType::Scalar imag(const AutoDiffScalar<DerType>&)    { return 0.; }
+template<typename DerType, typename T>
+inline AutoDiffScalar<DerType> (min)(const AutoDiffScalar<DerType>& x, const T& y)    { return (x <= y ? x : y); }
+template<typename DerType, typename T>
+inline AutoDiffScalar<DerType> (max)(const AutoDiffScalar<DerType>& x, const T& y)    { return (x >= y ? x : y); }
+template<typename DerType, typename T>
+inline AutoDiffScalar<DerType> (min)(const T& x, const AutoDiffScalar<DerType>& y)    { return (x < y ? x : y); }
+template<typename DerType, typename T>
+inline AutoDiffScalar<DerType> (max)(const T& x, const AutoDiffScalar<DerType>& y)    { return (x > y ? x : y); }
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs,
+  using std::abs;
+  return ReturnType(abs(x.value()), x.derivatives() * (x.value()<0 ? -1 : 1) );)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs2,
+  using numext::abs2;
+  return ReturnType(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sqrt,
+  using std::sqrt;
+  Scalar sqrtx = sqrt(x.value());
+  return ReturnType(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cos,
+  using std::cos;
+  using std::sin;
+  return ReturnType(cos(x.value()), x.derivatives() * (-sin(x.value())));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sin,
+  using std::sin;
+  using std::cos;
+  return ReturnType(sin(x.value()),x.derivatives() * cos(x.value()));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(exp,
+  using std::exp;
+  Scalar expx = exp(x.value());
+  return ReturnType(expx,x.derivatives() * expx);)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(log,
+  using std::log;
+  return ReturnType(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));)
+
+template<typename DerType>
+inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename Eigen::internal::traits<DerType>::Scalar>, const DerType> >
+pow(const Eigen::AutoDiffScalar<DerType>& x, typename Eigen::internal::traits<DerType>::Scalar y)
+{
+  using namespace Eigen;
+  typedef typename Eigen::internal::traits<DerType>::Scalar Scalar;
+  return AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const DerType> >(
+    std::pow(x.value(),y),
+    x.derivatives() * (y * std::pow(x.value(),y-1)));
+}
+
+
+template<typename DerTypeA,typename DerTypeB>
+inline const AutoDiffScalar<Matrix<typename internal::traits<DerTypeA>::Scalar,Dynamic,1> >
+atan2(const AutoDiffScalar<DerTypeA>& a, const AutoDiffScalar<DerTypeB>& b)
+{
+  using std::atan2;
+  using std::max;
+  typedef typename internal::traits<DerTypeA>::Scalar Scalar;
+  typedef AutoDiffScalar<Matrix<Scalar,Dynamic,1> > PlainADS;
+  PlainADS ret;
+  ret.value() = atan2(a.value(), b.value());
+  
+  Scalar tmp2 = a.value() * a.value();
+  Scalar tmp3 = b.value() * b.value();
+  Scalar tmp4 = tmp3/(tmp2+tmp3);
+  
+  if (tmp4!=0)
+    ret.derivatives() = (a.derivatives() * b.value() - a.value() * b.derivatives()) * (tmp2+tmp3);
+
+  return ret;
+}
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tan,
+  using std::tan;
+  using std::cos;
+  return ReturnType(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(asin,
+  using std::sqrt;
+  using std::asin;
+  return ReturnType(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));)
+  
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(acos,
+  using std::sqrt;
+  using std::acos;
+  return ReturnType(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));)
+
+#undef EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY
+
+template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> >
+  : NumTraits< typename NumTraits<typename DerType::Scalar>::Real >
+{
+  typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerType::Scalar>::Real,DerType::RowsAtCompileTime,DerType::ColsAtCompileTime> > Real;
+  typedef AutoDiffScalar<DerType> NonInteger;
+  typedef AutoDiffScalar<DerType>& Nested;
+  enum{
+    RequireInitialization = 1
+  };
+};
+
+}
+
+#endif // EIGEN_AUTODIFF_SCALAR_H
diff --git a/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h b/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h
new file mode 100644
index 0000000..8c2d048
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h
@@ -0,0 +1,220 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_AUTODIFF_VECTOR_H
+#define EIGEN_AUTODIFF_VECTOR_H
+
+namespace Eigen {
+
+/* \class AutoDiffScalar
+  * \brief A scalar type replacement with automatic differentation capability
+  *
+  * \param DerType the vector type used to store/represent the derivatives (e.g. Vector3f)
+  *
+  * This class represents a scalar value while tracking its respective derivatives.
+  *
+  * It supports the following list of global math function:
+  *  - std::abs, std::sqrt, std::pow, std::exp, std::log, std::sin, std::cos,
+  *  - internal::abs, internal::sqrt, numext::pow, internal::exp, internal::log, internal::sin, internal::cos,
+  *  - internal::conj, internal::real, internal::imag, numext::abs2.
+  *
+  * AutoDiffScalar can be used as the scalar type of an Eigen::Matrix object. However,
+  * in that case, the expression template mechanism only occurs at the top Matrix level,
+  * while derivatives are computed right away.
+  *
+  */
+template<typename ValueType, typename JacobianType>
+class AutoDiffVector
+{
+  public:
+    //typedef typename internal::traits<ValueType>::Scalar Scalar;
+    typedef typename internal::traits<ValueType>::Scalar BaseScalar;
+    typedef AutoDiffScalar<Matrix<BaseScalar,JacobianType::RowsAtCompileTime,1> > ActiveScalar;
+    typedef ActiveScalar Scalar;
+    typedef AutoDiffScalar<typename JacobianType::ColXpr> CoeffType;
+    typedef typename JacobianType::Index Index;
+
+    inline AutoDiffVector() {}
+
+    inline AutoDiffVector(const ValueType& values)
+      : m_values(values)
+    {
+      m_jacobian.setZero();
+    }
+
+
+    CoeffType operator[] (Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
+    const CoeffType operator[] (Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
+
+    CoeffType operator() (Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
+    const CoeffType operator() (Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
+
+    CoeffType coeffRef(Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
+    const CoeffType coeffRef(Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
+
+    Index size() const { return m_values.size(); }
+
+    // FIXME here we could return an expression of the sum
+    Scalar sum() const { /*std::cerr << "sum \n\n";*/ /*std::cerr << m_jacobian.rowwise().sum() << "\n\n";*/ return Scalar(m_values.sum(), m_jacobian.rowwise().sum()); }
+
+
+    inline AutoDiffVector(const ValueType& values, const JacobianType& jac)
+      : m_values(values), m_jacobian(jac)
+    {}
+
+    template<typename OtherValueType, typename OtherJacobianType>
+    inline AutoDiffVector(const AutoDiffVector<OtherValueType, OtherJacobianType>& other)
+      : m_values(other.values()), m_jacobian(other.jacobian())
+    {}
+
+    inline AutoDiffVector(const AutoDiffVector& other)
+      : m_values(other.values()), m_jacobian(other.jacobian())
+    {}
+
+    template<typename OtherValueType, typename OtherJacobianType>
+    inline AutoDiffVector& operator=(const AutoDiffVector<OtherValueType, OtherJacobianType>& other)
+    {
+      m_values = other.values();
+      m_jacobian = other.jacobian();
+      return *this;
+    }
+
+    inline AutoDiffVector& operator=(const AutoDiffVector& other)
+    {
+      m_values = other.values();
+      m_jacobian = other.jacobian();
+      return *this;
+    }
+
+    inline const ValueType& values() const { return m_values; }
+    inline ValueType& values() { return m_values; }
+
+    inline const JacobianType& jacobian() const { return m_jacobian; }
+    inline JacobianType& jacobian() { return m_jacobian; }
+
+    template<typename OtherValueType,typename OtherJacobianType>
+    inline const AutoDiffVector<
+      typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,ValueType,OtherValueType>::Type,
+      typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,JacobianType,OtherJacobianType>::Type >
+    operator+(const AutoDiffVector<OtherValueType,OtherJacobianType>& other) const
+    {
+      return AutoDiffVector<
+      typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,ValueType,OtherValueType>::Type,
+      typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,JacobianType,OtherJacobianType>::Type >(
+        m_values + other.values(),
+        m_jacobian + other.jacobian());
+    }
+
+    template<typename OtherValueType, typename OtherJacobianType>
+    inline AutoDiffVector&
+    operator+=(const AutoDiffVector<OtherValueType,OtherJacobianType>& other)
+    {
+      m_values += other.values();
+      m_jacobian += other.jacobian();
+      return *this;
+    }
+
+    template<typename OtherValueType,typename OtherJacobianType>
+    inline const AutoDiffVector<
+      typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,ValueType,OtherValueType>::Type,
+      typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,JacobianType,OtherJacobianType>::Type >
+    operator-(const AutoDiffVector<OtherValueType,OtherJacobianType>& other) const
+    {
+      return AutoDiffVector<
+        typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,ValueType,OtherValueType>::Type,
+        typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,JacobianType,OtherJacobianType>::Type >(
+          m_values - other.values(),
+          m_jacobian - other.jacobian());
+    }
+
+    template<typename OtherValueType, typename OtherJacobianType>
+    inline AutoDiffVector&
+    operator-=(const AutoDiffVector<OtherValueType,OtherJacobianType>& other)
+    {
+      m_values -= other.values();
+      m_jacobian -= other.jacobian();
+      return *this;
+    }
+
+    inline const AutoDiffVector<
+      typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, ValueType>::Type,
+      typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, JacobianType>::Type >
+    operator-() const
+    {
+      return AutoDiffVector<
+        typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, ValueType>::Type,
+        typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, JacobianType>::Type >(
+          -m_values,
+          -m_jacobian);
+    }
+
+    inline const AutoDiffVector<
+      typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+      typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type>
+    operator*(const BaseScalar& other) const
+    {
+      return AutoDiffVector<
+        typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+        typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type >(
+          m_values * other,
+          m_jacobian * other);
+    }
+
+    friend inline const AutoDiffVector<
+      typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+      typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type >
+    operator*(const Scalar& other, const AutoDiffVector& v)
+    {
+      return AutoDiffVector<
+        typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+        typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type >(
+          v.values() * other,
+          v.jacobian() * other);
+    }
+
+//     template<typename OtherValueType,typename OtherJacobianType>
+//     inline const AutoDiffVector<
+//       CwiseBinaryOp<internal::scalar_multiple_op<Scalar>, ValueType, OtherValueType>
+//       CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+//         CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>,
+//         CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, OtherJacobianType> > >
+//     operator*(const AutoDiffVector<OtherValueType,OtherJacobianType>& other) const
+//     {
+//       return AutoDiffVector<
+//         CwiseBinaryOp<internal::scalar_multiple_op<Scalar>, ValueType, OtherValueType>
+//         CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+//           CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>,
+//           CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, OtherJacobianType> > >(
+//             m_values.cwise() * other.values(),
+//             (m_jacobian * other.values()) + (m_values * other.jacobian()));
+//     }
+
+    inline AutoDiffVector& operator*=(const Scalar& other)
+    {
+      m_values *= other;
+      m_jacobian *= other;
+      return *this;
+    }
+
+    template<typename OtherValueType,typename OtherJacobianType>
+    inline AutoDiffVector& operator*=(const AutoDiffVector<OtherValueType,OtherJacobianType>& other)
+    {
+      *this = *this * other;
+      return *this;
+    }
+
+  protected:
+    ValueType m_values;
+    JacobianType m_jacobian;
+
+};
+
+}
+
+#endif // EIGEN_AUTODIFF_VECTOR_H
diff --git a/include/eigen3/unsupported/Eigen/src/BVH/BVAlgorithms.h b/include/eigen3/unsupported/Eigen/src/BVH/BVAlgorithms.h
new file mode 100644
index 0000000..994c8af
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/BVH/BVAlgorithms.h
@@ -0,0 +1,293 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Ilya Baran <ibaran at mit.edu>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BVALGORITHMS_H
+#define EIGEN_BVALGORITHMS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename BVH, typename Intersector>
+bool intersect_helper(const BVH &tree, Intersector &intersector, typename BVH::Index root)
+{
+  typedef typename BVH::Index Index;
+  typedef typename BVH::VolumeIterator VolIter;
+  typedef typename BVH::ObjectIterator ObjIter;
+
+  VolIter vBegin = VolIter(), vEnd = VolIter();
+  ObjIter oBegin = ObjIter(), oEnd = ObjIter();
+
+  std::vector<Index> todo(1, root);
+
+  while(!todo.empty()) {
+    tree.getChildren(todo.back(), vBegin, vEnd, oBegin, oEnd);
+    todo.pop_back();
+
+    for(; vBegin != vEnd; ++vBegin) //go through child volumes
+      if(intersector.intersectVolume(tree.getVolume(*vBegin)))
+        todo.push_back(*vBegin);
+
+    for(; oBegin != oEnd; ++oBegin) //go through child objects
+      if(intersector.intersectObject(*oBegin))
+        return true; //intersector said to stop query
+  }
+  return false;
+}
+#endif //not EIGEN_PARSED_BY_DOXYGEN
+
+template<typename Volume1, typename Object1, typename Object2, typename Intersector>
+struct intersector_helper1
+{
+  intersector_helper1(const Object2 &inStored, Intersector &in) : stored(inStored), intersector(in) {}
+  bool intersectVolume(const Volume1 &vol) { return intersector.intersectVolumeObject(vol, stored); }
+  bool intersectObject(const Object1 &obj) { return intersector.intersectObjectObject(obj, stored); }
+  Object2 stored;
+  Intersector &intersector;
+private:
+  intersector_helper1& operator=(const intersector_helper1&);
+};
+
+template<typename Volume2, typename Object2, typename Object1, typename Intersector>
+struct intersector_helper2
+{
+  intersector_helper2(const Object1 &inStored, Intersector &in) : stored(inStored), intersector(in) {}
+  bool intersectVolume(const Volume2 &vol) { return intersector.intersectObjectVolume(stored, vol); }
+  bool intersectObject(const Object2 &obj) { return intersector.intersectObjectObject(stored, obj); }
+  Object1 stored;
+  Intersector &intersector;
+private:
+  intersector_helper2& operator=(const intersector_helper2&);
+};
+
+} // end namespace internal
+
+/**  Given a BVH, runs the query encapsulated by \a intersector.
+  *  The Intersector type must provide the following members: \code
+     bool intersectVolume(const BVH::Volume &volume) //returns true if volume intersects the query
+     bool intersectObject(const BVH::Object &object) //returns true if the search should terminate immediately
+  \endcode
+  */
+template<typename BVH, typename Intersector>
+void BVIntersect(const BVH &tree, Intersector &intersector)
+{
+  internal::intersect_helper(tree, intersector, tree.getRootIndex());
+}
+
+/**  Given two BVH's, runs the query on their Cartesian product encapsulated by \a intersector.
+  *  The Intersector type must provide the following members: \code
+     bool intersectVolumeVolume(const BVH1::Volume &v1, const BVH2::Volume &v2) //returns true if product of volumes intersects the query
+     bool intersectVolumeObject(const BVH1::Volume &v1, const BVH2::Object &o2) //returns true if the volume-object product intersects the query
+     bool intersectObjectVolume(const BVH1::Object &o1, const BVH2::Volume &v2) //returns true if the volume-object product intersects the query
+     bool intersectObjectObject(const BVH1::Object &o1, const BVH2::Object &o2) //returns true if the search should terminate immediately
+  \endcode
+  */
+template<typename BVH1, typename BVH2, typename Intersector>
+void BVIntersect(const BVH1 &tree1, const BVH2 &tree2, Intersector &intersector) //TODO: tandem descent when it makes sense
+{
+  typedef typename BVH1::Index Index1;
+  typedef typename BVH2::Index Index2;
+  typedef internal::intersector_helper1<typename BVH1::Volume, typename BVH1::Object, typename BVH2::Object, Intersector> Helper1;
+  typedef internal::intersector_helper2<typename BVH2::Volume, typename BVH2::Object, typename BVH1::Object, Intersector> Helper2;
+  typedef typename BVH1::VolumeIterator VolIter1;
+  typedef typename BVH1::ObjectIterator ObjIter1;
+  typedef typename BVH2::VolumeIterator VolIter2;
+  typedef typename BVH2::ObjectIterator ObjIter2;
+
+  VolIter1 vBegin1 = VolIter1(), vEnd1 = VolIter1();
+  ObjIter1 oBegin1 = ObjIter1(), oEnd1 = ObjIter1();
+  VolIter2 vBegin2 = VolIter2(), vEnd2 = VolIter2(), vCur2 = VolIter2();
+  ObjIter2 oBegin2 = ObjIter2(), oEnd2 = ObjIter2(), oCur2 = ObjIter2();
+
+  std::vector<std::pair<Index1, Index2> > todo(1, std::make_pair(tree1.getRootIndex(), tree2.getRootIndex()));
+
+  while(!todo.empty()) {
+    tree1.getChildren(todo.back().first, vBegin1, vEnd1, oBegin1, oEnd1);
+    tree2.getChildren(todo.back().second, vBegin2, vEnd2, oBegin2, oEnd2);
+    todo.pop_back();
+
+    for(; vBegin1 != vEnd1; ++vBegin1) { //go through child volumes of first tree
+      const typename BVH1::Volume &vol1 = tree1.getVolume(*vBegin1);
+      for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+        if(intersector.intersectVolumeVolume(vol1, tree2.getVolume(*vCur2)))
+          todo.push_back(std::make_pair(*vBegin1, *vCur2));
+      }
+
+      for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+        Helper1 helper(*oCur2, intersector);
+        if(internal::intersect_helper(tree1, helper, *vBegin1))
+          return; //intersector said to stop query
+      }
+    }
+
+    for(; oBegin1 != oEnd1; ++oBegin1) { //go through child objects of first tree
+      for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+        Helper2 helper(*oBegin1, intersector);
+        if(internal::intersect_helper(tree2, helper, *vCur2))
+          return; //intersector said to stop query
+      }
+
+      for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+        if(intersector.intersectObjectObject(*oBegin1, *oCur2))
+          return; //intersector said to stop query
+      }
+    }
+  }
+}
+
+namespace internal {
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename BVH, typename Minimizer>
+typename Minimizer::Scalar minimize_helper(const BVH &tree, Minimizer &minimizer, typename BVH::Index root, typename Minimizer::Scalar minimum)
+{
+  typedef typename Minimizer::Scalar Scalar;
+  typedef typename BVH::Index Index;
+  typedef std::pair<Scalar, Index> QueueElement; //first element is priority
+  typedef typename BVH::VolumeIterator VolIter;
+  typedef typename BVH::ObjectIterator ObjIter;
+
+  VolIter vBegin = VolIter(), vEnd = VolIter();
+  ObjIter oBegin = ObjIter(), oEnd = ObjIter();
+  std::priority_queue<QueueElement, std::vector<QueueElement>, std::greater<QueueElement> > todo; //smallest is at the top
+
+  todo.push(std::make_pair(Scalar(), root));
+
+  while(!todo.empty()) {
+    tree.getChildren(todo.top().second, vBegin, vEnd, oBegin, oEnd);
+    todo.pop();
+
+    for(; oBegin != oEnd; ++oBegin) //go through child objects
+      minimum = (std::min)(minimum, minimizer.minimumOnObject(*oBegin));
+
+    for(; vBegin != vEnd; ++vBegin) { //go through child volumes
+      Scalar val = minimizer.minimumOnVolume(tree.getVolume(*vBegin));
+      if(val < minimum)
+        todo.push(std::make_pair(val, *vBegin));
+    }
+  }
+
+  return minimum;
+}
+#endif //not EIGEN_PARSED_BY_DOXYGEN
+
+
+template<typename Volume1, typename Object1, typename Object2, typename Minimizer>
+struct minimizer_helper1
+{
+  typedef typename Minimizer::Scalar Scalar;
+  minimizer_helper1(const Object2 &inStored, Minimizer &m) : stored(inStored), minimizer(m) {}
+  Scalar minimumOnVolume(const Volume1 &vol) { return minimizer.minimumOnVolumeObject(vol, stored); }
+  Scalar minimumOnObject(const Object1 &obj) { return minimizer.minimumOnObjectObject(obj, stored); }
+  Object2 stored;
+  Minimizer &minimizer;
+private:
+  minimizer_helper1& operator=(const minimizer_helper1&);
+};
+
+template<typename Volume2, typename Object2, typename Object1, typename Minimizer>
+struct minimizer_helper2
+{
+  typedef typename Minimizer::Scalar Scalar;
+  minimizer_helper2(const Object1 &inStored, Minimizer &m) : stored(inStored), minimizer(m) {}
+  Scalar minimumOnVolume(const Volume2 &vol) { return minimizer.minimumOnObjectVolume(stored, vol); }
+  Scalar minimumOnObject(const Object2 &obj) { return minimizer.minimumOnObjectObject(stored, obj); }
+  Object1 stored;
+  Minimizer &minimizer;
+private:
+  minimizer_helper2& operator=(const minimizer_helper2&);
+};
+
+} // end namespace internal
+
+/**  Given a BVH, runs the query encapsulated by \a minimizer.
+  *  \returns the minimum value.
+  *  The Minimizer type must provide the following members: \code
+     typedef Scalar //the numeric type of what is being minimized--not necessarily the Scalar type of the BVH (if it has one)
+     Scalar minimumOnVolume(const BVH::Volume &volume)
+     Scalar minimumOnObject(const BVH::Object &object)
+  \endcode
+  */
+template<typename BVH, typename Minimizer>
+typename Minimizer::Scalar BVMinimize(const BVH &tree, Minimizer &minimizer)
+{
+  return internal::minimize_helper(tree, minimizer, tree.getRootIndex(), (std::numeric_limits<typename Minimizer::Scalar>::max)());
+}
+
+/**  Given two BVH's, runs the query on their cartesian product encapsulated by \a minimizer.
+  *  \returns the minimum value.
+  *  The Minimizer type must provide the following members: \code
+     typedef Scalar //the numeric type of what is being minimized--not necessarily the Scalar type of the BVH (if it has one)
+     Scalar minimumOnVolumeVolume(const BVH1::Volume &v1, const BVH2::Volume &v2)
+     Scalar minimumOnVolumeObject(const BVH1::Volume &v1, const BVH2::Object &o2)
+     Scalar minimumOnObjectVolume(const BVH1::Object &o1, const BVH2::Volume &v2)
+     Scalar minimumOnObjectObject(const BVH1::Object &o1, const BVH2::Object &o2)
+  \endcode
+  */
+template<typename BVH1, typename BVH2, typename Minimizer>
+typename Minimizer::Scalar BVMinimize(const BVH1 &tree1, const BVH2 &tree2, Minimizer &minimizer)
+{
+  typedef typename Minimizer::Scalar Scalar;
+  typedef typename BVH1::Index Index1;
+  typedef typename BVH2::Index Index2;
+  typedef internal::minimizer_helper1<typename BVH1::Volume, typename BVH1::Object, typename BVH2::Object, Minimizer> Helper1;
+  typedef internal::minimizer_helper2<typename BVH2::Volume, typename BVH2::Object, typename BVH1::Object, Minimizer> Helper2;
+  typedef std::pair<Scalar, std::pair<Index1, Index2> > QueueElement; //first element is priority
+  typedef typename BVH1::VolumeIterator VolIter1;
+  typedef typename BVH1::ObjectIterator ObjIter1;
+  typedef typename BVH2::VolumeIterator VolIter2;
+  typedef typename BVH2::ObjectIterator ObjIter2;
+
+  VolIter1 vBegin1 = VolIter1(), vEnd1 = VolIter1();
+  ObjIter1 oBegin1 = ObjIter1(), oEnd1 = ObjIter1();
+  VolIter2 vBegin2 = VolIter2(), vEnd2 = VolIter2(), vCur2 = VolIter2();
+  ObjIter2 oBegin2 = ObjIter2(), oEnd2 = ObjIter2(), oCur2 = ObjIter2();
+  std::priority_queue<QueueElement, std::vector<QueueElement>, std::greater<QueueElement> > todo; //smallest is at the top
+
+  Scalar minimum = (std::numeric_limits<Scalar>::max)();
+  todo.push(std::make_pair(Scalar(), std::make_pair(tree1.getRootIndex(), tree2.getRootIndex())));
+
+  while(!todo.empty()) {
+    tree1.getChildren(todo.top().second.first, vBegin1, vEnd1, oBegin1, oEnd1);
+    tree2.getChildren(todo.top().second.second, vBegin2, vEnd2, oBegin2, oEnd2);
+    todo.pop();
+
+    for(; oBegin1 != oEnd1; ++oBegin1) { //go through child objects of first tree
+      for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+        minimum = (std::min)(minimum, minimizer.minimumOnObjectObject(*oBegin1, *oCur2));
+      }
+
+      for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+        Helper2 helper(*oBegin1, minimizer);
+        minimum = (std::min)(minimum, internal::minimize_helper(tree2, helper, *vCur2, minimum));
+      }
+    }
+
+    for(; vBegin1 != vEnd1; ++vBegin1) { //go through child volumes of first tree
+      const typename BVH1::Volume &vol1 = tree1.getVolume(*vBegin1);
+
+      for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+        Helper1 helper(*oCur2, minimizer);
+        minimum = (std::min)(minimum, internal::minimize_helper(tree1, helper, *vBegin1, minimum));
+      }
+
+      for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+        Scalar val = minimizer.minimumOnVolumeVolume(vol1, tree2.getVolume(*vCur2));
+        if(val < minimum)
+          todo.push(std::make_pair(val, std::make_pair(*vBegin1, *vCur2)));
+      }
+    }
+  }
+  return minimum;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_BVALGORITHMS_H
diff --git a/include/eigen3/unsupported/Eigen/src/BVH/KdBVH.h b/include/eigen3/unsupported/Eigen/src/BVH/KdBVH.h
new file mode 100644
index 0000000..1b8d758
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/BVH/KdBVH.h
@@ -0,0 +1,222 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Ilya Baran <ibaran at mit.edu>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef KDBVH_H_INCLUDED
+#define KDBVH_H_INCLUDED
+
+namespace Eigen { 
+
+namespace internal {
+
+//internal pair class for the BVH--used instead of std::pair because of alignment
+template<typename Scalar, int Dim>
+struct vector_int_pair
+{
+EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar, Dim)
+  typedef Matrix<Scalar, Dim, 1> VectorType;
+
+  vector_int_pair(const VectorType &v, int i) : first(v), second(i) {}
+
+  VectorType first;
+  int second;
+};
+
+//these templates help the tree initializer get the bounding boxes either from a provided
+//iterator range or using bounding_box in a unified way
+template<typename ObjectList, typename VolumeList, typename BoxIter>
+struct get_boxes_helper {
+  void operator()(const ObjectList &objects, BoxIter boxBegin, BoxIter boxEnd, VolumeList &outBoxes)
+  {
+    outBoxes.insert(outBoxes.end(), boxBegin, boxEnd);
+    eigen_assert(outBoxes.size() == objects.size());
+  }
+};
+
+template<typename ObjectList, typename VolumeList>
+struct get_boxes_helper<ObjectList, VolumeList, int> {
+  void operator()(const ObjectList &objects, int, int, VolumeList &outBoxes)
+  {
+    outBoxes.reserve(objects.size());
+    for(int i = 0; i < (int)objects.size(); ++i)
+      outBoxes.push_back(bounding_box(objects[i]));
+  }
+};
+
+} // end namespace internal
+
+
+/** \class KdBVH
+ *  \brief A simple bounding volume hierarchy based on AlignedBox
+ *
+ *  \param _Scalar The underlying scalar type of the bounding boxes
+ *  \param _Dim The dimension of the space in which the hierarchy lives
+ *  \param _Object The object type that lives in the hierarchy.  It must have value semantics.  Either bounding_box(_Object) must
+ *                 be defined and return an AlignedBox<_Scalar, _Dim> or bounding boxes must be provided to the tree initializer.
+ *
+ *  This class provides a simple (as opposed to optimized) implementation of a bounding volume hierarchy analogous to a Kd-tree.
+ *  Given a sequence of objects, it computes their bounding boxes, constructs a Kd-tree of their centers
+ *  and builds a BVH with the structure of that Kd-tree.  When the elements of the tree are too expensive to be copied around,
+ *  it is useful for _Object to be a pointer.
+ */
+template<typename _Scalar, int _Dim, typename _Object> class KdBVH
+{
+public:
+  enum { Dim = _Dim };
+  typedef _Object Object;
+  typedef std::vector<Object, aligned_allocator<Object> > ObjectList;
+  typedef _Scalar Scalar;
+  typedef AlignedBox<Scalar, Dim> Volume;
+  typedef std::vector<Volume, aligned_allocator<Volume> > VolumeList;
+  typedef int Index;
+  typedef const int *VolumeIterator; //the iterators are just pointers into the tree's vectors
+  typedef const Object *ObjectIterator;
+
+  KdBVH() {}
+
+  /** Given an iterator range over \a Object references, constructs the BVH.  Requires that bounding_box(Object) return a Volume. */
+  template<typename Iter> KdBVH(Iter begin, Iter end) { init(begin, end, 0, 0); } //int is recognized by init as not being an iterator type
+
+  /** Given an iterator range over \a Object references and an iterator range over their bounding boxes, constructs the BVH */
+  template<typename OIter, typename BIter> KdBVH(OIter begin, OIter end, BIter boxBegin, BIter boxEnd) { init(begin, end, boxBegin, boxEnd); }
+
+  /** Given an iterator range over \a Object references, constructs the BVH, overwriting whatever is in there currently.
+    * Requires that bounding_box(Object) return a Volume. */
+  template<typename Iter> void init(Iter begin, Iter end) { init(begin, end, 0, 0); }
+
+  /** Given an iterator range over \a Object references and an iterator range over their bounding boxes,
+    * constructs the BVH, overwriting whatever is in there currently. */
+  template<typename OIter, typename BIter> void init(OIter begin, OIter end, BIter boxBegin, BIter boxEnd)
+  {
+    objects.clear();
+    boxes.clear();
+    children.clear();
+
+    objects.insert(objects.end(), begin, end);
+    int n = static_cast<int>(objects.size());
+
+    if(n < 2)
+      return; //if we have at most one object, we don't need any internal nodes
+
+    VolumeList objBoxes;
+    VIPairList objCenters;
+
+    //compute the bounding boxes depending on BIter type
+    internal::get_boxes_helper<ObjectList, VolumeList, BIter>()(objects, boxBegin, boxEnd, objBoxes);
+
+    objCenters.reserve(n);
+    boxes.reserve(n - 1);
+    children.reserve(2 * n - 2);
+
+    for(int i = 0; i < n; ++i)
+      objCenters.push_back(VIPair(objBoxes[i].center(), i));
+
+    build(objCenters, 0, n, objBoxes, 0); //the recursive part of the algorithm
+
+    ObjectList tmp(n);
+    tmp.swap(objects);
+    for(int i = 0; i < n; ++i)
+      objects[i] = tmp[objCenters[i].second];
+  }
+
+  /** \returns the index of the root of the hierarchy */
+  inline Index getRootIndex() const { return (int)boxes.size() - 1; }
+
+  /** Given an \a index of a node, on exit, \a outVBegin and \a outVEnd range over the indices of the volume children of the node
+    * and \a outOBegin and \a outOEnd range over the object children of the node */
+  EIGEN_STRONG_INLINE void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd,
+                                       ObjectIterator &outOBegin, ObjectIterator &outOEnd) const
+  { //inlining this function should open lots of optimization opportunities to the compiler
+    if(index < 0) {
+      outVBegin = outVEnd;
+      if(!objects.empty())
+        outOBegin = &(objects[0]);
+      outOEnd = outOBegin + objects.size(); //output all objects--necessary when the tree has only one object
+      return;
+    }
+
+    int numBoxes = static_cast<int>(boxes.size());
+
+    int idx = index * 2;
+    if(children[idx + 1] < numBoxes) { //second index is always bigger
+      outVBegin = &(children[idx]);
+      outVEnd = outVBegin + 2;
+      outOBegin = outOEnd;
+    }
+    else if(children[idx] >= numBoxes) { //if both children are objects
+      outVBegin = outVEnd;
+      outOBegin = &(objects[children[idx] - numBoxes]);
+      outOEnd = outOBegin + 2;
+    } else { //if the first child is a volume and the second is an object
+      outVBegin = &(children[idx]);
+      outVEnd = outVBegin + 1;
+      outOBegin = &(objects[children[idx + 1] - numBoxes]);
+      outOEnd = outOBegin + 1;
+    }
+  }
+
+  /** \returns the bounding box of the node at \a index */
+  inline const Volume &getVolume(Index index) const
+  {
+    return boxes[index];
+  }
+
+private:
+  typedef internal::vector_int_pair<Scalar, Dim> VIPair;
+  typedef std::vector<VIPair, aligned_allocator<VIPair> > VIPairList;
+  typedef Matrix<Scalar, Dim, 1> VectorType;
+  struct VectorComparator //compares vectors, or, more specificall, VIPairs along a particular dimension
+  {
+    VectorComparator(int inDim) : dim(inDim) {}
+    inline bool operator()(const VIPair &v1, const VIPair &v2) const { return v1.first[dim] < v2.first[dim]; }
+    int dim;
+  };
+
+  //Build the part of the tree between objects[from] and objects[to] (not including objects[to]).
+  //This routine partitions the objCenters in [from, to) along the dimension dim, recursively constructs
+  //the two halves, and adds their parent node.  TODO: a cache-friendlier layout
+  void build(VIPairList &objCenters, int from, int to, const VolumeList &objBoxes, int dim)
+  {
+    eigen_assert(to - from > 1);
+    if(to - from == 2) {
+      boxes.push_back(objBoxes[objCenters[from].second].merged(objBoxes[objCenters[from + 1].second]));
+      children.push_back(from + (int)objects.size() - 1); //there are objects.size() - 1 tree nodes
+      children.push_back(from + (int)objects.size());
+    }
+    else if(to - from == 3) {
+      int mid = from + 2;
+      std::nth_element(objCenters.begin() + from, objCenters.begin() + mid,
+                        objCenters.begin() + to, VectorComparator(dim)); //partition
+      build(objCenters, from, mid, objBoxes, (dim + 1) % Dim);
+      int idx1 = (int)boxes.size() - 1;
+      boxes.push_back(boxes[idx1].merged(objBoxes[objCenters[mid].second]));
+      children.push_back(idx1);
+      children.push_back(mid + (int)objects.size() - 1);
+    }
+    else {
+      int mid = from + (to - from) / 2;
+      nth_element(objCenters.begin() + from, objCenters.begin() + mid,
+                  objCenters.begin() + to, VectorComparator(dim)); //partition
+      build(objCenters, from, mid, objBoxes, (dim + 1) % Dim);
+      int idx1 = (int)boxes.size() - 1;
+      build(objCenters, mid, to, objBoxes, (dim + 1) % Dim);
+      int idx2 = (int)boxes.size() - 1;
+      boxes.push_back(boxes[idx1].merged(boxes[idx2]));
+      children.push_back(idx1);
+      children.push_back(idx2);
+    }
+  }
+
+  std::vector<int> children; //children of x are children[2x] and children[2x+1], indices bigger than boxes.size() index into objects.
+  VolumeList boxes;
+  ObjectList objects;
+};
+
+} // end namespace Eigen
+
+#endif //KDBVH_H_INCLUDED
diff --git a/include/eigen3/unsupported/Eigen/src/FFT/ei_fftw_impl.h b/include/eigen3/unsupported/Eigen/src/FFT/ei_fftw_impl.h
new file mode 100644
index 0000000..d49aa17
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/FFT/ei_fftw_impl.h
@@ -0,0 +1,261 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. 
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+namespace Eigen { 
+
+namespace internal {
+
+  // FFTW uses non-const arguments
+  // so we must use ugly const_cast calls for all the args it uses
+  //
+  // This should be safe as long as 
+  // 1. we use FFTW_ESTIMATE for all our planning
+  //       see the FFTW docs section 4.3.2 "Planner Flags"
+  // 2. fftw_complex is compatible with std::complex
+  //    This assumes std::complex<T> layout is array of size 2 with real,imag
+  template <typename T> 
+  inline 
+  T * fftw_cast(const T* p)
+  { 
+      return const_cast<T*>( p); 
+  }
+
+  inline 
+  fftw_complex * fftw_cast( const std::complex<double> * p)
+  {
+      return const_cast<fftw_complex*>( reinterpret_cast<const fftw_complex*>(p) ); 
+  }
+
+  inline 
+  fftwf_complex * fftw_cast( const std::complex<float> * p)
+  { 
+      return const_cast<fftwf_complex*>( reinterpret_cast<const fftwf_complex*>(p) ); 
+  }
+
+  inline 
+  fftwl_complex * fftw_cast( const std::complex<long double> * p)
+  { 
+      return const_cast<fftwl_complex*>( reinterpret_cast<const fftwl_complex*>(p) ); 
+  }
+
+  template <typename T> 
+  struct fftw_plan {};
+
+  template <> 
+  struct fftw_plan<float>
+  {
+      typedef float scalar_type;
+      typedef fftwf_complex complex_type;
+      fftwf_plan m_plan;
+      fftw_plan() :m_plan(NULL) {}
+      ~fftw_plan() {if (m_plan) fftwf_destroy_plan(m_plan);}
+
+      inline
+      void fwd(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwf_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftwf_execute_dft( m_plan, src,dst);
+      }
+      inline
+      void inv(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwf_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftwf_execute_dft( m_plan, src,dst);
+      }
+      inline
+      void fwd(complex_type * dst,scalar_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwf_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftwf_execute_dft_r2c( m_plan,src,dst);
+      }
+      inline
+      void inv(scalar_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL)
+              m_plan = fftwf_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftwf_execute_dft_c2r( m_plan, src,dst);
+      }
+
+      inline 
+      void fwd2( complex_type * dst,complex_type * src,int n0,int n1) {
+          if (m_plan==NULL) m_plan = fftwf_plan_dft_2d(n0,n1,src,dst,FFTW_FORWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftwf_execute_dft( m_plan, src,dst);
+      }
+      inline 
+      void inv2( complex_type * dst,complex_type * src,int n0,int n1) {
+          if (m_plan==NULL) m_plan = fftwf_plan_dft_2d(n0,n1,src,dst,FFTW_BACKWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftwf_execute_dft( m_plan, src,dst);
+      }
+
+  };
+  template <> 
+  struct fftw_plan<double>
+  {
+      typedef double scalar_type;
+      typedef fftw_complex complex_type;
+      ::fftw_plan m_plan;
+      fftw_plan() :m_plan(NULL) {}
+      ~fftw_plan() {if (m_plan) fftw_destroy_plan(m_plan);}
+
+      inline
+      void fwd(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftw_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftw_execute_dft( m_plan, src,dst);
+      }
+      inline
+      void inv(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftw_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftw_execute_dft( m_plan, src,dst);
+      }
+      inline
+      void fwd(complex_type * dst,scalar_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftw_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftw_execute_dft_r2c( m_plan,src,dst);
+      }
+      inline
+      void inv(scalar_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL)
+              m_plan = fftw_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftw_execute_dft_c2r( m_plan, src,dst);
+      }
+      inline 
+      void fwd2( complex_type * dst,complex_type * src,int n0,int n1) {
+          if (m_plan==NULL) m_plan = fftw_plan_dft_2d(n0,n1,src,dst,FFTW_FORWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftw_execute_dft( m_plan, src,dst);
+      }
+      inline 
+      void inv2( complex_type * dst,complex_type * src,int n0,int n1) {
+          if (m_plan==NULL) m_plan = fftw_plan_dft_2d(n0,n1,src,dst,FFTW_BACKWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftw_execute_dft( m_plan, src,dst);
+      }
+  };
+  template <> 
+  struct fftw_plan<long double>
+  {
+      typedef long double scalar_type;
+      typedef fftwl_complex complex_type;
+      fftwl_plan m_plan;
+      fftw_plan() :m_plan(NULL) {}
+      ~fftw_plan() {if (m_plan) fftwl_destroy_plan(m_plan);}
+
+      inline
+      void fwd(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwl_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftwl_execute_dft( m_plan, src,dst);
+      }
+      inline
+      void inv(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwl_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftwl_execute_dft( m_plan, src,dst);
+      }
+      inline
+      void fwd(complex_type * dst,scalar_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwl_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftwl_execute_dft_r2c( m_plan,src,dst);
+      }
+      inline
+      void inv(scalar_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL)
+              m_plan = fftwl_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftwl_execute_dft_c2r( m_plan, src,dst);
+      }
+      inline 
+      void fwd2( complex_type * dst,complex_type * src,int n0,int n1) {
+          if (m_plan==NULL) m_plan = fftwl_plan_dft_2d(n0,n1,src,dst,FFTW_FORWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftwl_execute_dft( m_plan, src,dst);
+      }
+      inline 
+      void inv2( complex_type * dst,complex_type * src,int n0,int n1) {
+          if (m_plan==NULL) m_plan = fftwl_plan_dft_2d(n0,n1,src,dst,FFTW_BACKWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+          fftwl_execute_dft( m_plan, src,dst);
+      }
+  };
+
+  template <typename _Scalar>
+  struct fftw_impl
+  {
+      typedef _Scalar Scalar;
+      typedef std::complex<Scalar> Complex;
+
+      inline
+      void clear() 
+      {
+        m_plans.clear();
+      }
+
+      // complex-to-complex forward FFT
+      inline
+      void fwd( Complex * dst,const Complex *src,int nfft)
+      {
+        get_plan(nfft,false,dst,src).fwd(fftw_cast(dst), fftw_cast(src),nfft );
+      }
+
+      // real-to-complex forward FFT
+      inline
+      void fwd( Complex * dst,const Scalar * src,int nfft) 
+      {
+          get_plan(nfft,false,dst,src).fwd(fftw_cast(dst), fftw_cast(src) ,nfft);
+      }
+
+      // 2-d complex-to-complex
+      inline
+      void fwd2(Complex * dst, const Complex * src, int n0,int n1)
+      {
+          get_plan(n0,n1,false,dst,src).fwd2(fftw_cast(dst), fftw_cast(src) ,n0,n1);
+      }
+
+      // inverse complex-to-complex
+      inline
+      void inv(Complex * dst,const Complex  *src,int nfft)
+      {
+        get_plan(nfft,true,dst,src).inv(fftw_cast(dst), fftw_cast(src),nfft );
+      }
+
+      // half-complex to scalar
+      inline
+      void inv( Scalar * dst,const Complex * src,int nfft) 
+      {
+        get_plan(nfft,true,dst,src).inv(fftw_cast(dst), fftw_cast(src),nfft );
+      }
+
+      // 2-d complex-to-complex
+      inline
+      void inv2(Complex * dst, const Complex * src, int n0,int n1)
+      {
+        get_plan(n0,n1,true,dst,src).inv2(fftw_cast(dst), fftw_cast(src) ,n0,n1);
+      }
+
+
+  protected:
+      typedef fftw_plan<Scalar> PlanData;
+
+      typedef std::map<int64_t,PlanData> PlanMap;
+
+      PlanMap m_plans;
+
+      inline
+      PlanData & get_plan(int nfft,bool inverse,void * dst,const void * src)
+      {
+          bool inplace = (dst==src);
+          bool aligned = ( (reinterpret_cast<size_t>(src)&15) | (reinterpret_cast<size_t>(dst)&15) ) == 0;
+          int64_t key = ( (nfft<<3 ) | (inverse<<2) | (inplace<<1) | aligned ) << 1;
+          return m_plans[key];
+      }
+
+      inline
+      PlanData & get_plan(int n0,int n1,bool inverse,void * dst,const void * src)
+      {
+          bool inplace = (dst==src);
+          bool aligned = ( (reinterpret_cast<size_t>(src)&15) | (reinterpret_cast<size_t>(dst)&15) ) == 0;
+          int64_t key = ( ( (((int64_t)n0) << 30)|(n1<<3 ) | (inverse<<2) | (inplace<<1) | aligned ) << 1 ) + 1;
+          return m_plans[key];
+      }
+  };
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/include/eigen3/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/include/eigen3/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
new file mode 100644
index 0000000..be51b4e
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -0,0 +1,420 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+namespace Eigen { 
+
+namespace internal {
+
+  // This FFT implementation was derived from kissfft http:sourceforge.net/projects/kissfft
+  // Copyright 2003-2009 Mark Borgerding
+
+template <typename _Scalar>
+struct kiss_cpx_fft
+{
+  typedef _Scalar Scalar;
+  typedef std::complex<Scalar> Complex;
+  std::vector<Complex> m_twiddles;
+  std::vector<int> m_stageRadix;
+  std::vector<int> m_stageRemainder;
+  std::vector<Complex> m_scratchBuf;
+  bool m_inverse;
+
+  inline
+    void make_twiddles(int nfft,bool inverse)
+    {
+      using std::acos;
+      m_inverse = inverse;
+      m_twiddles.resize(nfft);
+      Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
+      for (int i=0;i<nfft;++i)
+        m_twiddles[i] = exp( Complex(0,i*phinc) );
+    }
+
+  void factorize(int nfft)
+  {
+    //start factoring out 4's, then 2's, then 3,5,7,9,...
+    int n= nfft;
+    int p=4;
+    do {
+      while (n % p) {
+        switch (p) {
+          case 4: p = 2; break;
+          case 2: p = 3; break;
+          default: p += 2; break;
+        }
+        if (p*p>n)
+          p=n;// impossible to have a factor > sqrt(n)
+      }
+      n /= p;
+      m_stageRadix.push_back(p);
+      m_stageRemainder.push_back(n);
+      if ( p > 5 )
+        m_scratchBuf.resize(p); // scratchbuf will be needed in bfly_generic
+    }while(n>1);
+  }
+
+  template <typename _Src>
+    inline
+    void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
+    {
+      int p = m_stageRadix[stage];
+      int m = m_stageRemainder[stage];
+      Complex * Fout_beg = xout;
+      Complex * Fout_end = xout + p*m;
+
+      if (m>1) {
+        do{
+          // recursive call:
+          // DFT of size m*p performed by doing
+          // p instances of smaller DFTs of size m, 
+          // each one takes a decimated version of the input
+          work(stage+1, xout , xin, fstride*p,in_stride);
+          xin += fstride*in_stride;
+        }while( (xout += m) != Fout_end );
+      }else{
+        do{
+          *xout = *xin;
+          xin += fstride*in_stride;
+        }while(++xout != Fout_end );
+      }
+      xout=Fout_beg;
+
+      // recombine the p smaller DFTs 
+      switch (p) {
+        case 2: bfly2(xout,fstride,m); break;
+        case 3: bfly3(xout,fstride,m); break;
+        case 4: bfly4(xout,fstride,m); break;
+        case 5: bfly5(xout,fstride,m); break;
+        default: bfly_generic(xout,fstride,m,p); break;
+      }
+    }
+
+  inline
+    void bfly2( Complex * Fout, const size_t fstride, int m)
+    {
+      for (int k=0;k<m;++k) {
+        Complex t = Fout[m+k] * m_twiddles[k*fstride];
+        Fout[m+k] = Fout[k] - t;
+        Fout[k] += t;
+      }
+    }
+
+  inline
+    void bfly4( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      Complex scratch[6];
+      int negative_if_inverse = m_inverse * -2 +1;
+      for (size_t k=0;k<m;++k) {
+        scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
+        scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
+        scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
+        scratch[5] = Fout[k] - scratch[1];
+
+        Fout[k] += scratch[1];
+        scratch[3] = scratch[0] + scratch[2];
+        scratch[4] = scratch[0] - scratch[2];
+        scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
+
+        Fout[k+2*m]  = Fout[k] - scratch[3];
+        Fout[k] += scratch[3];
+        Fout[k+m] = scratch[5] + scratch[4];
+        Fout[k+3*m] = scratch[5] - scratch[4];
+      }
+    }
+
+  inline
+    void bfly3( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      size_t k=m;
+      const size_t m2 = 2*m;
+      Complex *tw1,*tw2;
+      Complex scratch[5];
+      Complex epi3;
+      epi3 = m_twiddles[fstride*m];
+
+      tw1=tw2=&m_twiddles[0];
+
+      do{
+        scratch[1]=Fout[m] * *tw1;
+        scratch[2]=Fout[m2] * *tw2;
+
+        scratch[3]=scratch[1]+scratch[2];
+        scratch[0]=scratch[1]-scratch[2];
+        tw1 += fstride;
+        tw2 += fstride*2;
+        Fout[m] = Complex( Fout->real() - Scalar(.5)*scratch[3].real() , Fout->imag() - Scalar(.5)*scratch[3].imag() );
+        scratch[0] *= epi3.imag();
+        *Fout += scratch[3];
+        Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
+        Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
+        ++Fout;
+      }while(--k);
+    }
+
+  inline
+    void bfly5( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+      size_t u;
+      Complex scratch[13];
+      Complex * twiddles = &m_twiddles[0];
+      Complex *tw;
+      Complex ya,yb;
+      ya = twiddles[fstride*m];
+      yb = twiddles[fstride*2*m];
+
+      Fout0=Fout;
+      Fout1=Fout0+m;
+      Fout2=Fout0+2*m;
+      Fout3=Fout0+3*m;
+      Fout4=Fout0+4*m;
+
+      tw=twiddles;
+      for ( u=0; u<m; ++u ) {
+        scratch[0] = *Fout0;
+
+        scratch[1]  = *Fout1 * tw[u*fstride];
+        scratch[2]  = *Fout2 * tw[2*u*fstride];
+        scratch[3]  = *Fout3 * tw[3*u*fstride];
+        scratch[4]  = *Fout4 * tw[4*u*fstride];
+
+        scratch[7] = scratch[1] + scratch[4];
+        scratch[10] = scratch[1] - scratch[4];
+        scratch[8] = scratch[2] + scratch[3];
+        scratch[9] = scratch[2] - scratch[3];
+
+        *Fout0 +=  scratch[7];
+        *Fout0 +=  scratch[8];
+
+        scratch[5] = scratch[0] + Complex(
+            (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
+            (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
+            );
+
+        scratch[6] = Complex(
+            (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
+            -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
+            );
+
+        *Fout1 = scratch[5] - scratch[6];
+        *Fout4 = scratch[5] + scratch[6];
+
+        scratch[11] = scratch[0] +
+          Complex(
+              (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
+              (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
+              );
+
+        scratch[12] = Complex(
+            -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
+            (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
+            );
+
+        *Fout2=scratch[11]+scratch[12];
+        *Fout3=scratch[11]-scratch[12];
+
+        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+      }
+    }
+
+  /* perform the butterfly for one stage of a mixed radix FFT */
+  inline
+    void bfly_generic(
+        Complex * Fout,
+        const size_t fstride,
+        int m,
+        int p
+        )
+    {
+      int u,k,q1,q;
+      Complex * twiddles = &m_twiddles[0];
+      Complex t;
+      int Norig = static_cast<int>(m_twiddles.size());
+      Complex * scratchbuf = &m_scratchBuf[0];
+
+      for ( u=0; u<m; ++u ) {
+        k=u;
+        for ( q1=0 ; q1<p ; ++q1 ) {
+          scratchbuf[q1] = Fout[ k  ];
+          k += m;
+        }
+
+        k=u;
+        for ( q1=0 ; q1<p ; ++q1 ) {
+          int twidx=0;
+          Fout[ k ] = scratchbuf[0];
+          for (q=1;q<p;++q ) {
+            twidx += static_cast<int>(fstride) * k;
+            if (twidx>=Norig) twidx-=Norig;
+            t=scratchbuf[q] * twiddles[twidx];
+            Fout[ k ] += t;
+          }
+          k += m;
+        }
+      }
+    }
+};
+
+template <typename _Scalar>
+struct kissfft_impl
+{
+  typedef _Scalar Scalar;
+  typedef std::complex<Scalar> Complex;
+
+  void clear() 
+  {
+    m_plans.clear();
+    m_realTwiddles.clear();
+  }
+
+  inline
+    void fwd( Complex * dst,const Complex *src,int nfft)
+    {
+      get_plan(nfft,false).work(0, dst, src, 1,1);
+    }
+
+  inline
+    void fwd2( Complex * dst,const Complex *src,int n0,int n1)
+    {
+        EIGEN_UNUSED_VARIABLE(dst);
+        EIGEN_UNUSED_VARIABLE(src);
+        EIGEN_UNUSED_VARIABLE(n0);
+        EIGEN_UNUSED_VARIABLE(n1);
+    }
+
+  inline
+    void inv2( Complex * dst,const Complex *src,int n0,int n1)
+    {
+        EIGEN_UNUSED_VARIABLE(dst);
+        EIGEN_UNUSED_VARIABLE(src);
+        EIGEN_UNUSED_VARIABLE(n0);
+        EIGEN_UNUSED_VARIABLE(n1);
+    }
+
+  // real-to-complex forward FFT
+  // perform two FFTs of src even and src odd
+  // then twiddle to recombine them into the half-spectrum format
+  // then fill in the conjugate symmetric half
+  inline
+    void fwd( Complex * dst,const Scalar * src,int nfft) 
+    {
+      if ( nfft&3  ) {
+        // use generic mode for odd
+        m_tmpBuf1.resize(nfft);
+        get_plan(nfft,false).work(0, &m_tmpBuf1[0], src, 1,1);
+        std::copy(m_tmpBuf1.begin(),m_tmpBuf1.begin()+(nfft>>1)+1,dst );
+      }else{
+        int ncfft = nfft>>1;
+        int ncfft2 = nfft>>2;
+        Complex * rtw = real_twiddles(ncfft2);
+
+        // use optimized mode for even real
+        fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
+        Complex dc = dst[0].real() +  dst[0].imag();
+        Complex nyquist = dst[0].real() -  dst[0].imag();
+        int k;
+        for ( k=1;k <= ncfft2 ; ++k ) {
+          Complex fpk = dst[k];
+          Complex fpnk = conj(dst[ncfft-k]);
+          Complex f1k = fpk + fpnk;
+          Complex f2k = fpk - fpnk;
+          Complex tw= f2k * rtw[k-1];
+          dst[k] =  (f1k + tw) * Scalar(.5);
+          dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
+        }
+        dst[0] = dc;
+        dst[ncfft] = nyquist;
+      }
+    }
+
+  // inverse complex-to-complex
+  inline
+    void inv(Complex * dst,const Complex  *src,int nfft)
+    {
+      get_plan(nfft,true).work(0, dst, src, 1,1);
+    }
+
+  // half-complex to scalar
+  inline
+    void inv( Scalar * dst,const Complex * src,int nfft) 
+    {
+      if (nfft&3) {
+        m_tmpBuf1.resize(nfft);
+        m_tmpBuf2.resize(nfft);
+        std::copy(src,src+(nfft>>1)+1,m_tmpBuf1.begin() );
+        for (int k=1;k<(nfft>>1)+1;++k)
+          m_tmpBuf1[nfft-k] = conj(m_tmpBuf1[k]);
+        inv(&m_tmpBuf2[0],&m_tmpBuf1[0],nfft);
+        for (int k=0;k<nfft;++k)
+          dst[k] = m_tmpBuf2[k].real();
+      }else{
+        // optimized version for multiple of 4
+        int ncfft = nfft>>1;
+        int ncfft2 = nfft>>2;
+        Complex * rtw = real_twiddles(ncfft2);
+        m_tmpBuf1.resize(ncfft);
+        m_tmpBuf1[0] = Complex( src[0].real() + src[ncfft].real(), src[0].real() - src[ncfft].real() );
+        for (int k = 1; k <= ncfft / 2; ++k) {
+          Complex fk = src[k];
+          Complex fnkc = conj(src[ncfft-k]);
+          Complex fek = fk + fnkc;
+          Complex tmp = fk - fnkc;
+          Complex fok = tmp * conj(rtw[k-1]);
+          m_tmpBuf1[k] = fek + fok;
+          m_tmpBuf1[ncfft-k] = conj(fek - fok);
+        }
+        get_plan(ncfft,true).work(0, reinterpret_cast<Complex*>(dst), &m_tmpBuf1[0], 1,1);
+      }
+    }
+
+  protected:
+  typedef kiss_cpx_fft<Scalar> PlanData;
+  typedef std::map<int,PlanData> PlanMap;
+
+  PlanMap m_plans;
+  std::map<int, std::vector<Complex> > m_realTwiddles;
+  std::vector<Complex> m_tmpBuf1;
+  std::vector<Complex> m_tmpBuf2;
+
+  inline
+    int PlanKey(int nfft, bool isinverse) const { return (nfft<<1) | int(isinverse); }
+
+  inline
+    PlanData & get_plan(int nfft, bool inverse)
+    {
+      // TODO look for PlanKey(nfft, ! inverse) and conjugate the twiddles
+      PlanData & pd = m_plans[ PlanKey(nfft,inverse) ];
+      if ( pd.m_twiddles.size() == 0 ) {
+        pd.make_twiddles(nfft,inverse);
+        pd.factorize(nfft);
+      }
+      return pd;
+    }
+
+  inline
+    Complex * real_twiddles(int ncfft2)
+    {
+      using std::acos;
+      std::vector<Complex> & twidref = m_realTwiddles[ncfft2];// creates new if not there
+      if ( (int)twidref.size() != ncfft2 ) {
+        twidref.resize(ncfft2);
+        int ncfft= ncfft2<<1;
+        Scalar pi =  acos( Scalar(-1) );
+        for (int k=1;k<=ncfft2;++k) 
+          twidref[k-1] = exp( Complex(0,-pi * (Scalar(k) / ncfft + Scalar(.5)) ) );
+      }
+      return &twidref[0];
+    }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/include/eigen3/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
new file mode 100644
index 0000000..dc0093e
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
@@ -0,0 +1,189 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+
+/* NOTE The functions of this file have been adapted from the GMM++ library */
+
+//========================================================================
+//
+// Copyright (C) 2002-2007 Yves Renard
+//
+// This file is a part of GETFEM++
+//
+// Getfem++ is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; version 2.1 of the License.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Lesser General Public License for more details.
+// You should have received a copy of the GNU Lesser General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301,
+// USA.
+//
+//========================================================================
+
+#include "../../../../Eigen/src/Core/util/NonMPL2.h"
+
+#ifndef EIGEN_CONSTRAINEDCG_H
+#define EIGEN_CONSTRAINEDCG_H
+
+#include <Eigen/Core>
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \ingroup IterativeSolvers_Module
+  * Compute the pseudo inverse of the non-square matrix C such that
+  * \f$ CINV = (C * C^T)^{-1} * C \f$ based on a conjugate gradient method.
+  *
+  * This function is internally used by constrained_cg.
+  */
+template <typename CMatrix, typename CINVMatrix>
+void pseudo_inverse(const CMatrix &C, CINVMatrix &CINV)
+{
+  // optimisable : copie de la ligne, precalcul de C * trans(C).
+  typedef typename CMatrix::Scalar Scalar;
+  typedef typename CMatrix::Index Index;
+  // FIXME use sparse vectors ?
+  typedef Matrix<Scalar,Dynamic,1> TmpVec;
+
+  Index rows = C.rows(), cols = C.cols();
+
+  TmpVec d(rows), e(rows), l(cols), p(rows), q(rows), r(rows);
+  Scalar rho, rho_1, alpha;
+  d.setZero();
+
+  typedef Triplet<double> T;
+  std::vector<T> tripletList;
+    
+  for (Index i = 0; i < rows; ++i)
+  {
+    d[i] = 1.0;
+    rho = 1.0;
+    e.setZero();
+    r = d;
+    p = d;
+
+    while (rho >= 1e-38)
+    { /* conjugate gradient to compute e             */
+      /* which is the i-th row of inv(C * trans(C))  */
+      l = C.transpose() * p;
+      q = C * l;
+      alpha = rho / p.dot(q);
+      e +=  alpha * p;
+      r += -alpha * q;
+      rho_1 = rho;
+      rho = r.dot(r);
+      p = (rho/rho_1) * p + r;
+    }
+
+    l = C.transpose() * e; // l is the i-th row of CINV
+    // FIXME add a generic "prune/filter" expression for both dense and sparse object to sparse
+    for (Index j=0; j<l.size(); ++j)
+      if (l[j]<1e-15)
+	tripletList.push_back(T(i,j,l(j)));
+
+	
+    d[i] = 0.0;
+  }
+  CINV.setFromTriplets(tripletList.begin(), tripletList.end());
+}
+
+
+
+/** \ingroup IterativeSolvers_Module
+  * Constrained conjugate gradient
+  *
+  * Computes the minimum of \f$ 1/2((Ax).x) - bx \f$ under the contraint \f$ Cx \le f \f$
+  */
+template<typename TMatrix, typename CMatrix,
+         typename VectorX, typename VectorB, typename VectorF>
+void constrained_cg(const TMatrix& A, const CMatrix& C, VectorX& x,
+                       const VectorB& b, const VectorF& f, IterationController &iter)
+{
+  using std::sqrt;
+  typedef typename TMatrix::Scalar Scalar;
+  typedef typename TMatrix::Index Index;
+  typedef Matrix<Scalar,Dynamic,1>  TmpVec;
+
+  Scalar rho = 1.0, rho_1, lambda, gamma;
+  Index xSize = x.size();
+  TmpVec  p(xSize), q(xSize), q2(xSize),
+          r(xSize), old_z(xSize), z(xSize),
+          memox(xSize);
+  std::vector<bool> satured(C.rows());
+  p.setZero();
+  iter.setRhsNorm(sqrt(b.dot(b))); // gael vect_sp(PS, b, b)
+  if (iter.rhsNorm() == 0.0) iter.setRhsNorm(1.0);
+
+  SparseMatrix<Scalar,RowMajor> CINV(C.rows(), C.cols());
+  pseudo_inverse(C, CINV);
+
+  while(true)
+  {
+    // computation of residual
+    old_z = z;
+    memox = x;
+    r = b;
+    r += A * -x;
+    z = r;
+    bool transition = false;
+    for (Index i = 0; i < C.rows(); ++i)
+    {
+      Scalar al = C.row(i).dot(x) - f.coeff(i);
+      if (al >= -1.0E-15)
+      {
+        if (!satured[i])
+        {
+          satured[i] = true;
+          transition = true;
+        }
+        Scalar bb = CINV.row(i).dot(z);
+        if (bb > 0.0)
+          // FIXME: we should allow that: z += -bb * C.row(i);
+          for (typename CMatrix::InnerIterator it(C,i); it; ++it)
+            z.coeffRef(it.index()) -= bb*it.value();
+      }
+      else
+        satured[i] = false;
+    }
+
+    // descent direction
+    rho_1 = rho;
+    rho = r.dot(z);
+
+    if (iter.finished(rho)) break;
+
+    if (iter.noiseLevel() > 0 && transition) std::cerr << "CCG: transition\n";
+    if (transition || iter.first()) gamma = 0.0;
+    else gamma = (std::max)(0.0, (rho - old_z.dot(z)) / rho_1);
+    p = z + gamma*p;
+
+    ++iter;
+    // one dimensionnal optimization
+    q = A * p;
+    lambda = rho / q.dot(p);
+    for (Index i = 0; i < C.rows(); ++i)
+    {
+      if (!satured[i])
+      {
+        Scalar bb = C.row(i).dot(p) - f[i];
+        if (bb > 0.0)
+          lambda = (std::min)(lambda, (f.coeff(i)-C.row(i).dot(x)) / bb);
+      }
+    }
+    x += lambda * p;
+    memox -= x;
+  }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CONSTRAINEDCG_H
diff --git a/include/eigen3/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
new file mode 100644
index 0000000..9fcc8a8
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
@@ -0,0 +1,542 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DGMRES_H
+#define EIGEN_DGMRES_H
+
+#include <Eigen/Eigenvalues>
+
+namespace Eigen { 
+  
+template< typename _MatrixType,
+          typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+class DGMRES;
+
+namespace internal {
+
+template< typename _MatrixType, typename _Preconditioner>
+struct traits<DGMRES<_MatrixType,_Preconditioner> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Preconditioner Preconditioner;
+};
+
+/** \brief Computes a permutation vector to have a sorted sequence
+  * \param vec The vector to reorder.
+  * \param perm gives the sorted sequence on output. Must be initialized with 0..n-1
+  * \param ncut Put  the ncut smallest elements at the end of the vector
+  * WARNING This is an expensive sort, so should be used only 
+  * for small size vectors
+  * TODO Use modified QuickSplit or std::nth_element to get the smallest values 
+  */
+template <typename VectorType, typename IndexType>
+void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::Scalar& ncut)
+{
+  eigen_assert(vec.size() == perm.size());
+  typedef typename IndexType::Scalar Index; 
+  typedef typename VectorType::Scalar Scalar; 
+  bool flag; 
+  for (Index k  = 0; k < ncut; k++)
+  {
+    flag = false;
+    for (Index j = 0; j < vec.size()-1; j++)
+    {
+      if ( vec(perm(j)) < vec(perm(j+1)) )
+      {
+        std::swap(perm(j),perm(j+1)); 
+        flag = true;
+      }
+      if (!flag) break; // The vector is in sorted order
+    }
+  }
+}
+
+}
+/**
+ * \ingroup IterativeLInearSolvers_Module
+ * \brief A Restarted GMRES with deflation.
+ * This class implements a modification of the GMRES solver for
+ * sparse linear systems. The basis is built with modified 
+ * Gram-Schmidt. At each restart, a few approximated eigenvectors
+ * corresponding to the smallest eigenvalues are used to build a
+ * preconditioner for the next cycle. This preconditioner 
+ * for deflation can be combined with any other preconditioner, 
+ * the IncompleteLUT for instance. The preconditioner is applied 
+ * at right of the matrix and the combination is multiplicative.
+ * 
+ * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+ * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+ * Typical usage :
+ * \code
+ * SparseMatrix<double> A;
+ * VectorXd x, b; 
+ * //Fill A and b ...
+ * DGMRES<SparseMatrix<double> > solver;
+ * solver.set_restart(30); // Set restarting value
+ * solver.setEigenv(1); // Set the number of eigenvalues to deflate
+ * solver.compute(A);
+ * x = solver.solve(b);
+ * \endcode
+ * 
+ * References :
+ * [1] D. NUENTSA WAKAM and F. PACULL, Memory Efficient Hybrid
+ *  Algebraic Solvers for Linear Systems Arising from Compressible
+ *  Flows, Computers and Fluids, In Press,
+ *  http://dx.doi.org/10.1016/j.compfluid.2012.03.023   
+ * [2] K. Burrage and J. Erhel, On the performance of various 
+ * adaptive preconditioned GMRES strategies, 5(1998), 101-121.
+ * [3] J. Erhel, K. Burrage and B. Pohl, Restarted GMRES 
+ *  preconditioned by deflation,J. Computational and Applied
+ *  Mathematics, 69(1996), 303-318. 
+
+ * 
+ */
+template< typename _MatrixType, typename _Preconditioner>
+class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
+{
+    typedef IterativeSolverBase<DGMRES> Base;
+    using Base::mp_matrix;
+    using Base::m_error;
+    using Base::m_iterations;
+    using Base::m_info;
+    using Base::m_isInitialized;
+    using Base::m_tolerance; 
+  public:
+    typedef _MatrixType MatrixType;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef _Preconditioner Preconditioner;
+    typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix; 
+    typedef Matrix<RealScalar,Dynamic,Dynamic> DenseRealMatrix; 
+    typedef Matrix<Scalar,Dynamic,1> DenseVector;
+    typedef Matrix<RealScalar,Dynamic,1> DenseRealVector; 
+    typedef Matrix<std::complex<RealScalar>, Dynamic, 1> ComplexVector;
+ 
+    
+  /** Default constructor. */
+  DGMRES() : Base(),m_restart(30),m_neig(0),m_r(0),m_maxNeig(5),m_isDeflAllocated(false),m_isDeflInitialized(false) {}
+
+  /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+    * 
+    * This constructor is a shortcut for the default constructor followed
+    * by a call to compute().
+    * 
+    * \warning this class stores a reference to the matrix A as well as some
+    * precomputed values that depend on it. Therefore, if \a A is changed
+    * this class becomes invalid. Call compute() to update it with the new
+    * matrix A, or modify a copy of A.
+    */
+  DGMRES(const MatrixType& A) : Base(A),m_restart(30),m_neig(0),m_r(0),m_maxNeig(5),m_isDeflAllocated(false),m_isDeflInitialized(false)
+  {}
+
+  ~DGMRES() {}
+  
+  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
+    * \a x0 as an initial solution.
+    *
+    * \sa compute()
+    */
+  template<typename Rhs,typename Guess>
+  inline const internal::solve_retval_with_guess<DGMRES, Rhs, Guess>
+  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
+  {
+    eigen_assert(m_isInitialized && "DGMRES is not initialized.");
+    eigen_assert(Base::rows()==b.rows()
+              && "DGMRES::solve(): invalid number of rows of the right hand side matrix b");
+    return internal::solve_retval_with_guess
+            <DGMRES, Rhs, Guess>(*this, b.derived(), x0);
+  }
+  
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solveWithGuess(const Rhs& b, Dest& x) const
+  {    
+    bool failed = false;
+    for(int j=0; j<b.cols(); ++j)
+    {
+      m_iterations = Base::maxIterations();
+      m_error = Base::m_tolerance;
+      
+      typename Dest::ColXpr xj(x,j);
+      dgmres(*mp_matrix, b.col(j), xj, Base::m_preconditioner);
+    }
+    m_info = failed ? NumericalIssue
+           : m_error <= Base::m_tolerance ? Success
+           : NoConvergence;
+    m_isInitialized = true;
+  }
+
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solve(const Rhs& b, Dest& x) const
+  {
+    x = b;
+    _solveWithGuess(b,x);
+  }
+  /** 
+   * Get the restart value
+    */
+  int restart() { return m_restart; }
+  
+  /** 
+   * Set the restart value (default is 30)  
+   */
+  void set_restart(const int restart) { m_restart=restart; }
+  
+  /** 
+   * Set the number of eigenvalues to deflate at each restart 
+   */
+  void setEigenv(const int neig) 
+  {
+    m_neig = neig;
+    if (neig+1 > m_maxNeig) m_maxNeig = neig+1; // To allow for complex conjugates
+  }
+  
+  /** 
+   * Get the size of the deflation subspace size
+   */ 
+  int deflSize() {return m_r; }
+  
+  /**
+   * Set the maximum size of the deflation subspace
+   */
+  void setMaxEigenv(const int maxNeig) { m_maxNeig = maxNeig; }
+  
+  protected:
+    // DGMRES algorithm 
+    template<typename Rhs, typename Dest>
+    void dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x, const Preconditioner& precond) const;
+    // Perform one cycle of GMRES
+    template<typename Dest>
+    int dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, int& nbIts) const; 
+    // Compute data to use for deflation 
+    int dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, Index& neig) const;
+    // Apply deflation to a vector
+    template<typename RhsType, typename DestType>
+    int dgmresApplyDeflation(const RhsType& In, DestType& Out) const; 
+    ComplexVector schurValues(const ComplexSchur<DenseMatrix>& schurofH) const;
+    ComplexVector schurValues(const RealSchur<DenseMatrix>& schurofH) const;
+    // Init data for deflation
+    void dgmresInitDeflation(Index& rows) const; 
+    mutable DenseMatrix m_V; // Krylov basis vectors
+    mutable DenseMatrix m_H; // Hessenberg matrix 
+    mutable DenseMatrix m_Hes; // Initial hessenberg matrix wihout Givens rotations applied
+    mutable Index m_restart; // Maximum size of the Krylov subspace
+    mutable DenseMatrix m_U; // Vectors that form the basis of the invariant subspace 
+    mutable DenseMatrix m_MU; // matrix operator applied to m_U (for next cycles)
+    mutable DenseMatrix m_T; /* T=U^T*M^{-1}*A*U */
+    mutable PartialPivLU<DenseMatrix> m_luT; // LU factorization of m_T
+    mutable int m_neig; //Number of eigenvalues to extract at each restart
+    mutable int m_r; // Current number of deflated eigenvalues, size of m_U
+    mutable int m_maxNeig; // Maximum number of eigenvalues to deflate
+    mutable RealScalar m_lambdaN; //Modulus of the largest eigenvalue of A
+    mutable bool m_isDeflAllocated;
+    mutable bool m_isDeflInitialized;
+    
+    //Adaptive strategy 
+    mutable RealScalar m_smv; // Smaller multiple of the remaining number of steps allowed
+    mutable bool m_force; // Force the use of deflation at each restart
+    
+}; 
+/** 
+ * \brief Perform several cycles of restarted GMRES with modified Gram Schmidt, 
+ * 
+ * A right preconditioner is used combined with deflation.
+ * 
+ */
+template< typename _MatrixType, typename _Preconditioner>
+template<typename Rhs, typename Dest>
+void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x,
+              const Preconditioner& precond) const
+{
+  //Initialization
+  int n = mat.rows(); 
+  DenseVector r0(n); 
+  int nbIts = 0; 
+  m_H.resize(m_restart+1, m_restart);
+  m_Hes.resize(m_restart, m_restart);
+  m_V.resize(n,m_restart+1);
+  //Initial residual vector and intial norm
+  x = precond.solve(x);
+  r0 = rhs - mat * x; 
+  RealScalar beta = r0.norm(); 
+  RealScalar normRhs = rhs.norm();
+  m_error = beta/normRhs; 
+  if(m_error < m_tolerance)
+    m_info = Success; 
+  else
+    m_info = NoConvergence;
+  
+  // Iterative process
+  while (nbIts < m_iterations && m_info == NoConvergence)
+  {
+    dgmresCycle(mat, precond, x, r0, beta, normRhs, nbIts); 
+    
+    // Compute the new residual vector for the restart 
+    if (nbIts < m_iterations && m_info == NoConvergence)
+      r0 = rhs - mat * x; 
+  }
+} 
+
+/**
+ * \brief Perform one restart cycle of DGMRES
+ * \param mat The coefficient matrix
+ * \param precond The preconditioner
+ * \param x the new approximated solution
+ * \param r0 The initial residual vector
+ * \param beta The norm of the residual computed so far
+ * \param normRhs The norm of the right hand side vector
+ * \param nbIts The number of iterations
+ */
+template< typename _MatrixType, typename _Preconditioner>
+template<typename Dest>
+int DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, int& nbIts) const
+{
+  //Initialization 
+  DenseVector g(m_restart+1); // Right hand side of the least square problem
+  g.setZero();  
+  g(0) = Scalar(beta); 
+  m_V.col(0) = r0/beta; 
+  m_info = NoConvergence; 
+  std::vector<JacobiRotation<Scalar> >gr(m_restart); // Givens rotations
+  int it = 0; // Number of inner iterations 
+  int n = mat.rows();
+  DenseVector tv1(n), tv2(n);  //Temporary vectors
+  while (m_info == NoConvergence && it < m_restart && nbIts < m_iterations)
+  {    
+    // Apply preconditioner(s) at right
+    if (m_isDeflInitialized )
+    {
+      dgmresApplyDeflation(m_V.col(it), tv1); // Deflation
+      tv2 = precond.solve(tv1); 
+    }
+    else
+    {
+      tv2 = precond.solve(m_V.col(it)); // User's selected preconditioner
+    }
+    tv1 = mat * tv2; 
+   
+    // Orthogonalize it with the previous basis in the basis using modified Gram-Schmidt
+    Scalar coef; 
+    for (int i = 0; i <= it; ++i)
+    { 
+      coef = tv1.dot(m_V.col(i));
+      tv1 = tv1 - coef * m_V.col(i); 
+      m_H(i,it) = coef; 
+      m_Hes(i,it) = coef; 
+    }
+    // Normalize the vector 
+    coef = tv1.norm(); 
+    m_V.col(it+1) = tv1/coef;
+    m_H(it+1, it) = coef;
+//     m_Hes(it+1,it) = coef; 
+    
+    // FIXME Check for happy breakdown 
+    
+    // Update Hessenberg matrix with Givens rotations
+    for (int i = 1; i <= it; ++i) 
+    {
+      m_H.col(it).applyOnTheLeft(i-1,i,gr[i-1].adjoint());
+    }
+    // Compute the new plane rotation 
+    gr[it].makeGivens(m_H(it, it), m_H(it+1,it)); 
+    // Apply the new rotation
+    m_H.col(it).applyOnTheLeft(it,it+1,gr[it].adjoint());
+    g.applyOnTheLeft(it,it+1, gr[it].adjoint()); 
+    
+    beta = std::abs(g(it+1));
+    m_error = beta/normRhs; 
+    std::cerr << nbIts << " Relative Residual Norm " << m_error << std::endl;
+    it++; nbIts++; 
+    
+    if (m_error < m_tolerance)
+    {
+      // The method has converged
+      m_info = Success;
+      break;
+    }
+  }
+  
+  // Compute the new coefficients by solving the least square problem
+//   it++;
+  //FIXME  Check first if the matrix is singular ... zero diagonal
+  DenseVector nrs(m_restart); 
+  nrs = m_H.topLeftCorner(it,it).template triangularView<Upper>().solve(g.head(it)); 
+  
+  // Form the new solution
+  if (m_isDeflInitialized)
+  {
+    tv1 = m_V.leftCols(it) * nrs; 
+    dgmresApplyDeflation(tv1, tv2); 
+    x = x + precond.solve(tv2);
+  }
+  else
+    x = x + precond.solve(m_V.leftCols(it) * nrs); 
+  
+  // Go for a new cycle and compute data for deflation
+  if(nbIts < m_iterations && m_info == NoConvergence && m_neig > 0 && (m_r+m_neig) < m_maxNeig)
+    dgmresComputeDeflationData(mat, precond, it, m_neig); 
+  return 0; 
+  
+}
+
+
+template< typename _MatrixType, typename _Preconditioner>
+void DGMRES<_MatrixType, _Preconditioner>::dgmresInitDeflation(Index& rows) const
+{
+  m_U.resize(rows, m_maxNeig);
+  m_MU.resize(rows, m_maxNeig); 
+  m_T.resize(m_maxNeig, m_maxNeig);
+  m_lambdaN = 0.0; 
+  m_isDeflAllocated = true; 
+}
+
+template< typename _MatrixType, typename _Preconditioner>
+inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_MatrixType, _Preconditioner>::schurValues(const ComplexSchur<DenseMatrix>& schurofH) const
+{
+  return schurofH.matrixT().diagonal();
+}
+
+template< typename _MatrixType, typename _Preconditioner>
+inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_MatrixType, _Preconditioner>::schurValues(const RealSchur<DenseMatrix>& schurofH) const
+{
+  typedef typename MatrixType::Index Index;
+  const DenseMatrix& T = schurofH.matrixT();
+  Index it = T.rows();
+  ComplexVector eig(it);
+  Index j = 0;
+  while (j < it-1)
+  {
+    if (T(j+1,j) ==Scalar(0))
+    {
+      eig(j) = std::complex<RealScalar>(T(j,j),RealScalar(0)); 
+      j++; 
+    }
+    else
+    {
+      eig(j) = std::complex<RealScalar>(T(j,j),T(j+1,j)); 
+      eig(j+1) = std::complex<RealScalar>(T(j,j+1),T(j+1,j+1));
+      j++;
+    }
+  }
+  if (j < it-1) eig(j) = std::complex<RealScalar>(T(j,j),RealScalar(0));
+  return eig;
+}
+
+template< typename _MatrixType, typename _Preconditioner>
+int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, Index& neig) const
+{
+  // First, find the Schur form of the Hessenberg matrix H
+  typename internal::conditional<NumTraits<Scalar>::IsComplex, ComplexSchur<DenseMatrix>, RealSchur<DenseMatrix> >::type schurofH; 
+  bool computeU = true;
+  DenseMatrix matrixQ(it,it); 
+  matrixQ.setIdentity();
+  schurofH.computeFromHessenberg(m_Hes.topLeftCorner(it,it), matrixQ, computeU); 
+  
+  ComplexVector eig(it);
+  Matrix<Index,Dynamic,1>perm(it); 
+  eig = this->schurValues(schurofH);
+  
+  // Reorder the absolute values of Schur values
+  DenseRealVector modulEig(it); 
+  for (int j=0; j<it; ++j) modulEig(j) = std::abs(eig(j)); 
+  perm.setLinSpaced(it,0,it-1);
+  internal::sortWithPermutation(modulEig, perm, neig);
+  
+  if (!m_lambdaN)
+  {
+    m_lambdaN = (std::max)(modulEig.maxCoeff(), m_lambdaN);
+  }
+  //Count the real number of extracted eigenvalues (with complex conjugates)
+  int nbrEig = 0; 
+  while (nbrEig < neig)
+  {
+    if(eig(perm(it-nbrEig-1)).imag() == RealScalar(0)) nbrEig++; 
+    else nbrEig += 2; 
+  }
+  // Extract the  Schur vectors corresponding to the smallest Ritz values
+  DenseMatrix Sr(it, nbrEig); 
+  Sr.setZero();
+  for (int j = 0; j < nbrEig; j++)
+  {
+    Sr.col(j) = schurofH.matrixU().col(perm(it-j-1));
+  }
+  
+  // Form the Schur vectors of the initial matrix using the Krylov basis
+  DenseMatrix X; 
+  X = m_V.leftCols(it) * Sr;
+  if (m_r)
+  {
+   // Orthogonalize X against m_U using modified Gram-Schmidt
+   for (int j = 0; j < nbrEig; j++)
+     for (int k =0; k < m_r; k++)
+      X.col(j) = X.col(j) - (m_U.col(k).dot(X.col(j)))*m_U.col(k); 
+  }
+  
+  // Compute m_MX = A * M^-1 * X
+  Index m = m_V.rows();
+  if (!m_isDeflAllocated) 
+    dgmresInitDeflation(m); 
+  DenseMatrix MX(m, nbrEig);
+  DenseVector tv1(m);
+  for (int j = 0; j < nbrEig; j++)
+  {
+    tv1 = mat * X.col(j);
+    MX.col(j) = precond.solve(tv1);
+  }
+  
+  //Update m_T = [U'MU U'MX; X'MU X'MX]
+  m_T.block(m_r, m_r, nbrEig, nbrEig) = X.transpose() * MX; 
+  if(m_r)
+  {
+    m_T.block(0, m_r, m_r, nbrEig) = m_U.leftCols(m_r).transpose() * MX; 
+    m_T.block(m_r, 0, nbrEig, m_r) = X.transpose() * m_MU.leftCols(m_r);
+  }
+  
+  // Save X into m_U and m_MX in m_MU
+  for (int j = 0; j < nbrEig; j++) m_U.col(m_r+j) = X.col(j);
+  for (int j = 0; j < nbrEig; j++) m_MU.col(m_r+j) = MX.col(j);
+  // Increase the size of the invariant subspace
+  m_r += nbrEig; 
+  
+  // Factorize m_T into m_luT
+  m_luT.compute(m_T.topLeftCorner(m_r, m_r));
+  
+  //FIXME CHeck if the factorization was correctly done (nonsingular matrix)
+  m_isDeflInitialized = true;
+  return 0; 
+}
+template<typename _MatrixType, typename _Preconditioner>
+template<typename RhsType, typename DestType>
+int DGMRES<_MatrixType, _Preconditioner>::dgmresApplyDeflation(const RhsType &x, DestType &y) const
+{
+  DenseVector x1 = m_U.leftCols(m_r).transpose() * x; 
+  y = x + m_U.leftCols(m_r) * ( m_lambdaN * m_luT.solve(x1) - x1);
+  return 0; 
+}
+
+namespace internal {
+
+  template<typename _MatrixType, typename _Preconditioner, typename Rhs>
+struct solve_retval<DGMRES<_MatrixType, _Preconditioner>, Rhs>
+  : solve_retval_base<DGMRES<_MatrixType, _Preconditioner>, Rhs>
+{
+  typedef DGMRES<_MatrixType, _Preconditioner> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+} // end namespace internal
+
+} // end namespace Eigen
+#endif 
diff --git a/include/eigen3/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/GMRES.h
new file mode 100644
index 0000000..c8c8406
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/GMRES.h
@@ -0,0 +1,383 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2012, 2014 Kolja Brix <brix at igpm.rwth-aaachen.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GMRES_H
+#define EIGEN_GMRES_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**
+ * Generalized Minimal Residual Algorithm based on the
+ * Arnoldi algorithm implemented with Householder reflections.
+ *
+ * Parameters:
+ *  \param mat       matrix of linear system of equations
+ *  \param Rhs       right hand side vector of linear system of equations
+ *  \param x         on input: initial guess, on output: solution
+ *  \param precond   preconditioner used
+ *  \param iters     on input: maximum number of iterations to perform
+ *                   on output: number of iterations performed
+ *  \param restart   number of iterations for a restart
+ *  \param tol_error on input: residual tolerance
+ *                   on output: residuum achieved
+ *
+ * \sa IterativeMethods::bicgstab() 
+ *  
+ *
+ * For references, please see:
+ *
+ * Saad, Y. and Schultz, M. H.
+ * GMRES: A Generalized Minimal Residual Algorithm for Solving Nonsymmetric Linear Systems.
+ * SIAM J.Sci.Stat.Comp. 7, 1986, pp. 856 - 869.
+ *
+ * Saad, Y.
+ * Iterative Methods for Sparse Linear Systems.
+ * Society for Industrial and Applied Mathematics, Philadelphia, 2003.
+ *
+ * Walker, H. F.
+ * Implementations of the GMRES method.
+ * Comput.Phys.Comm. 53, 1989, pp. 311 - 320.
+ *
+ * Walker, H. F.
+ * Implementation of the GMRES Method using Householder Transformations.
+ * SIAM J.Sci.Stat.Comp. 9, 1988, pp. 152 - 163.
+ *
+ */
+template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Preconditioner & precond,
+		int &iters, const int &restart, typename Dest::RealScalar & tol_error) {
+
+	using std::sqrt;
+	using std::abs;
+
+	typedef typename Dest::RealScalar RealScalar;
+	typedef typename Dest::Scalar Scalar;
+	typedef Matrix < Scalar, Dynamic, 1 > VectorType;
+	typedef Matrix < Scalar, Dynamic, Dynamic > FMatrixType;
+
+	RealScalar tol = tol_error;
+	const int maxIters = iters;
+	iters = 0;
+
+	const int m = mat.rows();
+
+	VectorType p0 = rhs - mat*x;
+	VectorType r0 = precond.solve(p0);
+ 
+	// is initial guess already good enough?
+	if(abs(r0.norm()) < tol) {
+		return true; 
+	}
+
+	VectorType w = VectorType::Zero(restart + 1);
+
+	FMatrixType H = FMatrixType::Zero(m, restart + 1); // Hessenberg matrix
+	VectorType tau = VectorType::Zero(restart + 1);
+	std::vector < JacobiRotation < Scalar > > G(restart);
+
+	// generate first Householder vector
+	VectorType e(m-1);
+	RealScalar beta;
+	r0.makeHouseholder(e, tau.coeffRef(0), beta);
+	w(0)=(Scalar) beta;
+	H.bottomLeftCorner(m - 1, 1) = e;
+
+	for (int k = 1; k <= restart; ++k) {
+
+		++iters;
+
+		VectorType v = VectorType::Unit(m, k - 1), workspace(m);
+
+		// apply Householder reflections H_{1} ... H_{k-1} to v
+		for (int i = k - 1; i >= 0; --i) {
+			v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+		}
+
+		// apply matrix M to v:  v = mat * v;
+		VectorType t=mat*v;
+		v=precond.solve(t);
+
+		// apply Householder reflections H_{k-1} ... H_{1} to v
+		for (int i = 0; i < k; ++i) {
+			v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+		}
+
+		if (v.tail(m - k).norm() != 0.0) {
+
+			if (k <= restart) {
+
+				// generate new Householder vector
+                                  VectorType e(m - k - 1);
+				RealScalar beta;
+				v.tail(m - k).makeHouseholder(e, tau.coeffRef(k), beta);
+				H.col(k).tail(m - k - 1) = e;
+
+				// apply Householder reflection H_{k} to v
+				v.tail(m - k).applyHouseholderOnTheLeft(H.col(k).tail(m - k - 1), tau.coeffRef(k), workspace.data());
+
+			}
+                }
+
+                if (k > 1) {
+                        for (int i = 0; i < k - 1; ++i) {
+                                // apply old Givens rotations to v
+                                v.applyOnTheLeft(i, i + 1, G[i].adjoint());
+                        }
+                }
+
+                if (k<m && v(k) != (Scalar) 0) {
+                        // determine next Givens rotation
+                        G[k - 1].makeGivens(v(k - 1), v(k));
+
+                        // apply Givens rotation to v and w
+                        v.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
+                        w.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
+
+                }
+
+                // insert coefficients into upper matrix triangle
+                H.col(k - 1).head(k) = v.head(k);
+
+                bool stop=(k==m || abs(w(k)) < tol || iters == maxIters);
+
+                if (stop || k == restart) {
+
+                        // solve upper triangular system
+                        VectorType y = w.head(k);
+                        H.topLeftCorner(k, k).template triangularView < Eigen::Upper > ().solveInPlace(y);
+
+                        // use Horner-like scheme to calculate solution vector
+                        VectorType x_new = y(k - 1) * VectorType::Unit(m, k - 1);
+
+                        // apply Householder reflection H_{k} to x_new
+                        x_new.tail(m - k + 1).applyHouseholderOnTheLeft(H.col(k - 1).tail(m - k), tau.coeffRef(k - 1), workspace.data());
+
+                        for (int i = k - 2; i >= 0; --i) {
+                                x_new += y(i) * VectorType::Unit(m, i);
+                                // apply Householder reflection H_{i} to x_new
+                                x_new.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+                        }
+
+                        x += x_new;
+
+                        if (stop) {
+                                return true;
+                        } else {
+                                k=0;
+
+                                // reset data for a restart  r0 = rhs - mat * x;
+                                VectorType p0=mat*x;
+                                VectorType p1=precond.solve(p0);
+                                r0 = rhs - p1;
+//                                 r0_sqnorm = r0.squaredNorm();
+                                w = VectorType::Zero(restart + 1);
+                                H = FMatrixType::Zero(m, restart + 1);
+                                tau = VectorType::Zero(restart + 1);
+
+                                // generate first Householder vector
+                                RealScalar beta;
+                                r0.makeHouseholder(e, tau.coeffRef(0), beta);
+                                w(0)=(Scalar) beta;
+                                H.bottomLeftCorner(m - 1, 1) = e;
+
+                        }
+
+                }
+
+
+
+	}
+	
+	return false;
+
+}
+
+}
+
+template< typename _MatrixType,
+          typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+class GMRES;
+
+namespace internal {
+
+template< typename _MatrixType, typename _Preconditioner>
+struct traits<GMRES<_MatrixType,_Preconditioner> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Preconditioner Preconditioner;
+};
+
+}
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief A GMRES solver for sparse square problems
+  *
+  * This class allows to solve for A.x = b sparse linear problems using a generalized minimal
+  * residual method. The vectors x and b can be either dense or sparse.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+  * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+  *
+  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+  * and NumTraits<Scalar>::epsilon() for the tolerance.
+  * 
+  * This class can be used as the direct solver classes. Here is a typical usage example:
+  * \code
+  * int n = 10000;
+  * VectorXd x(n), b(n);
+  * SparseMatrix<double> A(n,n);
+  * // fill A and b
+  * GMRES<SparseMatrix<double> > solver(A);
+  * x = solver.solve(b);
+  * std::cout << "#iterations:     " << solver.iterations() << std::endl;
+  * std::cout << "estimated error: " << solver.error()      << std::endl;
+  * // update b, and solve again
+  * x = solver.solve(b);
+  * \endcode
+  * 
+  * By default the iterations start with x=0 as an initial guess of the solution.
+  * One can control the start using the solveWithGuess() method. Here is a step by
+  * step execution example starting with a random guess and printing the evolution
+  * of the estimated error:
+  * * \code
+  * x = VectorXd::Random(n);
+  * solver.setMaxIterations(1);
+  * int i = 0;
+  * do {
+  *   x = solver.solveWithGuess(b,x);
+  *   std::cout << i << " : " << solver.error() << std::endl;
+  *   ++i;
+  * } while (solver.info()!=Success && i<100);
+  * \endcode
+  * Note that such a step by step excution is slightly slower.
+  * 
+  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+  */
+template< typename _MatrixType, typename _Preconditioner>
+class GMRES : public IterativeSolverBase<GMRES<_MatrixType,_Preconditioner> >
+{
+  typedef IterativeSolverBase<GMRES> Base;
+  using Base::mp_matrix;
+  using Base::m_error;
+  using Base::m_iterations;
+  using Base::m_info;
+  using Base::m_isInitialized;
+ 
+private:
+  int m_restart;
+  
+public:
+  typedef _MatrixType MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef _Preconditioner Preconditioner;
+
+public:
+
+  /** Default constructor. */
+  GMRES() : Base(), m_restart(30) {}
+
+  /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+    * 
+    * This constructor is a shortcut for the default constructor followed
+    * by a call to compute().
+    * 
+    * \warning this class stores a reference to the matrix A as well as some
+    * precomputed values that depend on it. Therefore, if \a A is changed
+    * this class becomes invalid. Call compute() to update it with the new
+    * matrix A, or modify a copy of A.
+    */
+  GMRES(const MatrixType& A) : Base(A), m_restart(30) {}
+
+  ~GMRES() {}
+  
+  /** Get the number of iterations after that a restart is performed.
+    */
+  int get_restart() { return m_restart; }
+  
+  /** Set the number of iterations after that a restart is performed.
+    *  \param restart   number of iterations for a restarti, default is 30.
+    */
+  void set_restart(const int restart) { m_restart=restart; }
+  
+  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
+    * \a x0 as an initial solution.
+    *
+    * \sa compute()
+    */
+  template<typename Rhs,typename Guess>
+  inline const internal::solve_retval_with_guess<GMRES, Rhs, Guess>
+  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
+  {
+    eigen_assert(m_isInitialized && "GMRES is not initialized.");
+    eigen_assert(Base::rows()==b.rows()
+              && "GMRES::solve(): invalid number of rows of the right hand side matrix b");
+    return internal::solve_retval_with_guess
+            <GMRES, Rhs, Guess>(*this, b.derived(), x0);
+  }
+  
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solveWithGuess(const Rhs& b, Dest& x) const
+  {    
+    bool failed = false;
+    for(int j=0; j<b.cols(); ++j)
+    {
+      m_iterations = Base::maxIterations();
+      m_error = Base::m_tolerance;
+      
+      typename Dest::ColXpr xj(x,j);
+      if(!internal::gmres(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error))
+        failed = true;
+    }
+    m_info = failed ? NumericalIssue
+           : m_error <= Base::m_tolerance ? Success
+           : NoConvergence;
+    m_isInitialized = true;
+  }
+
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solve(const Rhs& b, Dest& x) const
+  {
+    x = b;
+    if(x.squaredNorm() == 0) return; // Check Zero right hand side
+    _solveWithGuess(b,x);
+  }
+
+protected:
+
+};
+
+
+namespace internal {
+
+  template<typename _MatrixType, typename _Preconditioner, typename Rhs>
+struct solve_retval<GMRES<_MatrixType, _Preconditioner>, Rhs>
+  : solve_retval_base<GMRES<_MatrixType, _Preconditioner>, Rhs>
+{
+  typedef GMRES<_MatrixType, _Preconditioner> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GMRES_H
diff --git a/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h
new file mode 100644
index 0000000..661c1f2
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h
@@ -0,0 +1,278 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INCOMPLETE_CHOlESKY_H
+#define EIGEN_INCOMPLETE_CHOlESKY_H
+#include "Eigen/src/IterativeLinearSolvers/IncompleteLUT.h" 
+#include <Eigen/OrderingMethods>
+#include <list>
+
+namespace Eigen {  
+/** 
+ * \brief Modified Incomplete Cholesky with dual threshold
+ * 
+ * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
+ *              Limited memory, SIAM J. Sci. Comput.  21(1), pp. 24-45, 1999
+ * 
+ * \tparam _MatrixType The type of the sparse matrix. It should be a symmetric 
+ *                     matrix. It is advised to give  a row-oriented sparse matrix 
+ * \tparam _UpLo The triangular part of the matrix to reference. 
+ * \tparam _OrderingType 
+ */
+
+template <typename Scalar, int _UpLo = Lower, typename _OrderingType = NaturalOrdering<int> >
+class IncompleteCholesky : internal::noncopyable
+{
+  public:
+    typedef SparseMatrix<Scalar,ColMajor> MatrixType;
+    typedef _OrderingType OrderingType;
+    typedef typename MatrixType::RealScalar RealScalar; 
+    typedef typename MatrixType::Index Index; 
+    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+    typedef Matrix<Scalar,Dynamic,1> ScalarType; 
+    typedef Matrix<Index,Dynamic, 1> IndexType;
+    typedef std::vector<std::list<Index> > VectorList; 
+    enum { UpLo = _UpLo };
+  public:
+    IncompleteCholesky() : m_shift(1),m_factorizationIsOk(false) {}
+    IncompleteCholesky(const MatrixType& matrix) : m_shift(1),m_factorizationIsOk(false)
+    {
+      compute(matrix);
+    }
+    
+    Index rows() const { return m_L.rows(); }
+    
+    Index cols() const { return m_L.cols(); }
+    
+
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was succesful,
+      *          \c NumericalIssue if the matrix appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "IncompleteLLT is not initialized.");
+      return m_info;
+    }
+    
+    /** 
+     * \brief Set the initial shift parameter
+     */
+    void setShift( Scalar shift) { m_shift = shift; }
+    
+    /**
+    * \brief Computes the fill reducing permutation vector. 
+    */
+    template<typename MatrixType>
+    void analyzePattern(const MatrixType& mat)
+    {
+      OrderingType ord; 
+      ord(mat.template selfadjointView<UpLo>(), m_perm); 
+      m_analysisIsOk = true; 
+    }
+    
+    template<typename MatrixType>
+    void factorize(const MatrixType& amat);
+    
+    template<typename MatrixType>
+    void compute (const MatrixType& matrix)
+    {
+      analyzePattern(matrix); 
+      factorize(matrix);
+    }
+    
+    template<typename Rhs, typename Dest>
+    void _solve(const Rhs& b, Dest& x) const
+    {
+      eigen_assert(m_factorizationIsOk && "factorize() should be called first");
+      if (m_perm.rows() == b.rows())
+        x = m_perm.inverse() * b; 
+      else 
+        x = b; 
+      x = m_scal.asDiagonal() * x;
+      x = m_L.template triangularView<UnitLower>().solve(x); 
+      x = m_L.adjoint().template triangularView<Upper>().solve(x); 
+      if (m_perm.rows() == b.rows())
+        x = m_perm * x;
+      x = m_scal.asDiagonal() * x;
+    }
+    template<typename Rhs> inline const internal::solve_retval<IncompleteCholesky, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_factorizationIsOk && "IncompleteLLT did not succeed");
+      eigen_assert(m_isInitialized && "IncompleteLLT is not initialized.");
+      eigen_assert(cols()==b.rows()
+                && "IncompleteLLT::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::solve_retval<IncompleteCholesky, Rhs>(*this, b.derived());
+    }
+  protected:
+    SparseMatrix<Scalar,ColMajor> m_L;  // The lower part stored in CSC
+    ScalarType m_scal; // The vector for scaling the matrix 
+    Scalar m_shift; //The initial shift parameter
+    bool m_analysisIsOk; 
+    bool m_factorizationIsOk; 
+    bool m_isInitialized;
+    ComputationInfo m_info;
+    PermutationType m_perm; 
+    
+  private:
+    template <typename IdxType, typename SclType>
+    inline void updateList(const IdxType& colPtr, IdxType& rowIdx, SclType& vals, const Index& col, const Index& jk, IndexType& firstElt, VectorList& listCol); 
+}; 
+
+template<typename Scalar, int _UpLo, typename OrderingType>
+template<typename _MatrixType>
+void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType& mat)
+{
+  using std::sqrt;
+  using std::min;
+  eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); 
+    
+  // Dropping strategies : Keep only the p largest elements per column, where p is the number of elements in the column of the original matrix. Other strategies will be added
+  
+  // Apply the fill-reducing permutation computed in analyzePattern()
+  if (m_perm.rows() == mat.rows() ) // To detect the null permutation
+    m_L.template selfadjointView<Lower>() = mat.template selfadjointView<_UpLo>().twistedBy(m_perm);
+  else
+    m_L.template selfadjointView<Lower>() = mat.template selfadjointView<_UpLo>();
+  
+  Index n = m_L.cols(); 
+  Index nnz = m_L.nonZeros();
+  Map<ScalarType> vals(m_L.valuePtr(), nnz); //values
+  Map<IndexType> rowIdx(m_L.innerIndexPtr(), nnz);  //Row indices
+  Map<IndexType> colPtr( m_L.outerIndexPtr(), n+1); // Pointer to the beginning of each row
+  IndexType firstElt(n-1); // for each j, points to the next entry in vals that will be used in the factorization
+  VectorList listCol(n); // listCol(j) is a linked list of columns to update column j
+  ScalarType curCol(n); // Store a  nonzero values in each column
+  IndexType irow(n); // Row indices of nonzero elements in each column
+  
+  
+  // Computes the scaling factors 
+  m_scal.resize(n);
+  for (int j = 0; j < n; j++)
+  {
+    m_scal(j) = m_L.col(j).norm();
+    m_scal(j) = sqrt(m_scal(j));
+  }
+  // Scale and compute the shift for the matrix 
+  Scalar mindiag = vals[0];
+  for (int j = 0; j < n; j++){
+    for (int k = colPtr[j]; k < colPtr[j+1]; k++)
+     vals[k] /= (m_scal(j) * m_scal(rowIdx[k]));
+    mindiag = (min)(vals[colPtr[j]], mindiag);
+  }
+  
+  if(mindiag < Scalar(0.)) m_shift = m_shift - mindiag;
+  // Apply the shift to the diagonal elements of the matrix
+  for (int j = 0; j < n; j++)
+    vals[colPtr[j]] += m_shift;
+  // jki version of the Cholesky factorization 
+  for (int j=0; j < n; ++j)
+  {  
+    //Left-looking factorize the column j 
+    // First, load the jth column into curCol 
+    Scalar diag = vals[colPtr[j]];  // It is assumed that only the lower part is stored
+    curCol.setZero();
+    irow.setLinSpaced(n,0,n-1); 
+    for (int i = colPtr[j] + 1; i < colPtr[j+1]; i++)
+    {
+      curCol(rowIdx[i]) = vals[i]; 
+      irow(rowIdx[i]) = rowIdx[i]; 
+    }
+    std::list<int>::iterator k; 
+    // Browse all previous columns that will update column j
+    for(k = listCol[j].begin(); k != listCol[j].end(); k++) 
+    {
+      int jk = firstElt(*k); // First element to use in the column 
+      jk += 1; 
+      for (int i = jk; i < colPtr[*k+1]; i++)
+      {
+        curCol(rowIdx[i]) -= vals[i] * vals[jk] ;
+      }
+      updateList(colPtr,rowIdx,vals, *k, jk, firstElt, listCol);
+    }
+    
+    // Scale the current column
+    if(RealScalar(diag) <= 0) 
+    {
+      std::cerr << "\nNegative diagonal during Incomplete factorization... "<< j << "\n";
+      m_info = NumericalIssue; 
+      return; 
+    }
+    RealScalar rdiag = sqrt(RealScalar(diag));
+    vals[colPtr[j]] = rdiag;
+    for (int i = j+1; i < n; i++)
+    {
+      //Scale 
+      curCol(i) /= rdiag;
+      //Update the remaining diagonals with curCol
+      vals[colPtr[i]] -= curCol(i) * curCol(i);
+    }
+    // Select the largest p elements
+    //  p is the original number of elements in the column (without the diagonal)
+    int p = colPtr[j+1] - colPtr[j] - 1 ; 
+    internal::QuickSplit(curCol, irow, p); 
+    // Insert the largest p elements in the matrix
+    int cpt = 0; 
+    for (int i = colPtr[j]+1; i < colPtr[j+1]; i++)
+    {
+      vals[i] = curCol(cpt); 
+      rowIdx[i] = irow(cpt); 
+      cpt ++; 
+    }
+    // Get the first smallest row index and put it after the diagonal element
+    Index jk = colPtr(j)+1;
+    updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol); 
+  }
+  m_factorizationIsOk = true; 
+  m_isInitialized = true;
+  m_info = Success; 
+}
+
+template<typename Scalar, int _UpLo, typename OrderingType>
+template <typename IdxType, typename SclType>
+inline void IncompleteCholesky<Scalar,_UpLo, OrderingType>::updateList(const IdxType& colPtr, IdxType& rowIdx, SclType& vals, const Index& col, const Index& jk, IndexType& firstElt, VectorList& listCol)
+{
+  if (jk < colPtr(col+1) )
+  {
+    Index p = colPtr(col+1) - jk;
+    Index minpos; 
+    rowIdx.segment(jk,p).minCoeff(&minpos);
+    minpos += jk;
+    if (rowIdx(minpos) != rowIdx(jk))
+    {
+      //Swap
+      std::swap(rowIdx(jk),rowIdx(minpos));
+      std::swap(vals(jk),vals(minpos));
+    }
+    firstElt(col) = jk;
+    listCol[rowIdx(jk)].push_back(col);
+  }
+}
+namespace internal {
+
+template<typename _Scalar, int _UpLo, typename OrderingType, typename Rhs>
+struct solve_retval<IncompleteCholesky<_Scalar,  _UpLo, OrderingType>, Rhs>
+  : solve_retval_base<IncompleteCholesky<_Scalar, _UpLo, OrderingType>, Rhs>
+{
+  typedef IncompleteCholesky<_Scalar, _UpLo, OrderingType> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen 
+
+#endif
diff --git a/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h
new file mode 100644
index 0000000..67e7801
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h
@@ -0,0 +1,113 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INCOMPLETE_LU_H
+#define EIGEN_INCOMPLETE_LU_H
+
+namespace Eigen { 
+
+template <typename _Scalar>
+class IncompleteLU
+{
+    typedef _Scalar Scalar;
+    typedef Matrix<Scalar,Dynamic,1> Vector;
+    typedef typename Vector::Index Index;
+    typedef SparseMatrix<Scalar,RowMajor> FactorType;
+
+  public:
+    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+
+    IncompleteLU() : m_isInitialized(false) {}
+
+    template<typename MatrixType>
+    IncompleteLU(const MatrixType& mat) : m_isInitialized(false)
+    {
+      compute(mat);
+    }
+
+    Index rows() const { return m_lu.rows(); }
+    Index cols() const { return m_lu.cols(); }
+
+    template<typename MatrixType>
+    IncompleteLU& compute(const MatrixType& mat)
+    {
+      m_lu = mat;
+      int size = mat.cols();
+      Vector diag(size);
+      for(int i=0; i<size; ++i)
+      {
+        typename FactorType::InnerIterator k_it(m_lu,i);
+        for(; k_it && k_it.index()<i; ++k_it)
+        {
+          int k = k_it.index();
+          k_it.valueRef() /= diag(k);
+
+          typename FactorType::InnerIterator j_it(k_it);
+          typename FactorType::InnerIterator kj_it(m_lu, k);
+          while(kj_it && kj_it.index()<=k) ++kj_it;
+          for(++j_it; j_it; )
+          {
+            if(kj_it.index()==j_it.index())
+            {
+              j_it.valueRef() -= k_it.value() * kj_it.value();
+              ++j_it;
+              ++kj_it;
+            }
+            else if(kj_it.index()<j_it.index()) ++kj_it;
+            else                                ++j_it;
+          }
+        }
+        if(k_it && k_it.index()==i) diag(i) = k_it.value();
+        else                        diag(i) = 1;
+      }
+      m_isInitialized = true;
+      return *this;
+    }
+
+    template<typename Rhs, typename Dest>
+    void _solve(const Rhs& b, Dest& x) const
+    {
+      x = m_lu.template triangularView<UnitLower>().solve(b);
+      x = m_lu.template triangularView<Upper>().solve(x);
+    }
+
+    template<typename Rhs> inline const internal::solve_retval<IncompleteLU, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "IncompleteLU is not initialized.");
+      eigen_assert(cols()==b.rows()
+                && "IncompleteLU::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::solve_retval<IncompleteLU, Rhs>(*this, b.derived());
+    }
+
+  protected:
+    FactorType m_lu;
+    bool m_isInitialized;
+};
+
+namespace internal {
+
+template<typename _MatrixType, typename Rhs>
+struct solve_retval<IncompleteLU<_MatrixType>, Rhs>
+  : solve_retval_base<IncompleteLU<_MatrixType>, Rhs>
+{
+  typedef IncompleteLU<_MatrixType> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_INCOMPLETE_LU_H
diff --git a/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IterationController.h b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IterationController.h
new file mode 100644
index 0000000..c9c1a4b
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IterationController.h
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+
+/* NOTE The class IterationController has been adapted from the iteration
+ *      class of the GMM++ and ITL libraries.
+ */
+
+//=======================================================================
+// Copyright (C) 1997-2001
+// Authors: Andrew Lumsdaine <lums at osl.iu.edu> 
+//          Lie-Quan Lee     <llee at osl.iu.edu>
+//
+// This file is part of the Iterative Template Library
+//
+// You should have received a copy of the License Agreement for the
+// Iterative Template Library along with the software;  see the
+// file LICENSE.  
+//
+// Permission to modify the code and to distribute modified code is
+// granted, provided the text of this NOTICE is retained, a notice that
+// the code was modified is included with the above COPYRIGHT NOTICE and
+// with the COPYRIGHT NOTICE in the LICENSE file, and that the LICENSE
+// file is distributed with the modified code.
+//
+// LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED.
+// By way of example, but not limitation, Licensor MAKES NO
+// REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
+// PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE COMPONENTS
+// OR DOCUMENTATION WILL NOT INFRINGE ANY PATENTS, COPYRIGHTS, TRADEMARKS
+// OR OTHER RIGHTS.
+//=======================================================================
+
+//========================================================================
+//
+// Copyright (C) 2002-2007 Yves Renard
+//
+// This file is a part of GETFEM++
+//
+// Getfem++ is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; version 2.1 of the License.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Lesser General Public License for more details.
+// You should have received a copy of the GNU Lesser General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301,
+// USA.
+//
+//========================================================================
+
+#include "../../../../Eigen/src/Core/util/NonMPL2.h"
+
+#ifndef EIGEN_ITERATION_CONTROLLER_H
+#define EIGEN_ITERATION_CONTROLLER_H
+
+namespace Eigen { 
+
+/** \ingroup IterativeSolvers_Module
+  * \class IterationController
+  *
+  * \brief Controls the iterations of the iterative solvers
+  *
+  * This class has been adapted from the iteration class of GMM++ and ITL libraries.
+  *
+  */
+class IterationController
+{
+  protected :
+    double m_rhsn;        ///< Right hand side norm
+    size_t m_maxiter;     ///< Max. number of iterations
+    int m_noise;          ///< if noise > 0 iterations are printed
+    double m_resmax;      ///< maximum residual
+    double m_resminreach, m_resadd;
+    size_t m_nit;         ///< iteration number
+    double m_res;         ///< last computed residual
+    bool m_written;
+    void (*m_callback)(const IterationController&);
+  public :
+
+    void init()
+    {
+      m_nit = 0; m_res = 0.0; m_written = false;
+      m_resminreach = 1E50; m_resadd = 0.0;
+      m_callback = 0;
+    }
+
+    IterationController(double r = 1.0E-8, int noi = 0, size_t mit = size_t(-1))
+      : m_rhsn(1.0), m_maxiter(mit), m_noise(noi), m_resmax(r) { init(); }
+
+    void operator ++(int) { m_nit++; m_written = false; m_resadd += m_res; }
+    void operator ++() { (*this)++; }
+
+    bool first() { return m_nit == 0; }
+
+    /* get/set the "noisyness" (verbosity) of the solvers */
+    int noiseLevel() const { return m_noise; }
+    void setNoiseLevel(int n) { m_noise = n; }
+    void reduceNoiseLevel() { if (m_noise > 0) m_noise--; }
+
+    double maxResidual() const { return m_resmax; }
+    void setMaxResidual(double r) { m_resmax = r; }
+
+    double residual() const { return m_res; }
+
+    /* change the user-definable callback, called after each iteration */
+    void setCallback(void (*t)(const IterationController&))
+    {
+      m_callback = t;
+    }
+
+    size_t iteration() const { return m_nit; }
+    void setIteration(size_t i) { m_nit = i; }
+
+    size_t maxIterarions() const { return m_maxiter; }
+    void setMaxIterations(size_t i) { m_maxiter = i; }
+
+    double rhsNorm() const { return m_rhsn; }
+    void setRhsNorm(double r) { m_rhsn = r; }
+
+    bool converged() const { return m_res <= m_rhsn * m_resmax; }
+    bool converged(double nr)
+    {
+      using std::abs;
+      m_res = abs(nr); 
+      m_resminreach = (std::min)(m_resminreach, m_res);
+      return converged();
+    }
+    template<typename VectorType> bool converged(const VectorType &v)
+    { return converged(v.squaredNorm()); }
+
+    bool finished(double nr)
+    {
+      if (m_callback) m_callback(*this);
+      if (m_noise > 0 && !m_written)
+      {
+        converged(nr);
+        m_written = true;
+      }
+      return (m_nit >= m_maxiter || converged(nr));
+    }
+    template <typename VectorType>
+    bool finished(const MatrixBase<VectorType> &v)
+    { return finished(double(v.squaredNorm())); }
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_ITERATION_CONTROLLER_H
diff --git a/include/eigen3/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/MINRES.h
new file mode 100644
index 0000000..0e56342
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/MINRES.h
@@ -0,0 +1,302 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Giacomo Po <gpo at ucla.edu>
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_MINRES_H_
+#define EIGEN_MINRES_H_
+
+
+namespace Eigen {
+    
+    namespace internal {
+        
+        /** \internal Low-level MINRES algorithm
+         * \param mat The matrix A
+         * \param rhs The right hand side vector b
+         * \param x On input and initial solution, on output the computed solution.
+         * \param precond A right preconditioner being able to efficiently solve for an
+         *                approximation of Ax=b (regardless of b)
+         * \param iters On input the max number of iteration, on output the number of performed iterations.
+         * \param tol_error On input the tolerance error, on output an estimation of the relative error.
+         */
+        template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+        EIGEN_DONT_INLINE
+        void minres(const MatrixType& mat, const Rhs& rhs, Dest& x,
+                    const Preconditioner& precond, int& iters,
+                    typename Dest::RealScalar& tol_error)
+        {
+            using std::sqrt;
+            typedef typename Dest::RealScalar RealScalar;
+            typedef typename Dest::Scalar Scalar;
+            typedef Matrix<Scalar,Dynamic,1> VectorType;
+
+            // initialize
+            const int maxIters(iters);  // initialize maxIters to iters
+            const int N(mat.cols());    // the size of the matrix
+            const RealScalar rhsNorm2(rhs.squaredNorm());
+            const RealScalar threshold2(tol_error*tol_error*rhsNorm2); // convergence threshold (compared to residualNorm2)
+            
+            // Initialize preconditioned Lanczos
+//            VectorType v_old(N); // will be initialized inside loop
+            VectorType v( VectorType::Zero(N) ); //initialize v
+            VectorType v_new(rhs-mat*x); //initialize v_new
+            RealScalar residualNorm2(v_new.squaredNorm());
+//            VectorType w(N); // will be initialized inside loop
+            VectorType w_new(precond.solve(v_new)); // initialize w_new
+//            RealScalar beta; // will be initialized inside loop
+            RealScalar beta_new2(v_new.dot(w_new));
+            eigen_assert(beta_new2 >= 0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
+            RealScalar beta_new(sqrt(beta_new2));
+            const RealScalar beta_one(beta_new);
+            v_new /= beta_new;
+            w_new /= beta_new;
+            // Initialize other variables
+            RealScalar c(1.0); // the cosine of the Givens rotation
+            RealScalar c_old(1.0);
+            RealScalar s(0.0); // the sine of the Givens rotation
+            RealScalar s_old(0.0); // the sine of the Givens rotation
+//            VectorType p_oold(N); // will be initialized in loop
+            VectorType p_old(VectorType::Zero(N)); // initialize p_old=0
+            VectorType p(p_old); // initialize p=0
+            RealScalar eta(1.0);
+                        
+            iters = 0; // reset iters
+            while ( iters < maxIters ){
+                
+                // Preconditioned Lanczos
+                /* Note that there are 4 variants on the Lanczos algorithm. These are
+                 * described in Paige, C. C. (1972). Computational variants of
+                 * the Lanczos method for the eigenproblem. IMA Journal of Applied
+                 * Mathematics, 10(3), 373–381. The current implementation corresponds 
+                 * to the case A(2,7) in the paper. It also corresponds to 
+                 * algorithm 6.14 in Y. Saad, Iterative Methods for Sparse Linear
+                 * Systems, 2003 p.173. For the preconditioned version see 
+                 * A. Greenbaum, Iterative Methods for Solving Linear Systems, SIAM (1987).
+                 */
+                const RealScalar beta(beta_new);
+//                v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter
+                const VectorType v_old(v); // NOT SURE IF CREATING v_old EVERY ITERATION IS EFFICIENT
+                v = v_new; // update
+//                w = w_new; // update
+                const VectorType w(w_new); // NOT SURE IF CREATING w EVERY ITERATION IS EFFICIENT
+                v_new.noalias() = mat*w - beta*v_old; // compute v_new
+                const RealScalar alpha = v_new.dot(w);
+                v_new -= alpha*v; // overwrite v_new
+                w_new = precond.solve(v_new); // overwrite w_new
+                beta_new2 = v_new.dot(w_new); // compute beta_new
+                eigen_assert(beta_new2 >= 0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
+                beta_new = sqrt(beta_new2); // compute beta_new
+                v_new /= beta_new; // overwrite v_new for next iteration
+                w_new /= beta_new; // overwrite w_new for next iteration
+                
+                // Givens rotation
+                const RealScalar r2 =s*alpha+c*c_old*beta; // s, s_old, c and c_old are still from previous iteration
+                const RealScalar r3 =s_old*beta; // s, s_old, c and c_old are still from previous iteration
+                const RealScalar r1_hat=c*alpha-c_old*s*beta;
+                const RealScalar r1 =sqrt( std::pow(r1_hat,2) + std::pow(beta_new,2) );
+                c_old = c; // store for next iteration
+                s_old = s; // store for next iteration
+                c=r1_hat/r1; // new cosine
+                s=beta_new/r1; // new sine
+                
+                // Update solution
+//                p_oold = p_old;
+                const VectorType p_oold(p_old); // NOT SURE IF CREATING p_oold EVERY ITERATION IS EFFICIENT
+                p_old = p;
+                p.noalias()=(w-r2*p_old-r3*p_oold) /r1; // IS NOALIAS REQUIRED?
+                x += beta_one*c*eta*p;
+                residualNorm2 *= s*s;
+                
+                if ( residualNorm2 < threshold2){
+                    break;
+                }
+                
+                eta=-s*eta; // update eta
+                iters++; // increment iteration number (for output purposes)
+            }
+            tol_error = std::sqrt(residualNorm2 / rhsNorm2); // return error. Note that this is the estimated error. The real error |Ax-b|/|b| may be slightly larger 
+        }
+        
+    }
+    
+    template< typename _MatrixType, int _UpLo=Lower,
+    typename _Preconditioner = IdentityPreconditioner>
+//    typename _Preconditioner = IdentityPreconditioner<typename _MatrixType::Scalar> > // preconditioner must be positive definite
+    class MINRES;
+    
+    namespace internal {
+        
+        template< typename _MatrixType, int _UpLo, typename _Preconditioner>
+        struct traits<MINRES<_MatrixType,_UpLo,_Preconditioner> >
+        {
+            typedef _MatrixType MatrixType;
+            typedef _Preconditioner Preconditioner;
+        };
+        
+    }
+    
+    /** \ingroup IterativeLinearSolvers_Module
+     * \brief A minimal residual solver for sparse symmetric problems
+     *
+     * This class allows to solve for A.x = b sparse linear problems using the MINRES algorithm
+     * of Paige and Saunders (1975). The sparse matrix A must be symmetric (possibly indefinite).
+     * The vectors x and b can be either dense or sparse.
+     *
+     * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+     * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
+     *               or Upper. Default is Lower.
+     * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+     *
+     * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+     * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+     * and NumTraits<Scalar>::epsilon() for the tolerance.
+     *
+     * This class can be used as the direct solver classes. Here is a typical usage example:
+     * \code
+     * int n = 10000;
+     * VectorXd x(n), b(n);
+     * SparseMatrix<double> A(n,n);
+     * // fill A and b
+     * MINRES<SparseMatrix<double> > mr;
+     * mr.compute(A);
+     * x = mr.solve(b);
+     * std::cout << "#iterations:     " << mr.iterations() << std::endl;
+     * std::cout << "estimated error: " << mr.error()      << std::endl;
+     * // update b, and solve again
+     * x = mr.solve(b);
+     * \endcode
+     *
+     * By default the iterations start with x=0 as an initial guess of the solution.
+     * One can control the start using the solveWithGuess() method. Here is a step by
+     * step execution example starting with a random guess and printing the evolution
+     * of the estimated error:
+     * * \code
+     * x = VectorXd::Random(n);
+     * mr.setMaxIterations(1);
+     * int i = 0;
+     * do {
+     *   x = mr.solveWithGuess(b,x);
+     *   std::cout << i << " : " << mr.error() << std::endl;
+     *   ++i;
+     * } while (mr.info()!=Success && i<100);
+     * \endcode
+     * Note that such a step by step excution is slightly slower.
+     *
+     * \sa class ConjugateGradient, BiCGSTAB, SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+     */
+    template< typename _MatrixType, int _UpLo, typename _Preconditioner>
+    class MINRES : public IterativeSolverBase<MINRES<_MatrixType,_UpLo,_Preconditioner> >
+    {
+        
+        typedef IterativeSolverBase<MINRES> Base;
+        using Base::mp_matrix;
+        using Base::m_error;
+        using Base::m_iterations;
+        using Base::m_info;
+        using Base::m_isInitialized;
+    public:
+        typedef _MatrixType MatrixType;
+        typedef typename MatrixType::Scalar Scalar;
+        typedef typename MatrixType::Index Index;
+        typedef typename MatrixType::RealScalar RealScalar;
+        typedef _Preconditioner Preconditioner;
+        
+        enum {UpLo = _UpLo};
+        
+    public:
+        
+        /** Default constructor. */
+        MINRES() : Base() {}
+        
+        /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+         *
+         * This constructor is a shortcut for the default constructor followed
+         * by a call to compute().
+         *
+         * \warning this class stores a reference to the matrix A as well as some
+         * precomputed values that depend on it. Therefore, if \a A is changed
+         * this class becomes invalid. Call compute() to update it with the new
+         * matrix A, or modify a copy of A.
+         */
+        MINRES(const MatrixType& A) : Base(A) {}
+        
+        /** Destructor. */
+        ~MINRES(){}
+		
+        /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
+         * \a x0 as an initial solution.
+         *
+         * \sa compute()
+         */
+        template<typename Rhs,typename Guess>
+        inline const internal::solve_retval_with_guess<MINRES, Rhs, Guess>
+        solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
+        {
+            eigen_assert(m_isInitialized && "MINRES is not initialized.");
+            eigen_assert(Base::rows()==b.rows()
+                         && "MINRES::solve(): invalid number of rows of the right hand side matrix b");
+            return internal::solve_retval_with_guess
+            <MINRES, Rhs, Guess>(*this, b.derived(), x0);
+        }
+        
+        /** \internal */
+        template<typename Rhs,typename Dest>
+        void _solveWithGuess(const Rhs& b, Dest& x) const
+        {
+            m_iterations = Base::maxIterations();
+            m_error = Base::m_tolerance;
+            
+            for(int j=0; j<b.cols(); ++j)
+            {
+                m_iterations = Base::maxIterations();
+                m_error = Base::m_tolerance;
+                
+                typename Dest::ColXpr xj(x,j);
+                internal::minres(mp_matrix->template selfadjointView<UpLo>(), b.col(j), xj,
+                                 Base::m_preconditioner, m_iterations, m_error);
+            }
+            
+            m_isInitialized = true;
+            m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
+        }
+        
+        /** \internal */
+        template<typename Rhs,typename Dest>
+        void _solve(const Rhs& b, Dest& x) const
+        {
+            x.setZero();
+            _solveWithGuess(b,x);
+        }
+        
+    protected:
+        
+    };
+    
+    namespace internal {
+        
+        template<typename _MatrixType, int _UpLo, typename _Preconditioner, typename Rhs>
+        struct solve_retval<MINRES<_MatrixType,_UpLo,_Preconditioner>, Rhs>
+        : solve_retval_base<MINRES<_MatrixType,_UpLo,_Preconditioner>, Rhs>
+        {
+            typedef MINRES<_MatrixType,_UpLo,_Preconditioner> Dec;
+            EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+            
+            template<typename Dest> void evalTo(Dest& dst) const
+            {
+                dec()._solve(rhs(),dst);
+            }
+        };
+        
+    } // end namespace internal
+    
+} // end namespace Eigen
+
+#endif // EIGEN_MINRES_H
+
diff --git a/include/eigen3/unsupported/Eigen/src/IterativeSolvers/Scaling.h b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/Scaling.h
new file mode 100644
index 0000000..4fd4392
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/IterativeSolvers/Scaling.h
@@ -0,0 +1,185 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Desire NUENTSA WAKAM <desire.nuentsa_wakam at inria.fr
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ITERSCALING_H
+#define EIGEN_ITERSCALING_H
+/**
+  * \ingroup IterativeSolvers_Module
+  * \brief iterative scaling algorithm to equilibrate rows and column norms in matrices
+  * 
+  * This class can be used as a preprocessing tool to accelerate the convergence of iterative methods 
+  * 
+  * This feature is  useful to limit the pivoting amount during LU/ILU factorization
+  * The  scaling strategy as presented here preserves the symmetry of the problem
+  * NOTE It is assumed that the matrix does not have empty row or column, 
+  * 
+  * Example with key steps 
+  * \code
+  * VectorXd x(n), b(n);
+  * SparseMatrix<double> A;
+  * // fill A and b;
+  * IterScaling<SparseMatrix<double> > scal; 
+  * // Compute the left and right scaling vectors. The matrix is equilibrated at output
+  * scal.computeRef(A); 
+  * // Scale the right hand side
+  * b = scal.LeftScaling().cwiseProduct(b); 
+  * // Now, solve the equilibrated linear system with any available solver
+  * 
+  * // Scale back the computed solution
+  * x = scal.RightScaling().cwiseProduct(x); 
+  * \endcode
+  * 
+  * \tparam _MatrixType the type of the matrix. It should be a real square sparsematrix
+  * 
+  * References : D. Ruiz and B. Ucar, A Symmetry Preserving Algorithm for Matrix Scaling, INRIA Research report RR-7552
+  * 
+  * \sa \ref IncompleteLUT 
+  */
+namespace Eigen {
+using std::abs; 
+template<typename _MatrixType>
+class IterScaling
+{
+  public:
+    typedef _MatrixType MatrixType; 
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::Index Index;
+    
+  public:
+    IterScaling() { init(); }
+    
+    IterScaling(const MatrixType& matrix)
+    {
+      init();
+      compute(matrix);
+    }
+    
+    ~IterScaling() { }
+    
+    /** 
+     * Compute the left and right diagonal matrices to scale the input matrix @p mat
+     * 
+     * FIXME This algorithm will be modified such that the diagonal elements are permuted on the diagonal. 
+     * 
+     * \sa LeftScaling() RightScaling()
+     */
+    void compute (const MatrixType& mat)
+    {
+      int m = mat.rows(); 
+      int n = mat.cols();
+      eigen_assert((m>0 && m == n) && "Please give a non - empty matrix");
+      m_left.resize(m); 
+      m_right.resize(n);
+      m_left.setOnes();
+      m_right.setOnes();
+      m_matrix = mat;
+      VectorXd Dr, Dc, DrRes, DcRes; // Temporary Left and right scaling vectors
+      Dr.resize(m); Dc.resize(n);
+      DrRes.resize(m); DcRes.resize(n);
+      double EpsRow = 1.0, EpsCol = 1.0;
+      int its = 0; 
+      do
+      { // Iterate until the infinite norm of each row and column is approximately 1
+        // Get the maximum value in each row and column
+        Dr.setZero(); Dc.setZero();
+        for (int k=0; k<m_matrix.outerSize(); ++k)
+        {
+          for (typename MatrixType::InnerIterator it(m_matrix, k); it; ++it)
+          {
+            if ( Dr(it.row()) < abs(it.value()) )
+              Dr(it.row()) = abs(it.value());
+            
+            if ( Dc(it.col()) < abs(it.value()) )
+              Dc(it.col()) = abs(it.value());
+          }
+        }
+        for (int i = 0; i < m; ++i) 
+        {
+          Dr(i) = std::sqrt(Dr(i));
+          Dc(i) = std::sqrt(Dc(i));
+        }
+        // Save the scaling factors 
+        for (int i = 0; i < m; ++i) 
+        {
+          m_left(i) /= Dr(i);
+          m_right(i) /= Dc(i);
+        }
+        // Scale the rows and the columns of the matrix
+        DrRes.setZero(); DcRes.setZero(); 
+        for (int k=0; k<m_matrix.outerSize(); ++k)
+        {
+          for (typename MatrixType::InnerIterator it(m_matrix, k); it; ++it)
+          {
+            it.valueRef() = it.value()/( Dr(it.row()) * Dc(it.col()) );
+            // Accumulate the norms of the row and column vectors   
+            if ( DrRes(it.row()) < abs(it.value()) )
+              DrRes(it.row()) = abs(it.value());
+            
+            if ( DcRes(it.col()) < abs(it.value()) )
+              DcRes(it.col()) = abs(it.value());
+          }
+        }  
+        DrRes.array() = (1-DrRes.array()).abs();
+        EpsRow = DrRes.maxCoeff();
+        DcRes.array() = (1-DcRes.array()).abs();
+        EpsCol = DcRes.maxCoeff();
+        its++;
+      }while ( (EpsRow >m_tol || EpsCol > m_tol) && (its < m_maxits) );
+      m_isInitialized = true;
+    }
+    /** Compute the left and right vectors to scale the vectors
+     * the input matrix is scaled with the computed vectors at output
+     * 
+     * \sa compute()
+     */
+    void computeRef (MatrixType& mat)
+    {
+      compute (mat);
+      mat = m_matrix;
+    }
+    /** Get the vector to scale the rows of the matrix 
+     */
+    VectorXd& LeftScaling()
+    {
+      return m_left;
+    }
+    
+    /** Get the vector to scale the columns of the matrix 
+     */
+    VectorXd& RightScaling()
+    {
+      return m_right;
+    }
+    
+    /** Set the tolerance for the convergence of the iterative scaling algorithm
+     */
+    void setTolerance(double tol)
+    {
+      m_tol = tol; 
+    }
+      
+  protected:
+    
+    void init()
+    {
+      m_tol = 1e-10;
+      m_maxits = 5;
+      m_isInitialized = false;
+    }
+    
+    MatrixType m_matrix;
+    mutable ComputationInfo m_info; 
+    bool m_isInitialized; 
+    VectorXd m_left; // Left scaling vector
+    VectorXd m_right; // m_right scaling vector
+    double m_tol; 
+    int m_maxits; // Maximum number of iterations allowed
+};
+}
+#endif
diff --git a/include/eigen3/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h b/include/eigen3/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
new file mode 100644
index 0000000..532896c
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
@@ -0,0 +1,244 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Kolja Brix <brix at igpm.rwth-aachen.de>
+// Copyright (C) 2011 Andreas Platen <andiplaten at gmx.de>
+// Copyright (C) 2012 Chen-Pang He <jdh8 at ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef KRONECKER_TENSOR_PRODUCT_H
+#define KRONECKER_TENSOR_PRODUCT_H
+
+namespace Eigen { 
+
+template<typename Scalar, int Options, typename Index> class SparseMatrix;
+
+/*!
+ * \brief Kronecker tensor product helper class for dense matrices
+ *
+ * This class is the return value of kroneckerProduct(MatrixBase,
+ * MatrixBase). Use the function rather than construct this class
+ * directly to avoid specifying template prarameters.
+ *
+ * \tparam Lhs  Type of the left-hand side, a matrix expression.
+ * \tparam Rhs  Type of the rignt-hand side, a matrix expression.
+ */
+template<typename Lhs, typename Rhs>
+class KroneckerProduct : public ReturnByValue<KroneckerProduct<Lhs,Rhs> >
+{
+  private:
+    typedef ReturnByValue<KroneckerProduct> Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::Index Index;
+
+  public:
+    /*! \brief Constructor. */
+    KroneckerProduct(const Lhs& A, const Rhs& B)
+      : m_A(A), m_B(B)
+    {}
+
+    /*! \brief Evaluate the Kronecker tensor product. */
+    template<typename Dest> void evalTo(Dest& dst) const;
+    
+    inline Index rows() const { return m_A.rows() * m_B.rows(); }
+    inline Index cols() const { return m_A.cols() * m_B.cols(); }
+
+    Scalar coeff(Index row, Index col) const
+    {
+      return m_A.coeff(row / m_B.rows(), col / m_B.cols()) *
+             m_B.coeff(row % m_B.rows(), col % m_B.cols());
+    }
+
+    Scalar coeff(Index i) const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(KroneckerProduct);
+      return m_A.coeff(i / m_A.size()) * m_B.coeff(i % m_A.size());
+    }
+
+  private:
+    typename Lhs::Nested m_A;
+    typename Rhs::Nested m_B;
+};
+
+/*!
+ * \brief Kronecker tensor product helper class for sparse matrices
+ *
+ * If at least one of the operands is a sparse matrix expression,
+ * then this class is returned and evaluates into a sparse matrix.
+ *
+ * This class is the return value of kroneckerProduct(EigenBase,
+ * EigenBase). Use the function rather than construct this class
+ * directly to avoid specifying template prarameters.
+ *
+ * \tparam Lhs  Type of the left-hand side, a matrix expression.
+ * \tparam Rhs  Type of the rignt-hand side, a matrix expression.
+ */
+template<typename Lhs, typename Rhs>
+class KroneckerProductSparse : public EigenBase<KroneckerProductSparse<Lhs,Rhs> >
+{
+  private:
+    typedef typename internal::traits<KroneckerProductSparse>::Index Index;
+
+  public:
+    /*! \brief Constructor. */
+    KroneckerProductSparse(const Lhs& A, const Rhs& B)
+      : m_A(A), m_B(B)
+    {}
+
+    /*! \brief Evaluate the Kronecker tensor product. */
+    template<typename Dest> void evalTo(Dest& dst) const;
+    
+    inline Index rows() const { return m_A.rows() * m_B.rows(); }
+    inline Index cols() const { return m_A.cols() * m_B.cols(); }
+
+    template<typename Scalar, int Options, typename Index>
+    operator SparseMatrix<Scalar, Options, Index>()
+    {
+      SparseMatrix<Scalar, Options, Index> result;
+      evalTo(result.derived());
+      return result;
+    }
+
+  private:
+    typename Lhs::Nested m_A;
+    typename Rhs::Nested m_B;
+};
+
+template<typename Lhs, typename Rhs>
+template<typename Dest>
+void KroneckerProduct<Lhs,Rhs>::evalTo(Dest& dst) const
+{
+  const int BlockRows = Rhs::RowsAtCompileTime,
+            BlockCols = Rhs::ColsAtCompileTime;
+  const Index Br = m_B.rows(),
+              Bc = m_B.cols();
+  for (Index i=0; i < m_A.rows(); ++i)
+    for (Index j=0; j < m_A.cols(); ++j)
+      Block<Dest,BlockRows,BlockCols>(dst,i*Br,j*Bc,Br,Bc) = m_A.coeff(i,j) * m_B;
+}
+
+template<typename Lhs, typename Rhs>
+template<typename Dest>
+void KroneckerProductSparse<Lhs,Rhs>::evalTo(Dest& dst) const
+{
+  const Index Br = m_B.rows(),
+              Bc = m_B.cols();
+  dst.resize(rows(),cols());
+  dst.resizeNonZeros(0);
+  dst.reserve(m_A.nonZeros() * m_B.nonZeros());
+
+  for (Index kA=0; kA < m_A.outerSize(); ++kA)
+  {
+    for (Index kB=0; kB < m_B.outerSize(); ++kB)
+    {
+      for (typename Lhs::InnerIterator itA(m_A,kA); itA; ++itA)
+      {
+        for (typename Rhs::InnerIterator itB(m_B,kB); itB; ++itB)
+        {
+          const Index i = itA.row() * Br + itB.row(),
+                      j = itA.col() * Bc + itB.col();
+          dst.insert(i,j) = itA.value() * itB.value();
+        }
+      }
+    }
+  }
+}
+
+namespace internal {
+
+template<typename _Lhs, typename _Rhs>
+struct traits<KroneckerProduct<_Lhs,_Rhs> >
+{
+  typedef typename remove_all<_Lhs>::type Lhs;
+  typedef typename remove_all<_Rhs>::type Rhs;
+  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+
+  enum {
+    Rows = size_at_compile_time<traits<Lhs>::RowsAtCompileTime, traits<Rhs>::RowsAtCompileTime>::ret,
+    Cols = size_at_compile_time<traits<Lhs>::ColsAtCompileTime, traits<Rhs>::ColsAtCompileTime>::ret,
+    MaxRows = size_at_compile_time<traits<Lhs>::MaxRowsAtCompileTime, traits<Rhs>::MaxRowsAtCompileTime>::ret,
+    MaxCols = size_at_compile_time<traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime>::ret,
+    CoeffReadCost = Lhs::CoeffReadCost + Rhs::CoeffReadCost + NumTraits<Scalar>::MulCost
+  };
+
+  typedef Matrix<Scalar,Rows,Cols> ReturnType;
+};
+
+template<typename _Lhs, typename _Rhs>
+struct traits<KroneckerProductSparse<_Lhs,_Rhs> >
+{
+  typedef MatrixXpr XprKind;
+  typedef typename remove_all<_Lhs>::type Lhs;
+  typedef typename remove_all<_Rhs>::type Rhs;
+  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename Lhs::Index, typename Rhs::Index>::type Index;
+
+  enum {
+    LhsFlags = Lhs::Flags,
+    RhsFlags = Rhs::Flags,
+
+    RowsAtCompileTime = size_at_compile_time<traits<Lhs>::RowsAtCompileTime, traits<Rhs>::RowsAtCompileTime>::ret,
+    ColsAtCompileTime = size_at_compile_time<traits<Lhs>::ColsAtCompileTime, traits<Rhs>::ColsAtCompileTime>::ret,
+    MaxRowsAtCompileTime = size_at_compile_time<traits<Lhs>::MaxRowsAtCompileTime, traits<Rhs>::MaxRowsAtCompileTime>::ret,
+    MaxColsAtCompileTime = size_at_compile_time<traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime>::ret,
+
+    EvalToRowMajor = (LhsFlags & RhsFlags & RowMajorBit),
+    RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),
+
+    Flags = ((LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
+          | EvalBeforeNestingBit | EvalBeforeAssigningBit,
+    CoeffReadCost = Dynamic
+  };
+};
+
+} // end namespace internal
+
+/*!
+ * \ingroup KroneckerProduct_Module
+ *
+ * Computes Kronecker tensor product of two dense matrices
+ *
+ * \warning If you want to replace a matrix by its Kronecker product
+ *          with some matrix, do \b NOT do this:
+ * \code
+ * A = kroneckerProduct(A,B); // bug!!! caused by aliasing effect
+ * \endcode
+ * instead, use eval() to work around this:
+ * \code
+ * A = kroneckerProduct(A,B).eval();
+ * \endcode
+ *
+ * \param a  Dense matrix a
+ * \param b  Dense matrix b
+ * \return   Kronecker tensor product of a and b
+ */
+template<typename A, typename B>
+KroneckerProduct<A,B> kroneckerProduct(const MatrixBase<A>& a, const MatrixBase<B>& b)
+{
+  return KroneckerProduct<A, B>(a.derived(), b.derived());
+}
+
+/*!
+ * \ingroup KroneckerProduct_Module
+ *
+ * Computes Kronecker tensor product of two matrices, at least one of
+ * which is sparse
+ *
+ * \param a  Dense/sparse matrix a
+ * \param b  Dense/sparse matrix b
+ * \return   Kronecker tensor product of a and b, stored in a sparse
+ *           matrix
+ */
+template<typename A, typename B>
+KroneckerProductSparse<A,B> kroneckerProduct(const EigenBase<A>& a, const EigenBase<B>& b)
+{
+  return KroneckerProductSparse<A,B>(a.derived(), b.derived());
+}
+
+} // end namespace Eigen
+
+#endif // KRONECKER_TENSOR_PRODUCT_H
diff --git a/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
new file mode 100644
index 0000000..6825a78
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
@@ -0,0 +1,451 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009, 2010 Jitse Niesen <jitse at maths.leeds.ac.uk>
+// Copyright (C) 2011 Chen-Pang He <jdh8 at ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_EXPONENTIAL
+#define EIGEN_MATRIX_EXPONENTIAL
+
+#include "StemFunction.h"
+
+namespace Eigen {
+
+/** \ingroup MatrixFunctions_Module
+  * \brief Class for computing the matrix exponential.
+  * \tparam MatrixType type of the argument of the exponential,
+  * expected to be an instantiation of the Matrix class template.
+  */
+template <typename MatrixType>
+class MatrixExponential {
+
+  public:
+
+    /** \brief Constructor.
+      * 
+      * The class stores a reference to \p M, so it should not be
+      * changed (or destroyed) before compute() is called.
+      *
+      * \param[in] M  matrix whose exponential is to be computed.
+      */
+    MatrixExponential(const MatrixType &M);
+
+    /** \brief Computes the matrix exponential.
+      *
+      * \param[out] result  the matrix exponential of \p M in the constructor.
+      */
+    template <typename ResultType> 
+    void compute(ResultType &result);
+
+  private:
+
+    // Prevent copying
+    MatrixExponential(const MatrixExponential&);
+    MatrixExponential& operator=(const MatrixExponential&);
+
+    /** \brief Compute the (3,3)-Padé approximant to the exponential.
+     *
+     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé
+     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+     *
+     *  \param[in] A   Argument of matrix exponential
+     */
+    void pade3(const MatrixType &A);
+
+    /** \brief Compute the (5,5)-Padé approximant to the exponential.
+     *
+     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé
+     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+     *
+     *  \param[in] A   Argument of matrix exponential
+     */
+    void pade5(const MatrixType &A);
+
+    /** \brief Compute the (7,7)-Padé approximant to the exponential.
+     *
+     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé
+     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+     *
+     *  \param[in] A   Argument of matrix exponential
+     */
+    void pade7(const MatrixType &A);
+
+    /** \brief Compute the (9,9)-Padé approximant to the exponential.
+     *
+     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé
+     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+     *
+     *  \param[in] A   Argument of matrix exponential
+     */
+    void pade9(const MatrixType &A);
+
+    /** \brief Compute the (13,13)-Padé approximant to the exponential.
+     *
+     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé
+     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+     *
+     *  \param[in] A   Argument of matrix exponential
+     */
+    void pade13(const MatrixType &A);
+
+    /** \brief Compute the (17,17)-Padé approximant to the exponential.
+     *
+     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé
+     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+     *
+     *  This function activates only if your long double is double-double or quadruple.
+     *
+     *  \param[in] A   Argument of matrix exponential
+     */
+    void pade17(const MatrixType &A);
+
+    /** \brief Compute Padé approximant to the exponential.
+     *
+     * Computes \c m_U, \c m_V and \c m_squarings such that
+     * \f$ (V+U)(V-U)^{-1} \f$ is a Padé of
+     * \f$ \exp(2^{-\mbox{squarings}}M) \f$ around \f$ M = 0 \f$. The
+     * degree of the Padé approximant and the value of
+     * squarings are chosen such that the approximation error is no
+     * more than the round-off error.
+     *
+     * The argument of this function should correspond with the (real
+     * part of) the entries of \c m_M.  It is used to select the
+     * correct implementation using overloading.
+     */
+    void computeUV(double);
+
+    /** \brief Compute Padé approximant to the exponential.
+     *
+     *  \sa computeUV(double);
+     */
+    void computeUV(float);
+    
+    /** \brief Compute Padé approximant to the exponential.
+     *
+     *  \sa computeUV(double);
+     */
+    void computeUV(long double);
+
+    typedef typename internal::traits<MatrixType>::Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef typename std::complex<RealScalar> ComplexScalar;
+
+    /** \brief Reference to matrix whose exponential is to be computed. */
+    typename internal::nested<MatrixType>::type m_M;
+
+    /** \brief Odd-degree terms in numerator of Padé approximant. */
+    MatrixType m_U;
+
+    /** \brief Even-degree terms in numerator of Padé approximant. */
+    MatrixType m_V;
+
+    /** \brief Used for temporary storage. */
+    MatrixType m_tmp1;
+
+    /** \brief Used for temporary storage. */
+    MatrixType m_tmp2;
+
+    /** \brief Identity matrix of the same size as \c m_M. */
+    MatrixType m_Id;
+
+    /** \brief Number of squarings required in the last step. */
+    int m_squarings;
+
+    /** \brief L1 norm of m_M. */
+    RealScalar m_l1norm;
+};
+
+template <typename MatrixType>
+MatrixExponential<MatrixType>::MatrixExponential(const MatrixType &M) :
+  m_M(M),
+  m_U(M.rows(),M.cols()),
+  m_V(M.rows(),M.cols()),
+  m_tmp1(M.rows(),M.cols()),
+  m_tmp2(M.rows(),M.cols()),
+  m_Id(MatrixType::Identity(M.rows(), M.cols())),
+  m_squarings(0),
+  m_l1norm(M.cwiseAbs().colwise().sum().maxCoeff())
+{
+  /* empty body */
+}
+
+template <typename MatrixType>
+template <typename ResultType> 
+void MatrixExponential<MatrixType>::compute(ResultType &result)
+{
+#if LDBL_MANT_DIG > 112 // rarely happens
+  if(sizeof(RealScalar) > 14) {
+    result = m_M.matrixFunction(StdStemFunctions<ComplexScalar>::exp);
+    return;
+  }
+#endif
+  computeUV(RealScalar());
+  m_tmp1 = m_U + m_V;   // numerator of Pade approximant
+  m_tmp2 = -m_U + m_V;  // denominator of Pade approximant
+  result = m_tmp2.partialPivLu().solve(m_tmp1);
+  for (int i=0; i<m_squarings; i++)
+    result *= result;   // undo scaling by repeated squaring
+}
+
+template <typename MatrixType>
+EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade3(const MatrixType &A)
+{
+  const RealScalar b[] = {120., 60., 12., 1.};
+  m_tmp1.noalias() = A * A;
+  m_tmp2 = b[3]*m_tmp1 + b[1]*m_Id;
+  m_U.noalias() = A * m_tmp2;
+  m_V = b[2]*m_tmp1 + b[0]*m_Id;
+}
+
+template <typename MatrixType>
+EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade5(const MatrixType &A)
+{
+  const RealScalar b[] = {30240., 15120., 3360., 420., 30., 1.};
+  MatrixType A2 = A * A;
+  m_tmp1.noalias() = A2 * A2;
+  m_tmp2 = b[5]*m_tmp1 + b[3]*A2 + b[1]*m_Id;
+  m_U.noalias() = A * m_tmp2;
+  m_V = b[4]*m_tmp1 + b[2]*A2 + b[0]*m_Id;
+}
+
+template <typename MatrixType>
+EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade7(const MatrixType &A)
+{
+  const RealScalar b[] = {17297280., 8648640., 1995840., 277200., 25200., 1512., 56., 1.};
+  MatrixType A2 = A * A;
+  MatrixType A4 = A2 * A2;
+  m_tmp1.noalias() = A4 * A2;
+  m_tmp2 = b[7]*m_tmp1 + b[5]*A4 + b[3]*A2 + b[1]*m_Id;
+  m_U.noalias() = A * m_tmp2;
+  m_V = b[6]*m_tmp1 + b[4]*A4 + b[2]*A2 + b[0]*m_Id;
+}
+
+template <typename MatrixType>
+EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade9(const MatrixType &A)
+{
+  const RealScalar b[] = {17643225600., 8821612800., 2075673600., 302702400., 30270240.,
+		      2162160., 110880., 3960., 90., 1.};
+  MatrixType A2 = A * A;
+  MatrixType A4 = A2 * A2;
+  MatrixType A6 = A4 * A2;
+  m_tmp1.noalias() = A6 * A2;
+  m_tmp2 = b[9]*m_tmp1 + b[7]*A6 + b[5]*A4 + b[3]*A2 + b[1]*m_Id;
+  m_U.noalias() = A * m_tmp2;
+  m_V = b[8]*m_tmp1 + b[6]*A6 + b[4]*A4 + b[2]*A2 + b[0]*m_Id;
+}
+
+template <typename MatrixType>
+EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade13(const MatrixType &A)
+{
+  const RealScalar b[] = {64764752532480000., 32382376266240000., 7771770303897600.,
+		      1187353796428800., 129060195264000., 10559470521600., 670442572800.,
+		      33522128640., 1323241920., 40840800., 960960., 16380., 182., 1.};
+  MatrixType A2 = A * A;
+  MatrixType A4 = A2 * A2;
+  m_tmp1.noalias() = A4 * A2;
+  m_V = b[13]*m_tmp1 + b[11]*A4 + b[9]*A2; // used for temporary storage
+  m_tmp2.noalias() = m_tmp1 * m_V;
+  m_tmp2 += b[7]*m_tmp1 + b[5]*A4 + b[3]*A2 + b[1]*m_Id;
+  m_U.noalias() = A * m_tmp2;
+  m_tmp2 = b[12]*m_tmp1 + b[10]*A4 + b[8]*A2;
+  m_V.noalias() = m_tmp1 * m_tmp2;
+  m_V += b[6]*m_tmp1 + b[4]*A4 + b[2]*A2 + b[0]*m_Id;
+}
+
+#if LDBL_MANT_DIG > 64
+template <typename MatrixType>
+EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade17(const MatrixType &A)
+{
+  const RealScalar b[] = {830034394580628357120000.L, 415017197290314178560000.L,
+		      100610229646136770560000.L, 15720348382208870400000.L,
+		      1774878043152614400000.L, 153822763739893248000.L, 10608466464820224000.L,
+		      595373117923584000.L, 27563570274240000.L, 1060137318240000.L,
+		      33924394183680.L, 899510451840.L, 19554575040.L, 341863200.L, 4651200.L,
+		      46512.L, 306.L, 1.L};
+  MatrixType A2 = A * A;
+  MatrixType A4 = A2 * A2;
+  MatrixType A6 = A4 * A2;
+  m_tmp1.noalias() = A4 * A4;
+  m_V = b[17]*m_tmp1 + b[15]*A6 + b[13]*A4 + b[11]*A2; // used for temporary storage
+  m_tmp2.noalias() = m_tmp1 * m_V;
+  m_tmp2 += b[9]*m_tmp1 + b[7]*A6 + b[5]*A4 + b[3]*A2 + b[1]*m_Id;
+  m_U.noalias() = A * m_tmp2;
+  m_tmp2 = b[16]*m_tmp1 + b[14]*A6 + b[12]*A4 + b[10]*A2;
+  m_V.noalias() = m_tmp1 * m_tmp2;
+  m_V += b[8]*m_tmp1 + b[6]*A6 + b[4]*A4 + b[2]*A2 + b[0]*m_Id;
+}
+#endif
+
+template <typename MatrixType>
+void MatrixExponential<MatrixType>::computeUV(float)
+{
+  using std::frexp;
+  using std::pow;
+  if (m_l1norm < 4.258730016922831e-001) {
+    pade3(m_M);
+  } else if (m_l1norm < 1.880152677804762e+000) {
+    pade5(m_M);
+  } else {
+    const float maxnorm = 3.925724783138660f;
+    frexp(m_l1norm / maxnorm, &m_squarings);
+    if (m_squarings < 0) m_squarings = 0;
+    MatrixType A = m_M / pow(Scalar(2), m_squarings);
+    pade7(A);
+  }
+}
+
+template <typename MatrixType>
+void MatrixExponential<MatrixType>::computeUV(double)
+{
+  using std::frexp;
+  using std::pow;
+  if (m_l1norm < 1.495585217958292e-002) {
+    pade3(m_M);
+  } else if (m_l1norm < 2.539398330063230e-001) {
+    pade5(m_M);
+  } else if (m_l1norm < 9.504178996162932e-001) {
+    pade7(m_M);
+  } else if (m_l1norm < 2.097847961257068e+000) {
+    pade9(m_M);
+  } else {
+    const double maxnorm = 5.371920351148152;
+    frexp(m_l1norm / maxnorm, &m_squarings);
+    if (m_squarings < 0) m_squarings = 0;
+    MatrixType A = m_M / pow(Scalar(2), m_squarings);
+    pade13(A);
+  }
+}
+
+template <typename MatrixType>
+void MatrixExponential<MatrixType>::computeUV(long double)
+{
+  using std::frexp;
+  using std::pow;
+#if   LDBL_MANT_DIG == 53   // double precision
+  computeUV(double());
+#elif LDBL_MANT_DIG <= 64   // extended precision
+  if (m_l1norm < 4.1968497232266989671e-003L) {
+    pade3(m_M);
+  } else if (m_l1norm < 1.1848116734693823091e-001L) {
+    pade5(m_M);
+  } else if (m_l1norm < 5.5170388480686700274e-001L) {
+    pade7(m_M);
+  } else if (m_l1norm < 1.3759868875587845383e+000L) {
+    pade9(m_M);
+  } else {
+    const long double maxnorm = 4.0246098906697353063L;
+    frexp(m_l1norm / maxnorm, &m_squarings);
+    if (m_squarings < 0) m_squarings = 0;
+    MatrixType A = m_M / pow(Scalar(2), m_squarings);
+    pade13(A);
+  }
+#elif LDBL_MANT_DIG <= 106  // double-double
+  if (m_l1norm < 3.2787892205607026992947488108213e-005L) {
+    pade3(m_M);
+  } else if (m_l1norm < 6.4467025060072760084130906076332e-003L) {
+    pade5(m_M);
+  } else if (m_l1norm < 6.8988028496595374751374122881143e-002L) {
+    pade7(m_M);
+  } else if (m_l1norm < 2.7339737518502231741495857201670e-001L) {
+    pade9(m_M);
+  } else if (m_l1norm < 1.3203382096514474905666448850278e+000L) {
+    pade13(m_M);
+  } else {
+    const long double maxnorm = 3.2579440895405400856599663723517L;
+    frexp(m_l1norm / maxnorm, &m_squarings);
+    if (m_squarings < 0) m_squarings = 0;
+    MatrixType A = m_M / pow(Scalar(2), m_squarings);
+    pade17(A);
+  }
+#elif LDBL_MANT_DIG <= 112  // quadruple precison
+  if (m_l1norm < 1.639394610288918690547467954466970e-005L) {
+    pade3(m_M);
+  } else if (m_l1norm < 4.253237712165275566025884344433009e-003L) {
+    pade5(m_M);
+  } else if (m_l1norm < 5.125804063165764409885122032933142e-002L) {
+    pade7(m_M);
+  } else if (m_l1norm < 2.170000765161155195453205651889853e-001L) {
+    pade9(m_M);
+  } else if (m_l1norm < 1.125358383453143065081397882891878e+000L) {
+    pade13(m_M);
+  } else {
+    const long double maxnorm = 2.884233277829519311757165057717815L;
+    frexp(m_l1norm / maxnorm, &m_squarings);
+    if (m_squarings < 0) m_squarings = 0;
+    MatrixType A = m_M / pow(Scalar(2), m_squarings);
+    pade17(A);
+  }
+#else
+  // this case should be handled in compute()
+  eigen_assert(false && "Bug in MatrixExponential"); 
+#endif  // LDBL_MANT_DIG
+}
+
+/** \ingroup MatrixFunctions_Module
+  *
+  * \brief Proxy for the matrix exponential of some matrix (expression).
+  *
+  * \tparam Derived  Type of the argument to the matrix exponential.
+  *
+  * This class holds the argument to the matrix exponential until it
+  * is assigned or evaluated for some other reason (so the argument
+  * should not be changed in the meantime). It is the return type of
+  * MatrixBase::exp() and most of the time this is the only way it is
+  * used.
+  */
+template<typename Derived> struct MatrixExponentialReturnValue
+: public ReturnByValue<MatrixExponentialReturnValue<Derived> >
+{
+    typedef typename Derived::Index Index;
+  public:
+    /** \brief Constructor.
+      *
+      * \param[in] src %Matrix (expression) forming the argument of the
+      * matrix exponential.
+      */
+    MatrixExponentialReturnValue(const Derived& src) : m_src(src) { }
+
+    /** \brief Compute the matrix exponential.
+      *
+      * \param[out] result the matrix exponential of \p src in the
+      * constructor.
+      */
+    template <typename ResultType>
+    inline void evalTo(ResultType& result) const
+    {
+      const typename Derived::PlainObject srcEvaluated = m_src.eval();
+      MatrixExponential<typename Derived::PlainObject> me(srcEvaluated);
+      me.compute(result);
+    }
+
+    Index rows() const { return m_src.rows(); }
+    Index cols() const { return m_src.cols(); }
+
+  protected:
+    const Derived& m_src;
+  private:
+    MatrixExponentialReturnValue& operator=(const MatrixExponentialReturnValue&);
+};
+
+namespace internal {
+template<typename Derived>
+struct traits<MatrixExponentialReturnValue<Derived> >
+{
+  typedef typename Derived::PlainObject ReturnType;
+};
+}
+
+template <typename Derived>
+const MatrixExponentialReturnValue<Derived> MatrixBase<Derived>::exp() const
+{
+  eigen_assert(rows() == cols());
+  return MatrixExponentialReturnValue<Derived>(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_EXPONENTIAL
diff --git a/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
new file mode 100644
index 0000000..7d42664
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
@@ -0,0 +1,591 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2011 Jitse Niesen <jitse at maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_FUNCTION
+#define EIGEN_MATRIX_FUNCTION
+
+#include "StemFunction.h"
+#include "MatrixFunctionAtomic.h"
+
+
+namespace Eigen { 
+
+/** \ingroup MatrixFunctions_Module
+  * \brief Class for computing matrix functions.
+  * \tparam  MatrixType  type of the argument of the matrix function,
+  *                      expected to be an instantiation of the Matrix class template.
+  * \tparam  AtomicType  type for computing matrix function of atomic blocks.
+  * \tparam  IsComplex   used internally to select correct specialization.
+  *
+  * This class implements the Schur-Parlett algorithm for computing matrix functions. The spectrum of the
+  * matrix is divided in clustered of eigenvalues that lies close together. This class delegates the
+  * computation of the matrix function on every block corresponding to these clusters to an object of type
+  * \p AtomicType and uses these results to compute the matrix function of the whole matrix. The class
+  * \p AtomicType should have a \p compute() member function for computing the matrix function of a block.
+  *
+  * \sa class MatrixFunctionAtomic, class MatrixLogarithmAtomic
+  */
+template <typename MatrixType, 
+	  typename AtomicType,  
+          int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
+class MatrixFunction
+{  
+  public:
+
+    /** \brief Constructor. 
+      *
+      * \param[in]  A       argument of matrix function, should be a square matrix.
+      * \param[in]  atomic  class for computing matrix function of atomic blocks.
+      *
+      * The class stores references to \p A and \p atomic, so they should not be
+      * changed (or destroyed) before compute() is called.
+      */
+    MatrixFunction(const MatrixType& A, AtomicType& atomic);
+
+    /** \brief Compute the matrix function.
+      *
+      * \param[out] result  the function \p f applied to \p A, as
+      * specified in the constructor.
+      *
+      * See MatrixBase::matrixFunction() for details on how this computation
+      * is implemented.
+      */
+    template <typename ResultType> 
+    void compute(ResultType &result);    
+};
+
+
+/** \internal \ingroup MatrixFunctions_Module 
+  * \brief Partial specialization of MatrixFunction for real matrices
+  */
+template <typename MatrixType, typename AtomicType>
+class MatrixFunction<MatrixType, AtomicType, 0>
+{  
+  private:
+
+    typedef internal::traits<MatrixType> Traits;
+    typedef typename Traits::Scalar Scalar;
+    static const int Rows = Traits::RowsAtCompileTime;
+    static const int Cols = Traits::ColsAtCompileTime;
+    static const int Options = MatrixType::Options;
+    static const int MaxRows = Traits::MaxRowsAtCompileTime;
+    static const int MaxCols = Traits::MaxColsAtCompileTime;
+
+    typedef std::complex<Scalar> ComplexScalar;
+    typedef Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols> ComplexMatrix;
+
+  public:
+
+    /** \brief Constructor. 
+      *
+      * \param[in]  A       argument of matrix function, should be a square matrix.
+      * \param[in]  atomic  class for computing matrix function of atomic blocks.
+      */
+    MatrixFunction(const MatrixType& A, AtomicType& atomic) : m_A(A), m_atomic(atomic) { }
+
+    /** \brief Compute the matrix function.
+      *
+      * \param[out] result  the function \p f applied to \p A, as
+      * specified in the constructor.
+      *
+      * This function converts the real matrix \c A to a complex matrix,
+      * uses MatrixFunction<MatrixType,1> and then converts the result back to
+      * a real matrix.
+      */
+    template <typename ResultType>
+    void compute(ResultType& result) 
+    {
+      ComplexMatrix CA = m_A.template cast<ComplexScalar>();
+      ComplexMatrix Cresult;
+      MatrixFunction<ComplexMatrix, AtomicType> mf(CA, m_atomic);
+      mf.compute(Cresult);
+      result = Cresult.real();
+    }
+
+  private:
+    typename internal::nested<MatrixType>::type m_A; /**< \brief Reference to argument of matrix function. */
+    AtomicType& m_atomic; /**< \brief Class for computing matrix function of atomic blocks. */
+
+    MatrixFunction& operator=(const MatrixFunction&);
+};
+
+      
+/** \internal \ingroup MatrixFunctions_Module 
+  * \brief Partial specialization of MatrixFunction for complex matrices
+  */
+template <typename MatrixType, typename AtomicType>
+class MatrixFunction<MatrixType, AtomicType, 1>
+{
+  private:
+
+    typedef internal::traits<MatrixType> Traits;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::Index Index;
+    static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
+    static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
+    static const int Options = MatrixType::Options;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef Matrix<Scalar, Traits::RowsAtCompileTime, 1> VectorType;
+    typedef Matrix<Index, Traits::RowsAtCompileTime, 1> IntVectorType;
+    typedef Matrix<Index, Dynamic, 1> DynamicIntVectorType;
+    typedef std::list<Scalar> Cluster;
+    typedef std::list<Cluster> ListOfClusters;
+    typedef Matrix<Scalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+
+  public:
+
+    MatrixFunction(const MatrixType& A, AtomicType& atomic);
+    template <typename ResultType> void compute(ResultType& result);
+
+  private:
+
+    void computeSchurDecomposition();
+    void partitionEigenvalues();
+    typename ListOfClusters::iterator findCluster(Scalar key);
+    void computeClusterSize();
+    void computeBlockStart();
+    void constructPermutation();
+    void permuteSchur();
+    void swapEntriesInSchur(Index index);
+    void computeBlockAtomic();
+    Block<MatrixType> block(MatrixType& A, Index i, Index j);
+    void computeOffDiagonal();
+    DynMatrixType solveTriangularSylvester(const DynMatrixType& A, const DynMatrixType& B, const DynMatrixType& C);
+
+    typename internal::nested<MatrixType>::type m_A; /**< \brief Reference to argument of matrix function. */
+    AtomicType& m_atomic; /**< \brief Class for computing matrix function of atomic blocks. */
+    MatrixType m_T; /**< \brief Triangular part of Schur decomposition */
+    MatrixType m_U; /**< \brief Unitary part of Schur decomposition */
+    MatrixType m_fT; /**< \brief %Matrix function applied to #m_T */
+    ListOfClusters m_clusters; /**< \brief Partition of eigenvalues into clusters of ei'vals "close" to each other */
+    DynamicIntVectorType m_eivalToCluster; /**< \brief m_eivalToCluster[i] = j means i-th ei'val is in j-th cluster */
+    DynamicIntVectorType m_clusterSize; /**< \brief Number of eigenvalues in each clusters  */
+    DynamicIntVectorType m_blockStart; /**< \brief Row index at which block corresponding to i-th cluster starts */
+    IntVectorType m_permutation; /**< \brief Permutation which groups ei'vals in the same cluster together */
+
+    /** \brief Maximum distance allowed between eigenvalues to be considered "close".
+      *
+      * This is morally a \c static \c const \c Scalar, but only
+      * integers can be static constant class members in C++. The
+      * separation constant is set to 0.1, a value taken from the
+      * paper by Davies and Higham. */
+    static const RealScalar separation() { return static_cast<RealScalar>(0.1); }
+
+    MatrixFunction& operator=(const MatrixFunction&);
+};
+
+/** \brief Constructor. 
+ *
+ * \param[in]  A       argument of matrix function, should be a square matrix.
+ * \param[in]  atomic  class for computing matrix function of atomic blocks.
+ */
+template <typename MatrixType, typename AtomicType>
+MatrixFunction<MatrixType,AtomicType,1>::MatrixFunction(const MatrixType& A, AtomicType& atomic)
+  : m_A(A), m_atomic(atomic)
+{
+  /* empty body */
+}
+
+/** \brief Compute the matrix function.
+  *
+  * \param[out] result  the function \p f applied to \p A, as
+  * specified in the constructor.
+  */
+template <typename MatrixType, typename AtomicType>
+template <typename ResultType>
+void MatrixFunction<MatrixType,AtomicType,1>::compute(ResultType& result) 
+{
+  computeSchurDecomposition();
+  partitionEigenvalues();
+  computeClusterSize();
+  computeBlockStart();
+  constructPermutation();
+  permuteSchur();
+  computeBlockAtomic();
+  computeOffDiagonal();
+  result = m_U * (m_fT.template triangularView<Upper>() * m_U.adjoint());
+}
+
+/** \brief Store the Schur decomposition of #m_A in #m_T and #m_U */
+template <typename MatrixType, typename AtomicType>
+void MatrixFunction<MatrixType,AtomicType,1>::computeSchurDecomposition()
+{
+  const ComplexSchur<MatrixType> schurOfA(m_A);  
+  m_T = schurOfA.matrixT();
+  m_U = schurOfA.matrixU();
+}
+
+/** \brief Partition eigenvalues in clusters of ei'vals close to each other
+  * 
+  * This function computes #m_clusters. This is a partition of the
+  * eigenvalues of #m_T in clusters, such that
+  * # Any eigenvalue in a certain cluster is at most separation() away
+  *   from another eigenvalue in the same cluster.
+  * # The distance between two eigenvalues in different clusters is
+  *   more than separation().
+  * The implementation follows Algorithm 4.1 in the paper of Davies
+  * and Higham. 
+  */
+template <typename MatrixType, typename AtomicType>
+void MatrixFunction<MatrixType,AtomicType,1>::partitionEigenvalues()
+{
+  using std::abs;
+  const Index rows = m_T.rows();
+  VectorType diag = m_T.diagonal(); // contains eigenvalues of A
+
+  for (Index i=0; i<rows; ++i) {
+    // Find set containing diag(i), adding a new set if necessary
+    typename ListOfClusters::iterator qi = findCluster(diag(i));
+    if (qi == m_clusters.end()) {
+      Cluster l;
+      l.push_back(diag(i));
+      m_clusters.push_back(l);
+      qi = m_clusters.end();
+      --qi;
+    }
+
+    // Look for other element to add to the set
+    for (Index j=i+1; j<rows; ++j) {
+      if (abs(diag(j) - diag(i)) <= separation() && std::find(qi->begin(), qi->end(), diag(j)) == qi->end()) {
+        typename ListOfClusters::iterator qj = findCluster(diag(j));
+        if (qj == m_clusters.end()) {
+          qi->push_back(diag(j));
+        } else {
+          qi->insert(qi->end(), qj->begin(), qj->end());
+          m_clusters.erase(qj);
+        }
+      }
+    }
+  }
+}
+
+/** \brief Find cluster in #m_clusters containing some value 
+  * \param[in] key Value to find
+  * \returns Iterator to cluster containing \c key, or
+  * \c m_clusters.end() if no cluster in m_clusters contains \c key.
+  */
+template <typename MatrixType, typename AtomicType>
+typename MatrixFunction<MatrixType,AtomicType,1>::ListOfClusters::iterator MatrixFunction<MatrixType,AtomicType,1>::findCluster(Scalar key)
+{
+  typename Cluster::iterator j;
+  for (typename ListOfClusters::iterator i = m_clusters.begin(); i != m_clusters.end(); ++i) {
+    j = std::find(i->begin(), i->end(), key);
+    if (j != i->end())
+      return i;
+  }
+  return m_clusters.end();
+}
+
+/** \brief Compute #m_clusterSize and #m_eivalToCluster using #m_clusters */
+template <typename MatrixType, typename AtomicType>
+void MatrixFunction<MatrixType,AtomicType,1>::computeClusterSize()
+{
+  const Index rows = m_T.rows();
+  VectorType diag = m_T.diagonal(); 
+  const Index numClusters = static_cast<Index>(m_clusters.size());
+
+  m_clusterSize.setZero(numClusters);
+  m_eivalToCluster.resize(rows);
+  Index clusterIndex = 0;
+  for (typename ListOfClusters::const_iterator cluster = m_clusters.begin(); cluster != m_clusters.end(); ++cluster) {
+    for (Index i = 0; i < diag.rows(); ++i) {
+      if (std::find(cluster->begin(), cluster->end(), diag(i)) != cluster->end()) {
+        ++m_clusterSize[clusterIndex];
+        m_eivalToCluster[i] = clusterIndex;
+      }
+    }
+    ++clusterIndex;
+  }
+}
+
+/** \brief Compute #m_blockStart using #m_clusterSize */
+template <typename MatrixType, typename AtomicType>
+void MatrixFunction<MatrixType,AtomicType,1>::computeBlockStart()
+{
+  m_blockStart.resize(m_clusterSize.rows());
+  m_blockStart(0) = 0;
+  for (Index i = 1; i < m_clusterSize.rows(); i++) {
+    m_blockStart(i) = m_blockStart(i-1) + m_clusterSize(i-1);
+  }
+}
+
+/** \brief Compute #m_permutation using #m_eivalToCluster and #m_blockStart */
+template <typename MatrixType, typename AtomicType>
+void MatrixFunction<MatrixType,AtomicType,1>::constructPermutation()
+{
+  DynamicIntVectorType indexNextEntry = m_blockStart;
+  m_permutation.resize(m_T.rows());
+  for (Index i = 0; i < m_T.rows(); i++) {
+    Index cluster = m_eivalToCluster[i];
+    m_permutation[i] = indexNextEntry[cluster];
+    ++indexNextEntry[cluster];
+  }
+}  
+
+/** \brief Permute Schur decomposition in #m_U and #m_T according to #m_permutation */
+template <typename MatrixType, typename AtomicType>
+void MatrixFunction<MatrixType,AtomicType,1>::permuteSchur()
+{
+  IntVectorType p = m_permutation;
+  for (Index i = 0; i < p.rows() - 1; i++) {
+    Index j;
+    for (j = i; j < p.rows(); j++) {
+      if (p(j) == i) break;
+    }
+    eigen_assert(p(j) == i);
+    for (Index k = j-1; k >= i; k--) {
+      swapEntriesInSchur(k);
+      std::swap(p.coeffRef(k), p.coeffRef(k+1));
+    }
+  }
+}
+
+/** \brief Swap rows \a index and \a index+1 in Schur decomposition in #m_U and #m_T */
+template <typename MatrixType, typename AtomicType>
+void MatrixFunction<MatrixType,AtomicType,1>::swapEntriesInSchur(Index index)
+{
+  JacobiRotation<Scalar> rotation;
+  rotation.makeGivens(m_T(index, index+1), m_T(index+1, index+1) - m_T(index, index));
+  m_T.applyOnTheLeft(index, index+1, rotation.adjoint());
+  m_T.applyOnTheRight(index, index+1, rotation);
+  m_U.applyOnTheRight(index, index+1, rotation);
+}  
+
+/** \brief Compute block diagonal part of #m_fT.
+  *
+  * This routine computes the matrix function applied to the block diagonal part of #m_T, with the blocking
+  * given by #m_blockStart. The matrix function of each diagonal block is computed by #m_atomic. The
+  * off-diagonal parts of #m_fT are set to zero.
+  */
+template <typename MatrixType, typename AtomicType>
+void MatrixFunction<MatrixType,AtomicType,1>::computeBlockAtomic()
+{ 
+  m_fT.resize(m_T.rows(), m_T.cols());
+  m_fT.setZero();
+  for (Index i = 0; i < m_clusterSize.rows(); ++i) {
+    block(m_fT, i, i) = m_atomic.compute(block(m_T, i, i));
+  }
+}
+
+/** \brief Return block of matrix according to blocking given by #m_blockStart */
+template <typename MatrixType, typename AtomicType>
+Block<MatrixType> MatrixFunction<MatrixType,AtomicType,1>::block(MatrixType& A, Index i, Index j)
+{
+  return A.block(m_blockStart(i), m_blockStart(j), m_clusterSize(i), m_clusterSize(j));
+}
+
+/** \brief Compute part of #m_fT above block diagonal.
+  *
+  * This routine assumes that the block diagonal part of #m_fT (which
+  * equals the matrix function applied to #m_T) has already been computed and computes
+  * the part above the block diagonal. The part below the diagonal is
+  * zero, because #m_T is upper triangular.
+  */
+template <typename MatrixType, typename AtomicType>
+void MatrixFunction<MatrixType,AtomicType,1>::computeOffDiagonal()
+{ 
+  for (Index diagIndex = 1; diagIndex < m_clusterSize.rows(); diagIndex++) {
+    for (Index blockIndex = 0; blockIndex < m_clusterSize.rows() - diagIndex; blockIndex++) {
+      // compute (blockIndex, blockIndex+diagIndex) block
+      DynMatrixType A = block(m_T, blockIndex, blockIndex);
+      DynMatrixType B = -block(m_T, blockIndex+diagIndex, blockIndex+diagIndex);
+      DynMatrixType C = block(m_fT, blockIndex, blockIndex) * block(m_T, blockIndex, blockIndex+diagIndex);
+      C -= block(m_T, blockIndex, blockIndex+diagIndex) * block(m_fT, blockIndex+diagIndex, blockIndex+diagIndex);
+      for (Index k = blockIndex + 1; k < blockIndex + diagIndex; k++) {
+	C += block(m_fT, blockIndex, k) * block(m_T, k, blockIndex+diagIndex);
+	C -= block(m_T, blockIndex, k) * block(m_fT, k, blockIndex+diagIndex);
+      }
+      block(m_fT, blockIndex, blockIndex+diagIndex) = solveTriangularSylvester(A, B, C);
+    }
+  }
+}
+
+/** \brief Solve a triangular Sylvester equation AX + XB = C 
+  *
+  * \param[in]  A  the matrix A; should be square and upper triangular
+  * \param[in]  B  the matrix B; should be square and upper triangular
+  * \param[in]  C  the matrix C; should have correct size.
+  *
+  * \returns the solution X.
+  *
+  * If A is m-by-m and B is n-by-n, then both C and X are m-by-n. 
+  * The (i,j)-th component of the Sylvester equation is
+  * \f[ 
+  *     \sum_{k=i}^m A_{ik} X_{kj} + \sum_{k=1}^j X_{ik} B_{kj} = C_{ij}. 
+  * \f]
+  * This can be re-arranged to yield:
+  * \f[ 
+  *     X_{ij} = \frac{1}{A_{ii} + B_{jj}} \Bigl( C_{ij}
+  *     - \sum_{k=i+1}^m A_{ik} X_{kj} - \sum_{k=1}^{j-1} X_{ik} B_{kj} \Bigr).
+  * \f]
+  * It is assumed that A and B are such that the numerator is never
+  * zero (otherwise the Sylvester equation does not have a unique
+  * solution). In that case, these equations can be evaluated in the
+  * order \f$ i=m,\ldots,1 \f$ and \f$ j=1,\ldots,n \f$.
+  */
+template <typename MatrixType, typename AtomicType>
+typename MatrixFunction<MatrixType,AtomicType,1>::DynMatrixType MatrixFunction<MatrixType,AtomicType,1>::solveTriangularSylvester(
+  const DynMatrixType& A, 
+  const DynMatrixType& B, 
+  const DynMatrixType& C)
+{
+  eigen_assert(A.rows() == A.cols());
+  eigen_assert(A.isUpperTriangular());
+  eigen_assert(B.rows() == B.cols());
+  eigen_assert(B.isUpperTriangular());
+  eigen_assert(C.rows() == A.rows());
+  eigen_assert(C.cols() == B.rows());
+
+  Index m = A.rows();
+  Index n = B.rows();
+  DynMatrixType X(m, n);
+
+  for (Index i = m - 1; i >= 0; --i) {
+    for (Index j = 0; j < n; ++j) {
+
+      // Compute AX = \sum_{k=i+1}^m A_{ik} X_{kj}
+      Scalar AX;
+      if (i == m - 1) {
+	AX = 0; 
+      } else {
+	Matrix<Scalar,1,1> AXmatrix = A.row(i).tail(m-1-i) * X.col(j).tail(m-1-i);
+	AX = AXmatrix(0,0);
+      }
+
+      // Compute XB = \sum_{k=1}^{j-1} X_{ik} B_{kj}
+      Scalar XB;
+      if (j == 0) {
+	XB = 0; 
+      } else {
+	Matrix<Scalar,1,1> XBmatrix = X.row(i).head(j) * B.col(j).head(j);
+	XB = XBmatrix(0,0);
+      }
+
+      X(i,j) = (C(i,j) - AX - XB) / (A(i,i) + B(j,j));
+    }
+  }
+  return X;
+}
+
+/** \ingroup MatrixFunctions_Module
+  *
+  * \brief Proxy for the matrix function of some matrix (expression).
+  *
+  * \tparam Derived  Type of the argument to the matrix function.
+  *
+  * This class holds the argument to the matrix function until it is
+  * assigned or evaluated for some other reason (so the argument
+  * should not be changed in the meantime). It is the return type of
+  * matrixBase::matrixFunction() and related functions and most of the
+  * time this is the only way it is used.
+  */
+template<typename Derived> class MatrixFunctionReturnValue
+: public ReturnByValue<MatrixFunctionReturnValue<Derived> >
+{
+  public:
+
+    typedef typename Derived::Scalar Scalar;
+    typedef typename Derived::Index Index;
+    typedef typename internal::stem_function<Scalar>::type StemFunction;
+
+   /** \brief Constructor.
+      *
+      * \param[in] A  %Matrix (expression) forming the argument of the
+      * matrix function.
+      * \param[in] f  Stem function for matrix function under consideration.
+      */
+    MatrixFunctionReturnValue(const Derived& A, StemFunction f) : m_A(A), m_f(f) { }
+
+    /** \brief Compute the matrix function.
+      *
+      * \param[out] result \p f applied to \p A, where \p f and \p A
+      * are as in the constructor.
+      */
+    template <typename ResultType>
+    inline void evalTo(ResultType& result) const
+    {
+      typedef typename Derived::PlainObject PlainObject;
+      typedef internal::traits<PlainObject> Traits;
+      static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
+      static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
+      static const int Options = PlainObject::Options;
+      typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
+      typedef Matrix<ComplexScalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+      typedef MatrixFunctionAtomic<DynMatrixType> AtomicType;
+      AtomicType atomic(m_f);
+
+      const PlainObject Aevaluated = m_A.eval();
+      MatrixFunction<PlainObject, AtomicType> mf(Aevaluated, atomic);
+      mf.compute(result);
+    }
+
+    Index rows() const { return m_A.rows(); }
+    Index cols() const { return m_A.cols(); }
+
+  private:
+    typename internal::nested<Derived>::type m_A;
+    StemFunction *m_f;
+
+    MatrixFunctionReturnValue& operator=(const MatrixFunctionReturnValue&);
+};
+
+namespace internal {
+template<typename Derived>
+struct traits<MatrixFunctionReturnValue<Derived> >
+{
+  typedef typename Derived::PlainObject ReturnType;
+};
+}
+
+
+/********** MatrixBase methods **********/
+
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::matrixFunction(typename internal::stem_function<typename internal::traits<Derived>::Scalar>::type f) const
+{
+  eigen_assert(rows() == cols());
+  return MatrixFunctionReturnValue<Derived>(derived(), f);
+}
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sin() const
+{
+  eigen_assert(rows() == cols());
+  typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
+  return MatrixFunctionReturnValue<Derived>(derived(), StdStemFunctions<ComplexScalar>::sin);
+}
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cos() const
+{
+  eigen_assert(rows() == cols());
+  typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
+  return MatrixFunctionReturnValue<Derived>(derived(), StdStemFunctions<ComplexScalar>::cos);
+}
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sinh() const
+{
+  eigen_assert(rows() == cols());
+  typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
+  return MatrixFunctionReturnValue<Derived>(derived(), StdStemFunctions<ComplexScalar>::sinh);
+}
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cosh() const
+{
+  eigen_assert(rows() == cols());
+  typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
+  return MatrixFunctionReturnValue<Derived>(derived(), StdStemFunctions<ComplexScalar>::cosh);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_FUNCTION
diff --git a/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixFunctionAtomic.h b/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixFunctionAtomic.h
new file mode 100644
index 0000000..efe332c
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixFunctionAtomic.h
@@ -0,0 +1,131 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Jitse Niesen <jitse at maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_FUNCTION_ATOMIC
+#define EIGEN_MATRIX_FUNCTION_ATOMIC
+
+namespace Eigen { 
+
+/** \ingroup MatrixFunctions_Module
+  * \class MatrixFunctionAtomic
+  * \brief Helper class for computing matrix functions of atomic matrices.
+  *
+  * \internal
+  * Here, an atomic matrix is a triangular matrix whose diagonal
+  * entries are close to each other.
+  */
+template <typename MatrixType>
+class MatrixFunctionAtomic
+{
+  public:
+
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::Index Index;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef typename internal::stem_function<Scalar>::type StemFunction;
+    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+
+    /** \brief Constructor
+      * \param[in]  f  matrix function to compute.
+      */
+    MatrixFunctionAtomic(StemFunction f) : m_f(f) { }
+
+    /** \brief Compute matrix function of atomic matrix
+      * \param[in]  A  argument of matrix function, should be upper triangular and atomic
+      * \returns  f(A), the matrix function evaluated at the given matrix
+      */
+    MatrixType compute(const MatrixType& A);
+
+  private:
+
+    // Prevent copying
+    MatrixFunctionAtomic(const MatrixFunctionAtomic&);
+    MatrixFunctionAtomic& operator=(const MatrixFunctionAtomic&);
+
+    void computeMu();
+    bool taylorConverged(Index s, const MatrixType& F, const MatrixType& Fincr, const MatrixType& P);
+
+    /** \brief Pointer to scalar function */
+    StemFunction* m_f;
+
+    /** \brief Size of matrix function */
+    Index m_Arows;
+
+    /** \brief Mean of eigenvalues */
+    Scalar m_avgEival;
+
+    /** \brief Argument shifted by mean of eigenvalues */
+    MatrixType m_Ashifted;
+
+    /** \brief Constant used to determine whether Taylor series has converged */
+    RealScalar m_mu;
+};
+
+template <typename MatrixType>
+MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A)
+{
+  // TODO: Use that A is upper triangular
+  m_Arows = A.rows();
+  m_avgEival = A.trace() / Scalar(RealScalar(m_Arows));
+  m_Ashifted = A - m_avgEival * MatrixType::Identity(m_Arows, m_Arows);
+  computeMu();
+  MatrixType F = m_f(m_avgEival, 0) * MatrixType::Identity(m_Arows, m_Arows);
+  MatrixType P = m_Ashifted;
+  MatrixType Fincr;
+  for (Index s = 1; s < 1.1 * m_Arows + 10; s++) { // upper limit is fairly arbitrary
+    Fincr = m_f(m_avgEival, static_cast<int>(s)) * P;
+    F += Fincr;
+    P = Scalar(RealScalar(1.0/(s + 1))) * P * m_Ashifted;
+    if (taylorConverged(s, F, Fincr, P)) {
+      return F;
+    }
+  }
+  eigen_assert("Taylor series does not converge" && 0);
+  return F;
+}
+
+/** \brief Compute \c m_mu. */
+template <typename MatrixType>
+void MatrixFunctionAtomic<MatrixType>::computeMu()
+{
+  const MatrixType N = MatrixType::Identity(m_Arows, m_Arows) - m_Ashifted;
+  VectorType e = VectorType::Ones(m_Arows);
+  N.template triangularView<Upper>().solveInPlace(e);
+  m_mu = e.cwiseAbs().maxCoeff();
+}
+
+/** \brief Determine whether Taylor series has converged */
+template <typename MatrixType>
+bool MatrixFunctionAtomic<MatrixType>::taylorConverged(Index s, const MatrixType& F,
+						       const MatrixType& Fincr, const MatrixType& P)
+{
+  const Index n = F.rows();
+  const RealScalar F_norm = F.cwiseAbs().rowwise().sum().maxCoeff();
+  const RealScalar Fincr_norm = Fincr.cwiseAbs().rowwise().sum().maxCoeff();
+  if (Fincr_norm < NumTraits<Scalar>::epsilon() * F_norm) {
+    RealScalar delta = 0;
+    RealScalar rfactorial = 1;
+    for (Index r = 0; r < n; r++) {
+      RealScalar mx = 0;
+      for (Index i = 0; i < n; i++)
+        mx = (std::max)(mx, std::abs(m_f(m_Ashifted(i, i) + m_avgEival, static_cast<int>(s+r))));
+      if (r != 0)
+        rfactorial *= RealScalar(r);
+      delta = (std::max)(delta, mx / rfactorial);
+    }
+    const RealScalar P_norm = P.cwiseAbs().rowwise().sum().maxCoeff();
+    if (m_mu * delta * P_norm < NumTraits<Scalar>::epsilon() * F_norm)
+      return true;
+  }
+  return false;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_FUNCTION_ATOMIC
diff --git a/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
new file mode 100644
index 0000000..c744fc0
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -0,0 +1,486 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Jitse Niesen <jitse at maths.leeds.ac.uk>
+// Copyright (C) 2011 Chen-Pang He <jdh8 at ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_LOGARITHM
+#define EIGEN_MATRIX_LOGARITHM
+
+#ifndef M_PI
+#define M_PI 3.141592653589793238462643383279503L
+#endif
+
+namespace Eigen { 
+
+/** \ingroup MatrixFunctions_Module
+  * \class MatrixLogarithmAtomic
+  * \brief Helper class for computing matrix logarithm of atomic matrices.
+  *
+  * \internal
+  * Here, an atomic matrix is a triangular matrix whose diagonal
+  * entries are close to each other.
+  *
+  * \sa class MatrixFunctionAtomic, MatrixBase::log()
+  */
+template <typename MatrixType>
+class MatrixLogarithmAtomic
+{
+public:
+
+  typedef typename MatrixType::Scalar Scalar;
+  // typedef typename MatrixType::Index Index;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  // typedef typename internal::stem_function<Scalar>::type StemFunction;
+  // typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+
+  /** \brief Constructor. */
+  MatrixLogarithmAtomic() { }
+
+  /** \brief Compute matrix logarithm of atomic matrix
+    * \param[in]  A  argument of matrix logarithm, should be upper triangular and atomic
+    * \returns  The logarithm of \p A.
+    */
+  MatrixType compute(const MatrixType& A);
+
+private:
+
+  void compute2x2(const MatrixType& A, MatrixType& result);
+  void computeBig(const MatrixType& A, MatrixType& result);
+  int getPadeDegree(float normTminusI);
+  int getPadeDegree(double normTminusI);
+  int getPadeDegree(long double normTminusI);
+  void computePade(MatrixType& result, const MatrixType& T, int degree);
+  void computePade3(MatrixType& result, const MatrixType& T);
+  void computePade4(MatrixType& result, const MatrixType& T);
+  void computePade5(MatrixType& result, const MatrixType& T);
+  void computePade6(MatrixType& result, const MatrixType& T);
+  void computePade7(MatrixType& result, const MatrixType& T);
+  void computePade8(MatrixType& result, const MatrixType& T);
+  void computePade9(MatrixType& result, const MatrixType& T);
+  void computePade10(MatrixType& result, const MatrixType& T);
+  void computePade11(MatrixType& result, const MatrixType& T);
+
+  static const int minPadeDegree = 3;
+  static const int maxPadeDegree = std::numeric_limits<RealScalar>::digits<= 24?  5:  // single precision
+                                   std::numeric_limits<RealScalar>::digits<= 53?  7:  // double precision
+                                   std::numeric_limits<RealScalar>::digits<= 64?  8:  // extended precision
+                                   std::numeric_limits<RealScalar>::digits<=106? 10:  // double-double
+                                                                                 11;  // quadruple precision
+
+  // Prevent copying
+  MatrixLogarithmAtomic(const MatrixLogarithmAtomic&);
+  MatrixLogarithmAtomic& operator=(const MatrixLogarithmAtomic&);
+};
+
+/** \brief Compute logarithm of triangular matrix with clustered eigenvalues. */
+template <typename MatrixType>
+MatrixType MatrixLogarithmAtomic<MatrixType>::compute(const MatrixType& A)
+{
+  using std::log;
+  MatrixType result(A.rows(), A.rows());
+  if (A.rows() == 1)
+    result(0,0) = log(A(0,0));
+  else if (A.rows() == 2)
+    compute2x2(A, result);
+  else
+    computeBig(A, result);
+  return result;
+}
+
+/** \brief Compute logarithm of 2x2 triangular matrix. */
+template <typename MatrixType>
+void MatrixLogarithmAtomic<MatrixType>::compute2x2(const MatrixType& A, MatrixType& result)
+{
+  using std::abs;
+  using std::ceil;
+  using std::imag;
+  using std::log;
+
+  Scalar logA00 = log(A(0,0));
+  Scalar logA11 = log(A(1,1));
+
+  result(0,0) = logA00;
+  result(1,0) = Scalar(0);
+  result(1,1) = logA11;
+
+  if (A(0,0) == A(1,1)) {
+    result(0,1) = A(0,1) / A(0,0);
+  } else if ((abs(A(0,0)) < 0.5*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1)))) {
+    result(0,1) = A(0,1) * (logA11 - logA00) / (A(1,1) - A(0,0));
+  } else {
+    // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
+    int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - M_PI) / (2*M_PI)));
+    Scalar y = A(1,1) - A(0,0), x = A(1,1) + A(0,0);
+    result(0,1) = A(0,1) * (Scalar(2) * numext::atanh2(y,x) + Scalar(0,2*M_PI*unwindingNumber)) / y;
+  }
+}
+
+/** \brief Compute logarithm of triangular matrices with size > 2. 
+  * \details This uses a inverse scale-and-square algorithm. */
+template <typename MatrixType>
+void MatrixLogarithmAtomic<MatrixType>::computeBig(const MatrixType& A, MatrixType& result)
+{
+  using std::pow;
+  int numberOfSquareRoots = 0;
+  int numberOfExtraSquareRoots = 0;
+  int degree;
+  MatrixType T = A, sqrtT;
+  const RealScalar maxNormForPade = maxPadeDegree<= 5? 5.3149729967117310e-1:                     // single precision
+                                    maxPadeDegree<= 7? 2.6429608311114350e-1:                     // double precision
+                                    maxPadeDegree<= 8? 2.32777776523703892094e-1L:                // extended precision
+                                    maxPadeDegree<=10? 1.05026503471351080481093652651105e-1L:    // double-double
+                                                       1.1880960220216759245467951592883642e-1L;  // quadruple precision
+
+  while (true) {
+    RealScalar normTminusI = (T - MatrixType::Identity(T.rows(), T.rows())).cwiseAbs().colwise().sum().maxCoeff();
+    if (normTminusI < maxNormForPade) {
+      degree = getPadeDegree(normTminusI);
+      int degree2 = getPadeDegree(normTminusI / RealScalar(2));
+      if ((degree - degree2 <= 1) || (numberOfExtraSquareRoots == 1)) 
+        break;
+      ++numberOfExtraSquareRoots;
+    }
+    MatrixSquareRootTriangular<MatrixType>(T).compute(sqrtT);
+    T = sqrtT.template triangularView<Upper>();
+    ++numberOfSquareRoots;
+  }
+
+  computePade(result, T, degree);
+  result *= pow(RealScalar(2), numberOfSquareRoots);
+}
+
+/* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = float) */
+template <typename MatrixType>
+int MatrixLogarithmAtomic<MatrixType>::getPadeDegree(float normTminusI)
+{
+  const float maxNormForPade[] = { 2.5111573934555054e-1 /* degree = 3 */ , 4.0535837411880493e-1,
+            5.3149729967117310e-1 };
+  int degree = 3;
+  for (; degree <= maxPadeDegree; ++degree) 
+    if (normTminusI <= maxNormForPade[degree - minPadeDegree])
+      break;
+  return degree;
+}
+
+/* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = double) */
+template <typename MatrixType>
+int MatrixLogarithmAtomic<MatrixType>::getPadeDegree(double normTminusI)
+{
+  const double maxNormForPade[] = { 1.6206284795015624e-2 /* degree = 3 */ , 5.3873532631381171e-2,
+            1.1352802267628681e-1, 1.8662860613541288e-1, 2.642960831111435e-1 };
+  int degree = 3;
+  for (; degree <= maxPadeDegree; ++degree)
+    if (normTminusI <= maxNormForPade[degree - minPadeDegree])
+      break;
+  return degree;
+}
+
+/* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = long double) */
+template <typename MatrixType>
+int MatrixLogarithmAtomic<MatrixType>::getPadeDegree(long double normTminusI)
+{
+#if   LDBL_MANT_DIG == 53         // double precision
+  const long double maxNormForPade[] = { 1.6206284795015624e-2L /* degree = 3 */ , 5.3873532631381171e-2L,
+            1.1352802267628681e-1L, 1.8662860613541288e-1L, 2.642960831111435e-1L };
+#elif LDBL_MANT_DIG <= 64         // extended precision
+  const long double maxNormForPade[] = { 5.48256690357782863103e-3L /* degree = 3 */, 2.34559162387971167321e-2L,
+            5.84603923897347449857e-2L, 1.08486423756725170223e-1L, 1.68385767881294446649e-1L,
+            2.32777776523703892094e-1L };
+#elif LDBL_MANT_DIG <= 106        // double-double
+  const long double maxNormForPade[] = { 8.58970550342939562202529664318890e-5L /* degree = 3 */,
+            9.34074328446359654039446552677759e-4L, 4.26117194647672175773064114582860e-3L,
+            1.21546224740281848743149666560464e-2L, 2.61100544998339436713088248557444e-2L,
+            4.66170074627052749243018566390567e-2L, 7.32585144444135027565872014932387e-2L,
+            1.05026503471351080481093652651105e-1L };
+#else                             // quadruple precision
+  const long double maxNormForPade[] = { 4.7419931187193005048501568167858103e-5L /* degree = 3 */,
+            5.8853168473544560470387769480192666e-4L, 2.9216120366601315391789493628113520e-3L,
+            8.8415758124319434347116734705174308e-3L, 1.9850836029449446668518049562565291e-2L,
+            3.6688019729653446926585242192447447e-2L, 5.9290962294020186998954055264528393e-2L,
+            8.6998436081634343903250580992127677e-2L, 1.1880960220216759245467951592883642e-1L };
+#endif
+  int degree = 3;
+  for (; degree <= maxPadeDegree; ++degree)
+    if (normTminusI <= maxNormForPade[degree - minPadeDegree])
+      break;
+  return degree;
+}
+
+/* \brief Compute Pade approximation to matrix logarithm */
+template <typename MatrixType>
+void MatrixLogarithmAtomic<MatrixType>::computePade(MatrixType& result, const MatrixType& T, int degree)
+{
+  switch (degree) {
+    case 3:  computePade3(result, T);  break;
+    case 4:  computePade4(result, T);  break;
+    case 5:  computePade5(result, T);  break;
+    case 6:  computePade6(result, T);  break;
+    case 7:  computePade7(result, T);  break;
+    case 8:  computePade8(result, T);  break;
+    case 9:  computePade9(result, T);  break;
+    case 10: computePade10(result, T); break;
+    case 11: computePade11(result, T); break;
+    default: assert(false); // should never happen
+  }
+} 
+
+template <typename MatrixType>
+void MatrixLogarithmAtomic<MatrixType>::computePade3(MatrixType& result, const MatrixType& T)
+{
+  const int degree = 3;
+  const RealScalar nodes[]   = { 0.1127016653792583114820734600217600L, 0.5000000000000000000000000000000000L,
+            0.8872983346207416885179265399782400L };
+  const RealScalar weights[] = { 0.2777777777777777777777777777777778L, 0.4444444444444444444444444444444444L,
+            0.2777777777777777777777777777777778L };
+  eigen_assert(degree <= maxPadeDegree);
+  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
+  result.setZero(T.rows(), T.rows());
+  for (int k = 0; k < degree; ++k)
+    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
+                           .template triangularView<Upper>().solve(TminusI);
+}
+
+template <typename MatrixType>
+void MatrixLogarithmAtomic<MatrixType>::computePade4(MatrixType& result, const MatrixType& T)
+{
+  const int degree = 4;
+  const RealScalar nodes[]   = { 0.0694318442029737123880267555535953L, 0.3300094782075718675986671204483777L,
+            0.6699905217924281324013328795516223L, 0.9305681557970262876119732444464048L };
+  const RealScalar weights[] = { 0.1739274225687269286865319746109997L, 0.3260725774312730713134680253890003L,
+            0.3260725774312730713134680253890003L, 0.1739274225687269286865319746109997L };
+  eigen_assert(degree <= maxPadeDegree);
+  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
+  result.setZero(T.rows(), T.rows());
+  for (int k = 0; k < degree; ++k)
+    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
+                           .template triangularView<Upper>().solve(TminusI);
+}
+
+template <typename MatrixType>
+void MatrixLogarithmAtomic<MatrixType>::computePade5(MatrixType& result, const MatrixType& T)
+{
+  const int degree = 5;
+  const RealScalar nodes[]   = { 0.0469100770306680036011865608503035L, 0.2307653449471584544818427896498956L,
+            0.5000000000000000000000000000000000L, 0.7692346550528415455181572103501044L,
+            0.9530899229693319963988134391496965L };
+  const RealScalar weights[] = { 0.1184634425280945437571320203599587L, 0.2393143352496832340206457574178191L,
+            0.2844444444444444444444444444444444L, 0.2393143352496832340206457574178191L,
+            0.1184634425280945437571320203599587L };
+  eigen_assert(degree <= maxPadeDegree);
+  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
+  result.setZero(T.rows(), T.rows());
+  for (int k = 0; k < degree; ++k)
+    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
+                           .template triangularView<Upper>().solve(TminusI);
+}
+
+template <typename MatrixType>
+void MatrixLogarithmAtomic<MatrixType>::computePade6(MatrixType& result, const MatrixType& T)
+{
+  const int degree = 6;
+  const RealScalar nodes[]   = { 0.0337652428984239860938492227530027L, 0.1693953067668677431693002024900473L,
+            0.3806904069584015456847491391596440L, 0.6193095930415984543152508608403560L,
+            0.8306046932331322568306997975099527L, 0.9662347571015760139061507772469973L };
+  const RealScalar weights[] = { 0.0856622461895851725201480710863665L, 0.1803807865240693037849167569188581L,
+            0.2339569672863455236949351719947755L, 0.2339569672863455236949351719947755L,
+            0.1803807865240693037849167569188581L, 0.0856622461895851725201480710863665L };
+  eigen_assert(degree <= maxPadeDegree);
+  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
+  result.setZero(T.rows(), T.rows());
+  for (int k = 0; k < degree; ++k)
+    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
+                           .template triangularView<Upper>().solve(TminusI);
+}
+
+template <typename MatrixType>
+void MatrixLogarithmAtomic<MatrixType>::computePade7(MatrixType& result, const MatrixType& T)
+{
+  const int degree = 7;
+  const RealScalar nodes[]   = { 0.0254460438286207377369051579760744L, 0.1292344072003027800680676133596058L,
+            0.2970774243113014165466967939615193L, 0.5000000000000000000000000000000000L,
+            0.7029225756886985834533032060384807L, 0.8707655927996972199319323866403942L,
+            0.9745539561713792622630948420239256L };
+  const RealScalar weights[] = { 0.0647424830844348466353057163395410L, 0.1398526957446383339507338857118898L,
+            0.1909150252525594724751848877444876L, 0.2089795918367346938775510204081633L,
+            0.1909150252525594724751848877444876L, 0.1398526957446383339507338857118898L,
+            0.0647424830844348466353057163395410L };
+  eigen_assert(degree <= maxPadeDegree);
+  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
+  result.setZero(T.rows(), T.rows());
+  for (int k = 0; k < degree; ++k)
+    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
+                           .template triangularView<Upper>().solve(TminusI);
+}
+
+template <typename MatrixType>
+void MatrixLogarithmAtomic<MatrixType>::computePade8(MatrixType& result, const MatrixType& T)
+{
+  const int degree = 8;
+  const RealScalar nodes[]   = { 0.0198550717512318841582195657152635L, 0.1016667612931866302042230317620848L,
+            0.2372337950418355070911304754053768L, 0.4082826787521750975302619288199080L,
+            0.5917173212478249024697380711800920L, 0.7627662049581644929088695245946232L,
+            0.8983332387068133697957769682379152L, 0.9801449282487681158417804342847365L };
+  const RealScalar weights[] = { 0.0506142681451881295762656771549811L, 0.1111905172266872352721779972131204L,
+            0.1568533229389436436689811009933007L, 0.1813418916891809914825752246385978L,
+            0.1813418916891809914825752246385978L, 0.1568533229389436436689811009933007L,
+            0.1111905172266872352721779972131204L, 0.0506142681451881295762656771549811L };
+  eigen_assert(degree <= maxPadeDegree);
+  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
+  result.setZero(T.rows(), T.rows());
+  for (int k = 0; k < degree; ++k)
+    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
+                           .template triangularView<Upper>().solve(TminusI);
+}
+
+template <typename MatrixType>
+void MatrixLogarithmAtomic<MatrixType>::computePade9(MatrixType& result, const MatrixType& T)
+{
+  const int degree = 9;
+  const RealScalar nodes[]   = { 0.0159198802461869550822118985481636L, 0.0819844463366821028502851059651326L,
+            0.1933142836497048013456489803292629L, 0.3378732882980955354807309926783317L,
+            0.5000000000000000000000000000000000L, 0.6621267117019044645192690073216683L,
+            0.8066857163502951986543510196707371L, 0.9180155536633178971497148940348674L,
+            0.9840801197538130449177881014518364L };
+  const RealScalar weights[] = { 0.0406371941807872059859460790552618L, 0.0903240803474287020292360156214564L,
+            0.1303053482014677311593714347093164L, 0.1561735385200014200343152032922218L,
+            0.1651196775006298815822625346434870L, 0.1561735385200014200343152032922218L,
+            0.1303053482014677311593714347093164L, 0.0903240803474287020292360156214564L,
+            0.0406371941807872059859460790552618L };
+  eigen_assert(degree <= maxPadeDegree);
+  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
+  result.setZero(T.rows(), T.rows());
+  for (int k = 0; k < degree; ++k)
+    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
+                           .template triangularView<Upper>().solve(TminusI);
+}
+
+template <typename MatrixType>
+void MatrixLogarithmAtomic<MatrixType>::computePade10(MatrixType& result, const MatrixType& T)
+{
+  const int degree = 10;
+  const RealScalar nodes[]   = { 0.0130467357414141399610179939577740L, 0.0674683166555077446339516557882535L,
+            0.1602952158504877968828363174425632L, 0.2833023029353764046003670284171079L,
+            0.4255628305091843945575869994351400L, 0.5744371694908156054424130005648600L,
+            0.7166976970646235953996329715828921L, 0.8397047841495122031171636825574368L,
+            0.9325316833444922553660483442117465L, 0.9869532642585858600389820060422260L };
+  const RealScalar weights[] = { 0.0333356721543440687967844049466659L, 0.0747256745752902965728881698288487L,
+            0.1095431812579910219977674671140816L, 0.1346333596549981775456134607847347L,
+            0.1477621123573764350869464973256692L, 0.1477621123573764350869464973256692L,
+            0.1346333596549981775456134607847347L, 0.1095431812579910219977674671140816L,
+            0.0747256745752902965728881698288487L, 0.0333356721543440687967844049466659L };
+  eigen_assert(degree <= maxPadeDegree);
+  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
+  result.setZero(T.rows(), T.rows());
+  for (int k = 0; k < degree; ++k)
+    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
+                           .template triangularView<Upper>().solve(TminusI);
+}
+
+template <typename MatrixType>
+void MatrixLogarithmAtomic<MatrixType>::computePade11(MatrixType& result, const MatrixType& T)
+{
+  const int degree = 11;
+  const RealScalar nodes[]   = { 0.0108856709269715035980309994385713L, 0.0564687001159523504624211153480364L,
+            0.1349239972129753379532918739844233L, 0.2404519353965940920371371652706952L,
+            0.3652284220238275138342340072995692L, 0.5000000000000000000000000000000000L,
+            0.6347715779761724861657659927004308L, 0.7595480646034059079628628347293048L,
+            0.8650760027870246620467081260155767L, 0.9435312998840476495375788846519636L,
+            0.9891143290730284964019690005614287L };
+  const RealScalar weights[] = { 0.0278342835580868332413768602212743L, 0.0627901847324523123173471496119701L,
+            0.0931451054638671257130488207158280L, 0.1165968822959952399592618524215876L,
+            0.1314022722551233310903444349452546L, 0.1364625433889503153572417641681711L,
+            0.1314022722551233310903444349452546L, 0.1165968822959952399592618524215876L,
+            0.0931451054638671257130488207158280L, 0.0627901847324523123173471496119701L,
+            0.0278342835580868332413768602212743L };
+  eigen_assert(degree <= maxPadeDegree);
+  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
+  result.setZero(T.rows(), T.rows());
+  for (int k = 0; k < degree; ++k)
+    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
+                           .template triangularView<Upper>().solve(TminusI);
+}
+
+/** \ingroup MatrixFunctions_Module
+  *
+  * \brief Proxy for the matrix logarithm of some matrix (expression).
+  *
+  * \tparam Derived  Type of the argument to the matrix function.
+  *
+  * This class holds the argument to the matrix function until it is
+  * assigned or evaluated for some other reason (so the argument
+  * should not be changed in the meantime). It is the return type of
+  * MatrixBase::log() and most of the time this is the only way it
+  * is used.
+  */
+template<typename Derived> class MatrixLogarithmReturnValue
+: public ReturnByValue<MatrixLogarithmReturnValue<Derived> >
+{
+public:
+
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Index Index;
+
+  /** \brief Constructor.
+    *
+    * \param[in]  A  %Matrix (expression) forming the argument of the matrix logarithm.
+    */
+  MatrixLogarithmReturnValue(const Derived& A) : m_A(A) { }
+  
+  /** \brief Compute the matrix logarithm.
+    *
+    * \param[out]  result  Logarithm of \p A, where \A is as specified in the constructor.
+    */
+  template <typename ResultType>
+  inline void evalTo(ResultType& result) const
+  {
+    typedef typename Derived::PlainObject PlainObject;
+    typedef internal::traits<PlainObject> Traits;
+    static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
+    static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
+    static const int Options = PlainObject::Options;
+    typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
+    typedef Matrix<ComplexScalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+    typedef MatrixLogarithmAtomic<DynMatrixType> AtomicType;
+    AtomicType atomic;
+    
+    const PlainObject Aevaluated = m_A.eval();
+    MatrixFunction<PlainObject, AtomicType> mf(Aevaluated, atomic);
+    mf.compute(result);
+  }
+
+  Index rows() const { return m_A.rows(); }
+  Index cols() const { return m_A.cols(); }
+  
+private:
+  typename internal::nested<Derived>::type m_A;
+  
+  MatrixLogarithmReturnValue& operator=(const MatrixLogarithmReturnValue&);
+};
+
+namespace internal {
+  template<typename Derived>
+  struct traits<MatrixLogarithmReturnValue<Derived> >
+  {
+    typedef typename Derived::PlainObject ReturnType;
+  };
+}
+
+
+/********** MatrixBase method **********/
+
+
+template <typename Derived>
+const MatrixLogarithmReturnValue<Derived> MatrixBase<Derived>::log() const
+{
+  eigen_assert(rows() == cols());
+  return MatrixLogarithmReturnValue<Derived>(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_LOGARITHM
diff --git a/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
new file mode 100644
index 0000000..78a307e
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -0,0 +1,508 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012, 2013 Chen-Pang He <jdh8 at ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_POWER
+#define EIGEN_MATRIX_POWER
+
+namespace Eigen {
+
+template<typename MatrixType> class MatrixPower;
+
+template<typename MatrixType>
+class MatrixPowerRetval : public ReturnByValue< MatrixPowerRetval<MatrixType> >
+{
+  public:
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+
+    MatrixPowerRetval(MatrixPower<MatrixType>& pow, RealScalar p) : m_pow(pow), m_p(p)
+    { }
+
+    template<typename ResultType>
+    inline void evalTo(ResultType& res) const
+    { m_pow.compute(res, m_p); }
+
+    Index rows() const { return m_pow.rows(); }
+    Index cols() const { return m_pow.cols(); }
+
+  private:
+    MatrixPower<MatrixType>& m_pow;
+    const RealScalar m_p;
+    MatrixPowerRetval& operator=(const MatrixPowerRetval&);
+};
+
+template<typename MatrixType>
+class MatrixPowerAtomic
+{
+  private:
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime
+    };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef std::complex<RealScalar> ComplexScalar;
+    typedef typename MatrixType::Index Index;
+    typedef Array<Scalar, RowsAtCompileTime, 1, ColMajor, MaxRowsAtCompileTime> ArrayType;
+
+    const MatrixType& m_A;
+    RealScalar m_p;
+
+    void computePade(int degree, const MatrixType& IminusT, MatrixType& res) const;
+    void compute2x2(MatrixType& res, RealScalar p) const;
+    void computeBig(MatrixType& res) const;
+    static int getPadeDegree(float normIminusT);
+    static int getPadeDegree(double normIminusT);
+    static int getPadeDegree(long double normIminusT);
+    static ComplexScalar computeSuperDiag(const ComplexScalar&, const ComplexScalar&, RealScalar p);
+    static RealScalar computeSuperDiag(RealScalar, RealScalar, RealScalar p);
+
+  public:
+    MatrixPowerAtomic(const MatrixType& T, RealScalar p);
+    void compute(MatrixType& res) const;
+};
+
+template<typename MatrixType>
+MatrixPowerAtomic<MatrixType>::MatrixPowerAtomic(const MatrixType& T, RealScalar p) :
+  m_A(T), m_p(p)
+{ eigen_assert(T.rows() == T.cols()); }
+
+template<typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::compute(MatrixType& res) const
+{
+  res.resizeLike(m_A);
+  switch (m_A.rows()) {
+    case 0:
+      break;
+    case 1:
+      res(0,0) = std::pow(m_A(0,0), m_p);
+      break;
+    case 2:
+      compute2x2(res, m_p);
+      break;
+    default:
+      computeBig(res);
+  }
+}
+
+template<typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::computePade(int degree, const MatrixType& IminusT, MatrixType& res) const
+{
+  int i = degree<<1;
+  res = (m_p-degree) / ((i-1)<<1) * IminusT;
+  for (--i; i; --i) {
+    res = (MatrixType::Identity(IminusT.rows(), IminusT.cols()) + res).template triangularView<Upper>()
+	.solve((i==1 ? -m_p : i&1 ? (-m_p-(i>>1))/(i<<1) : (m_p-(i>>1))/((i-1)<<1)) * IminusT).eval();
+  }
+  res += MatrixType::Identity(IminusT.rows(), IminusT.cols());
+}
+
+// This function assumes that res has the correct size (see bug 614)
+template<typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::compute2x2(MatrixType& res, RealScalar p) const
+{
+  using std::abs;
+  using std::pow;
+  
+  res.coeffRef(0,0) = pow(m_A.coeff(0,0), p);
+
+  for (Index i=1; i < m_A.cols(); ++i) {
+    res.coeffRef(i,i) = pow(m_A.coeff(i,i), p);
+    if (m_A.coeff(i-1,i-1) == m_A.coeff(i,i))
+      res.coeffRef(i-1,i) = p * pow(m_A.coeff(i,i), p-1);
+    else if (2*abs(m_A.coeff(i-1,i-1)) < abs(m_A.coeff(i,i)) || 2*abs(m_A.coeff(i,i)) < abs(m_A.coeff(i-1,i-1)))
+      res.coeffRef(i-1,i) = (res.coeff(i,i)-res.coeff(i-1,i-1)) / (m_A.coeff(i,i)-m_A.coeff(i-1,i-1));
+    else
+      res.coeffRef(i-1,i) = computeSuperDiag(m_A.coeff(i,i), m_A.coeff(i-1,i-1), p);
+    res.coeffRef(i-1,i) *= m_A.coeff(i-1,i);
+  }
+}
+
+template<typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::computeBig(MatrixType& res) const
+{
+  const int digits = std::numeric_limits<RealScalar>::digits;
+  const RealScalar maxNormForPade = digits <=  24? 4.3386528e-1f:                           // sigle precision
+				    digits <=  53? 2.789358995219730e-1:                    // double precision
+				    digits <=  64? 2.4471944416607995472e-1L:               // extended precision
+				    digits <= 106? 1.1016843812851143391275867258512e-1L:   // double-double
+						   9.134603732914548552537150753385375e-2L; // quadruple precision
+  MatrixType IminusT, sqrtT, T = m_A.template triangularView<Upper>();
+  RealScalar normIminusT;
+  int degree, degree2, numberOfSquareRoots = 0;
+  bool hasExtraSquareRoot = false;
+
+  /* FIXME
+   * For singular T, norm(I - T) >= 1 but maxNormForPade < 1, leads to infinite
+   * loop.  We should move 0 eigenvalues to bottom right corner.  We need not
+   * worry about tiny values (e.g. 1e-300) because they will reach 1 if
+   * repetitively sqrt'ed.
+   *
+   * If the 0 eigenvalues are semisimple, they can form a 0 matrix at the
+   * bottom right corner.
+   *
+   * [ T  A ]^p   [ T^p  (T^-1 T^p A) ]
+   * [      ]   = [                   ]
+   * [ 0  0 ]     [  0         0      ]
+   */
+  for (Index i=0; i < m_A.cols(); ++i)
+    eigen_assert(m_A(i,i) != RealScalar(0));
+
+  while (true) {
+    IminusT = MatrixType::Identity(m_A.rows(), m_A.cols()) - T;
+    normIminusT = IminusT.cwiseAbs().colwise().sum().maxCoeff();
+    if (normIminusT < maxNormForPade) {
+      degree = getPadeDegree(normIminusT);
+      degree2 = getPadeDegree(normIminusT/2);
+      if (degree - degree2 <= 1 || hasExtraSquareRoot)
+	break;
+      hasExtraSquareRoot = true;
+    }
+    MatrixSquareRootTriangular<MatrixType>(T).compute(sqrtT);
+    T = sqrtT.template triangularView<Upper>();
+    ++numberOfSquareRoots;
+  }
+  computePade(degree, IminusT, res);
+
+  for (; numberOfSquareRoots; --numberOfSquareRoots) {
+    compute2x2(res, std::ldexp(m_p, -numberOfSquareRoots));
+    res = res.template triangularView<Upper>() * res;
+  }
+  compute2x2(res, m_p);
+}
+  
+template<typename MatrixType>
+inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(float normIminusT)
+{
+  const float maxNormForPade[] = { 2.8064004e-1f /* degree = 3 */ , 4.3386528e-1f };
+  int degree = 3;
+  for (; degree <= 4; ++degree)
+    if (normIminusT <= maxNormForPade[degree - 3])
+      break;
+  return degree;
+}
+
+template<typename MatrixType>
+inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(double normIminusT)
+{
+  const double maxNormForPade[] = { 1.884160592658218e-2 /* degree = 3 */ , 6.038881904059573e-2, 1.239917516308172e-1,
+      1.999045567181744e-1, 2.789358995219730e-1 };
+  int degree = 3;
+  for (; degree <= 7; ++degree)
+    if (normIminusT <= maxNormForPade[degree - 3])
+      break;
+  return degree;
+}
+
+template<typename MatrixType>
+inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(long double normIminusT)
+{
+#if   LDBL_MANT_DIG == 53
+  const int maxPadeDegree = 7;
+  const double maxNormForPade[] = { 1.884160592658218e-2L /* degree = 3 */ , 6.038881904059573e-2L, 1.239917516308172e-1L,
+      1.999045567181744e-1L, 2.789358995219730e-1L };
+#elif LDBL_MANT_DIG <= 64
+  const int maxPadeDegree = 8;
+  const double maxNormForPade[] = { 6.3854693117491799460e-3L /* degree = 3 */ , 2.6394893435456973676e-2L,
+      6.4216043030404063729e-2L, 1.1701165502926694307e-1L, 1.7904284231268670284e-1L, 2.4471944416607995472e-1L };
+#elif LDBL_MANT_DIG <= 106
+  const int maxPadeDegree = 10;
+  const double maxNormForPade[] = { 1.0007161601787493236741409687186e-4L /* degree = 3 */ ,
+      1.0007161601787493236741409687186e-3L, 4.7069769360887572939882574746264e-3L, 1.3220386624169159689406653101695e-2L,
+      2.8063482381631737920612944054906e-2L, 4.9625993951953473052385361085058e-2L, 7.7367040706027886224557538328171e-2L,
+      1.1016843812851143391275867258512e-1L };
+#else
+  const int maxPadeDegree = 10;
+  const double maxNormForPade[] = { 5.524506147036624377378713555116378e-5L /* degree = 3 */ ,
+      6.640600568157479679823602193345995e-4L, 3.227716520106894279249709728084626e-3L,
+      9.619593944683432960546978734646284e-3L, 2.134595382433742403911124458161147e-2L,
+      3.908166513900489428442993794761185e-2L, 6.266780814639442865832535460550138e-2L,
+      9.134603732914548552537150753385375e-2L };
+#endif
+  int degree = 3;
+  for (; degree <= maxPadeDegree; ++degree)
+    if (normIminusT <= maxNormForPade[degree - 3])
+      break;
+  return degree;
+}
+
+template<typename MatrixType>
+inline typename MatrixPowerAtomic<MatrixType>::ComplexScalar
+MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const ComplexScalar& prev, RealScalar p)
+{
+  ComplexScalar logCurr = std::log(curr);
+  ComplexScalar logPrev = std::log(prev);
+  int unwindingNumber = std::ceil((numext::imag(logCurr - logPrev) - M_PI) / (2*M_PI));
+  ComplexScalar w = numext::atanh2(curr - prev, curr + prev) + ComplexScalar(0, M_PI*unwindingNumber);
+  return RealScalar(2) * std::exp(RealScalar(0.5) * p * (logCurr + logPrev)) * std::sinh(p * w) / (curr - prev);
+}
+
+template<typename MatrixType>
+inline typename MatrixPowerAtomic<MatrixType>::RealScalar
+MatrixPowerAtomic<MatrixType>::computeSuperDiag(RealScalar curr, RealScalar prev, RealScalar p)
+{
+  RealScalar w = numext::atanh2(curr - prev, curr + prev);
+  return 2 * std::exp(p * (std::log(curr) + std::log(prev)) / 2) * std::sinh(p * w) / (curr - prev);
+}
+
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Class for computing matrix powers.
+ *
+ * \tparam MatrixType  type of the base, expected to be an instantiation
+ * of the Matrix class template.
+ *
+ * This class is capable of computing real/complex matrices raised to
+ * an arbitrary real power. Meanwhile, it saves the result of Schur
+ * decomposition if an non-integral power has even been calculated.
+ * Therefore, if you want to compute multiple (>= 2) matrix powers
+ * for the same matrix, using the class directly is more efficient than
+ * calling MatrixBase::pow().
+ *
+ * Example:
+ * \include MatrixPower_optimal.cpp
+ * Output: \verbinclude MatrixPower_optimal.out
+ */
+template<typename MatrixType>
+class MatrixPower
+{
+  private:
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
+
+  public:
+    /**
+     * \brief Constructor.
+     *
+     * \param[in] A  the base of the matrix power.
+     *
+     * The class stores a reference to A, so it should not be changed
+     * (or destroyed) before evaluation.
+     */
+    explicit MatrixPower(const MatrixType& A) : m_A(A), m_conditionNumber(0)
+    { eigen_assert(A.rows() == A.cols()); }
+
+    /**
+     * \brief Returns the matrix power.
+     *
+     * \param[in] p  exponent, a real scalar.
+     * \return The expression \f$ A^p \f$, where A is specified in the
+     * constructor.
+     */
+    const MatrixPowerRetval<MatrixType> operator()(RealScalar p)
+    { return MatrixPowerRetval<MatrixType>(*this, p); }
+
+    /**
+     * \brief Compute the matrix power.
+     *
+     * \param[in]  p    exponent, a real scalar.
+     * \param[out] res  \f$ A^p \f$ where A is specified in the
+     * constructor.
+     */
+    template<typename ResultType>
+    void compute(ResultType& res, RealScalar p);
+    
+    Index rows() const { return m_A.rows(); }
+    Index cols() const { return m_A.cols(); }
+
+  private:
+    typedef std::complex<RealScalar> ComplexScalar;
+    typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, MatrixType::Options,
+              MaxRowsAtCompileTime, MaxColsAtCompileTime> ComplexMatrix;
+
+    typename MatrixType::Nested m_A;
+    MatrixType m_tmp;
+    ComplexMatrix m_T, m_U, m_fT;
+    RealScalar m_conditionNumber;
+
+    RealScalar modfAndInit(RealScalar, RealScalar*);
+
+    template<typename ResultType>
+    void computeIntPower(ResultType&, RealScalar);
+
+    template<typename ResultType>
+    void computeFracPower(ResultType&, RealScalar);
+
+    template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+    static void revertSchur(
+        Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
+        const ComplexMatrix& T,
+        const ComplexMatrix& U);
+
+    template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+    static void revertSchur(
+        Matrix<RealScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
+        const ComplexMatrix& T,
+        const ComplexMatrix& U);
+};
+
+template<typename MatrixType>
+template<typename ResultType>
+void MatrixPower<MatrixType>::compute(ResultType& res, RealScalar p)
+{
+  switch (cols()) {
+    case 0:
+      break;
+    case 1:
+      res(0,0) = std::pow(m_A.coeff(0,0), p);
+      break;
+    default:
+      RealScalar intpart, x = modfAndInit(p, &intpart);
+      computeIntPower(res, intpart);
+      computeFracPower(res, x);
+  }
+}
+
+template<typename MatrixType>
+typename MatrixPower<MatrixType>::RealScalar
+MatrixPower<MatrixType>::modfAndInit(RealScalar x, RealScalar* intpart)
+{
+  typedef Array<RealScalar, RowsAtCompileTime, 1, ColMajor, MaxRowsAtCompileTime> RealArray;
+
+  *intpart = std::floor(x);
+  RealScalar res = x - *intpart;
+
+  if (!m_conditionNumber && res) {
+    const ComplexSchur<MatrixType> schurOfA(m_A);
+    m_T = schurOfA.matrixT();
+    m_U = schurOfA.matrixU();
+    
+    const RealArray absTdiag = m_T.diagonal().array().abs();
+    m_conditionNumber = absTdiag.maxCoeff() / absTdiag.minCoeff();
+  }
+
+  if (res>RealScalar(0.5) && res>(1-res)*std::pow(m_conditionNumber, res)) {
+    --res;
+    ++*intpart;
+  }
+  return res;
+}
+
+template<typename MatrixType>
+template<typename ResultType>
+void MatrixPower<MatrixType>::computeIntPower(ResultType& res, RealScalar p)
+{
+  RealScalar pp = std::abs(p);
+
+  if (p<0)  m_tmp = m_A.inverse();
+  else      m_tmp = m_A;
+
+  res = MatrixType::Identity(rows(), cols());
+  while (pp >= 1) {
+    if (std::fmod(pp, 2) >= 1)
+      res = m_tmp * res;
+    m_tmp *= m_tmp;
+    pp /= 2;
+  }
+}
+
+template<typename MatrixType>
+template<typename ResultType>
+void MatrixPower<MatrixType>::computeFracPower(ResultType& res, RealScalar p)
+{
+  if (p) {
+    eigen_assert(m_conditionNumber);
+    MatrixPowerAtomic<ComplexMatrix>(m_T, p).compute(m_fT);
+    revertSchur(m_tmp, m_fT, m_U);
+    res = m_tmp * res;
+  }
+}
+
+template<typename MatrixType>
+template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+inline void MatrixPower<MatrixType>::revertSchur(
+    Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
+    const ComplexMatrix& T,
+    const ComplexMatrix& U)
+{ res.noalias() = U * (T.template triangularView<Upper>() * U.adjoint()); }
+
+template<typename MatrixType>
+template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+inline void MatrixPower<MatrixType>::revertSchur(
+    Matrix<RealScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
+    const ComplexMatrix& T,
+    const ComplexMatrix& U)
+{ res.noalias() = (U * (T.template triangularView<Upper>() * U.adjoint())).real(); }
+
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix power of some matrix (expression).
+ *
+ * \tparam Derived  type of the base, a matrix (expression).
+ *
+ * This class holds the arguments to the matrix power until it is
+ * assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixBase::pow() and related functions and most of the
+ * time this is the only way it is used.
+ */
+template<typename Derived>
+class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue<Derived> >
+{
+  public:
+    typedef typename Derived::PlainObject PlainObject;
+    typedef typename Derived::RealScalar RealScalar;
+    typedef typename Derived::Index Index;
+
+    /**
+     * \brief Constructor.
+     *
+     * \param[in] A  %Matrix (expression), the base of the matrix power.
+     * \param[in] p  scalar, the exponent of the matrix power.
+     */
+    MatrixPowerReturnValue(const Derived& A, RealScalar p) : m_A(A), m_p(p)
+    { }
+
+    /**
+     * \brief Compute the matrix power.
+     *
+     * \param[out] result  \f$ A^p \f$ where \p A and \p p are as in the
+     * constructor.
+     */
+    template<typename ResultType>
+    inline void evalTo(ResultType& res) const
+    { MatrixPower<PlainObject>(m_A.eval()).compute(res, m_p); }
+
+    Index rows() const { return m_A.rows(); }
+    Index cols() const { return m_A.cols(); }
+
+  private:
+    const Derived& m_A;
+    const RealScalar m_p;
+    MatrixPowerReturnValue& operator=(const MatrixPowerReturnValue&);
+};
+
+namespace internal {
+
+template<typename MatrixPowerType>
+struct traits< MatrixPowerRetval<MatrixPowerType> >
+{ typedef typename MatrixPowerType::PlainObject ReturnType; };
+
+template<typename Derived>
+struct traits< MatrixPowerReturnValue<Derived> >
+{ typedef typename Derived::PlainObject ReturnType; };
+
+}
+
+template<typename Derived>
+const MatrixPowerReturnValue<Derived> MatrixBase<Derived>::pow(const RealScalar& p) const
+{ return MatrixPowerReturnValue<Derived>(derived(), p); }
+
+} // namespace Eigen
+
+#endif // EIGEN_MATRIX_POWER
diff --git a/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
new file mode 100644
index 0000000..b48ea9d
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
@@ -0,0 +1,466 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Jitse Niesen <jitse at maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_SQUARE_ROOT
+#define EIGEN_MATRIX_SQUARE_ROOT
+
+namespace Eigen { 
+
+/** \ingroup MatrixFunctions_Module
+  * \brief Class for computing matrix square roots of upper quasi-triangular matrices.
+  * \tparam  MatrixType  type of the argument of the matrix square root,
+  *                      expected to be an instantiation of the Matrix class template.
+  *
+  * This class computes the square root of the upper quasi-triangular
+  * matrix stored in the upper Hessenberg part of the matrix passed to
+  * the constructor.
+  *
+  * \sa MatrixSquareRoot, MatrixSquareRootTriangular
+  */
+template <typename MatrixType>
+class MatrixSquareRootQuasiTriangular
+{
+  public:
+
+    /** \brief Constructor. 
+      *
+      * \param[in]  A  upper quasi-triangular matrix whose square root 
+      *                is to be computed.
+      *
+      * The class stores a reference to \p A, so it should not be
+      * changed (or destroyed) before compute() is called.
+      */
+    MatrixSquareRootQuasiTriangular(const MatrixType& A) 
+      : m_A(A) 
+    {
+      eigen_assert(A.rows() == A.cols());
+    }
+    
+    /** \brief Compute the matrix square root
+      *
+      * \param[out] result  square root of \p A, as specified in the constructor.
+      *
+      * Only the upper Hessenberg part of \p result is updated, the
+      * rest is not touched.  See MatrixBase::sqrt() for details on
+      * how this computation is implemented.
+      */
+    template <typename ResultType> void compute(ResultType &result);    
+    
+  private:
+    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::Scalar Scalar;
+    
+    void computeDiagonalPartOfSqrt(MatrixType& sqrtT, const MatrixType& T);
+    void computeOffDiagonalPartOfSqrt(MatrixType& sqrtT, const MatrixType& T);
+    void compute2x2diagonalBlock(MatrixType& sqrtT, const MatrixType& T, typename MatrixType::Index i);
+    void compute1x1offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
+				  typename MatrixType::Index i, typename MatrixType::Index j);
+    void compute1x2offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
+				  typename MatrixType::Index i, typename MatrixType::Index j);
+    void compute2x1offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
+				  typename MatrixType::Index i, typename MatrixType::Index j);
+    void compute2x2offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
+				  typename MatrixType::Index i, typename MatrixType::Index j);
+  
+    template <typename SmallMatrixType>
+    static void solveAuxiliaryEquation(SmallMatrixType& X, const SmallMatrixType& A, 
+				     const SmallMatrixType& B, const SmallMatrixType& C);
+  
+    const MatrixType& m_A;
+};
+
+template <typename MatrixType>
+template <typename ResultType> 
+void MatrixSquareRootQuasiTriangular<MatrixType>::compute(ResultType &result)
+{
+  result.resize(m_A.rows(), m_A.cols());
+  computeDiagonalPartOfSqrt(result, m_A);
+  computeOffDiagonalPartOfSqrt(result, m_A);
+}
+
+// pre:  T is quasi-upper-triangular and sqrtT is a zero matrix of the same size
+// post: the diagonal blocks of sqrtT are the square roots of the diagonal blocks of T
+template <typename MatrixType>
+void MatrixSquareRootQuasiTriangular<MatrixType>::computeDiagonalPartOfSqrt(MatrixType& sqrtT, 
+									  const MatrixType& T)
+{
+  using std::sqrt;
+  const Index size = m_A.rows();
+  for (Index i = 0; i < size; i++) {
+    if (i == size - 1 || T.coeff(i+1, i) == 0) {
+      eigen_assert(T(i,i) >= 0);
+      sqrtT.coeffRef(i,i) = sqrt(T.coeff(i,i));
+    }
+    else {
+      compute2x2diagonalBlock(sqrtT, T, i);
+      ++i;
+    }
+  }
+}
+
+// pre:  T is quasi-upper-triangular and diagonal blocks of sqrtT are square root of diagonal blocks of T.
+// post: sqrtT is the square root of T.
+template <typename MatrixType>
+void MatrixSquareRootQuasiTriangular<MatrixType>::computeOffDiagonalPartOfSqrt(MatrixType& sqrtT, 
+									     const MatrixType& T)
+{
+  const Index size = m_A.rows();
+  for (Index j = 1; j < size; j++) {
+      if (T.coeff(j, j-1) != 0)  // if T(j-1:j, j-1:j) is a 2-by-2 block
+	continue;
+    for (Index i = j-1; i >= 0; i--) {
+      if (i > 0 && T.coeff(i, i-1) != 0)  // if T(i-1:i, i-1:i) is a 2-by-2 block
+	continue;
+      bool iBlockIs2x2 = (i < size - 1) && (T.coeff(i+1, i) != 0);
+      bool jBlockIs2x2 = (j < size - 1) && (T.coeff(j+1, j) != 0);
+      if (iBlockIs2x2 && jBlockIs2x2) 
+	compute2x2offDiagonalBlock(sqrtT, T, i, j);
+      else if (iBlockIs2x2 && !jBlockIs2x2) 
+	compute2x1offDiagonalBlock(sqrtT, T, i, j);
+      else if (!iBlockIs2x2 && jBlockIs2x2) 
+	compute1x2offDiagonalBlock(sqrtT, T, i, j);
+      else if (!iBlockIs2x2 && !jBlockIs2x2) 
+	compute1x1offDiagonalBlock(sqrtT, T, i, j);
+    }
+  }
+}
+
+// pre:  T.block(i,i,2,2) has complex conjugate eigenvalues
+// post: sqrtT.block(i,i,2,2) is square root of T.block(i,i,2,2)
+template <typename MatrixType>
+void MatrixSquareRootQuasiTriangular<MatrixType>
+     ::compute2x2diagonalBlock(MatrixType& sqrtT, const MatrixType& T, typename MatrixType::Index i)
+{
+  // TODO: This case (2-by-2 blocks with complex conjugate eigenvalues) is probably hidden somewhere
+  //       in EigenSolver. If we expose it, we could call it directly from here.
+  Matrix<Scalar,2,2> block = T.template block<2,2>(i,i);
+  EigenSolver<Matrix<Scalar,2,2> > es(block);
+  sqrtT.template block<2,2>(i,i)
+    = (es.eigenvectors() * es.eigenvalues().cwiseSqrt().asDiagonal() * es.eigenvectors().inverse()).real();
+}
+
+// pre:  block structure of T is such that (i,j) is a 1x1 block,
+//       all blocks of sqrtT to left of and below (i,j) are correct
+// post: sqrtT(i,j) has the correct value
+template <typename MatrixType>
+void MatrixSquareRootQuasiTriangular<MatrixType>
+     ::compute1x1offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
+				  typename MatrixType::Index i, typename MatrixType::Index j)
+{
+  Scalar tmp = (sqrtT.row(i).segment(i+1,j-i-1) * sqrtT.col(j).segment(i+1,j-i-1)).value();
+  sqrtT.coeffRef(i,j) = (T.coeff(i,j) - tmp) / (sqrtT.coeff(i,i) + sqrtT.coeff(j,j));
+}
+
+// similar to compute1x1offDiagonalBlock()
+template <typename MatrixType>
+void MatrixSquareRootQuasiTriangular<MatrixType>
+     ::compute1x2offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
+				  typename MatrixType::Index i, typename MatrixType::Index j)
+{
+  Matrix<Scalar,1,2> rhs = T.template block<1,2>(i,j);
+  if (j-i > 1)
+    rhs -= sqrtT.block(i, i+1, 1, j-i-1) * sqrtT.block(i+1, j, j-i-1, 2);
+  Matrix<Scalar,2,2> A = sqrtT.coeff(i,i) * Matrix<Scalar,2,2>::Identity();
+  A += sqrtT.template block<2,2>(j,j).transpose();
+  sqrtT.template block<1,2>(i,j).transpose() = A.fullPivLu().solve(rhs.transpose());
+}
+
+// similar to compute1x1offDiagonalBlock()
+template <typename MatrixType>
+void MatrixSquareRootQuasiTriangular<MatrixType>
+     ::compute2x1offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
+				  typename MatrixType::Index i, typename MatrixType::Index j)
+{
+  Matrix<Scalar,2,1> rhs = T.template block<2,1>(i,j);
+  if (j-i > 2)
+    rhs -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 1);
+  Matrix<Scalar,2,2> A = sqrtT.coeff(j,j) * Matrix<Scalar,2,2>::Identity();
+  A += sqrtT.template block<2,2>(i,i);
+  sqrtT.template block<2,1>(i,j) = A.fullPivLu().solve(rhs);
+}
+
+// similar to compute1x1offDiagonalBlock()
+template <typename MatrixType>
+void MatrixSquareRootQuasiTriangular<MatrixType>
+     ::compute2x2offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
+				  typename MatrixType::Index i, typename MatrixType::Index j)
+{
+  Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
+  Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j);
+  Matrix<Scalar,2,2> C = T.template block<2,2>(i,j);
+  if (j-i > 2)
+    C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2);
+  Matrix<Scalar,2,2> X;
+  solveAuxiliaryEquation(X, A, B, C);
+  sqrtT.template block<2,2>(i,j) = X;
+}
+
+// solves the equation A X + X B = C where all matrices are 2-by-2
+template <typename MatrixType>
+template <typename SmallMatrixType>
+void MatrixSquareRootQuasiTriangular<MatrixType>
+     ::solveAuxiliaryEquation(SmallMatrixType& X, const SmallMatrixType& A,
+			      const SmallMatrixType& B, const SmallMatrixType& C)
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<SmallMatrixType, Matrix<Scalar,2,2> >::value),
+		      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+
+  Matrix<Scalar,4,4> coeffMatrix = Matrix<Scalar,4,4>::Zero();
+  coeffMatrix.coeffRef(0,0) = A.coeff(0,0) + B.coeff(0,0);
+  coeffMatrix.coeffRef(1,1) = A.coeff(0,0) + B.coeff(1,1);
+  coeffMatrix.coeffRef(2,2) = A.coeff(1,1) + B.coeff(0,0);
+  coeffMatrix.coeffRef(3,3) = A.coeff(1,1) + B.coeff(1,1);
+  coeffMatrix.coeffRef(0,1) = B.coeff(1,0);
+  coeffMatrix.coeffRef(0,2) = A.coeff(0,1);
+  coeffMatrix.coeffRef(1,0) = B.coeff(0,1);
+  coeffMatrix.coeffRef(1,3) = A.coeff(0,1);
+  coeffMatrix.coeffRef(2,0) = A.coeff(1,0);
+  coeffMatrix.coeffRef(2,3) = B.coeff(1,0);
+  coeffMatrix.coeffRef(3,1) = A.coeff(1,0);
+  coeffMatrix.coeffRef(3,2) = B.coeff(0,1);
+  
+  Matrix<Scalar,4,1> rhs;
+  rhs.coeffRef(0) = C.coeff(0,0);
+  rhs.coeffRef(1) = C.coeff(0,1);
+  rhs.coeffRef(2) = C.coeff(1,0);
+  rhs.coeffRef(3) = C.coeff(1,1);
+  
+  Matrix<Scalar,4,1> result;
+  result = coeffMatrix.fullPivLu().solve(rhs);
+
+  X.coeffRef(0,0) = result.coeff(0);
+  X.coeffRef(0,1) = result.coeff(1);
+  X.coeffRef(1,0) = result.coeff(2);
+  X.coeffRef(1,1) = result.coeff(3);
+}
+
+
+/** \ingroup MatrixFunctions_Module
+  * \brief Class for computing matrix square roots of upper triangular matrices.
+  * \tparam  MatrixType  type of the argument of the matrix square root,
+  *                      expected to be an instantiation of the Matrix class template.
+  *
+  * This class computes the square root of the upper triangular matrix
+  * stored in the upper triangular part (including the diagonal) of
+  * the matrix passed to the constructor.
+  *
+  * \sa MatrixSquareRoot, MatrixSquareRootQuasiTriangular
+  */
+template <typename MatrixType>
+class MatrixSquareRootTriangular
+{
+  public:
+    MatrixSquareRootTriangular(const MatrixType& A) 
+      : m_A(A) 
+    {
+      eigen_assert(A.rows() == A.cols());
+    }
+
+    /** \brief Compute the matrix square root
+      *
+      * \param[out] result  square root of \p A, as specified in the constructor.
+      *
+      * Only the upper triangular part (including the diagonal) of 
+      * \p result is updated, the rest is not touched.  See
+      * MatrixBase::sqrt() for details on how this computation is
+      * implemented.
+      */
+    template <typename ResultType> void compute(ResultType &result);    
+
+ private:
+    const MatrixType& m_A;
+};
+
+template <typename MatrixType>
+template <typename ResultType> 
+void MatrixSquareRootTriangular<MatrixType>::compute(ResultType &result)
+{
+  using std::sqrt;
+
+  // Compute square root of m_A and store it in upper triangular part of result
+  // This uses that the square root of triangular matrices can be computed directly.
+  result.resize(m_A.rows(), m_A.cols());
+  typedef typename MatrixType::Index Index;
+  for (Index i = 0; i < m_A.rows(); i++) {
+    result.coeffRef(i,i) = sqrt(m_A.coeff(i,i));
+  }
+  for (Index j = 1; j < m_A.cols(); j++) {
+    for (Index i = j-1; i >= 0; i--) {
+      typedef typename MatrixType::Scalar Scalar;
+      // if i = j-1, then segment has length 0 so tmp = 0
+      Scalar tmp = (result.row(i).segment(i+1,j-i-1) * result.col(j).segment(i+1,j-i-1)).value();
+      // denominator may be zero if original matrix is singular
+      result.coeffRef(i,j) = (m_A.coeff(i,j) - tmp) / (result.coeff(i,i) + result.coeff(j,j));
+    }
+  }
+}
+
+
+/** \ingroup MatrixFunctions_Module
+  * \brief Class for computing matrix square roots of general matrices.
+  * \tparam  MatrixType  type of the argument of the matrix square root,
+  *                      expected to be an instantiation of the Matrix class template.
+  *
+  * \sa MatrixSquareRootTriangular, MatrixSquareRootQuasiTriangular, MatrixBase::sqrt()
+  */
+template <typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
+class MatrixSquareRoot
+{
+  public:
+
+    /** \brief Constructor. 
+      *
+      * \param[in]  A  matrix whose square root is to be computed.
+      *
+      * The class stores a reference to \p A, so it should not be
+      * changed (or destroyed) before compute() is called.
+      */
+    MatrixSquareRoot(const MatrixType& A); 
+    
+    /** \brief Compute the matrix square root
+      *
+      * \param[out] result  square root of \p A, as specified in the constructor.
+      *
+      * See MatrixBase::sqrt() for details on how this computation is
+      * implemented.
+      */
+    template <typename ResultType> void compute(ResultType &result);    
+};
+
+
+// ********** Partial specialization for real matrices **********
+
+template <typename MatrixType>
+class MatrixSquareRoot<MatrixType, 0>
+{
+  public:
+
+    MatrixSquareRoot(const MatrixType& A) 
+      : m_A(A) 
+    {  
+      eigen_assert(A.rows() == A.cols());
+    }
+  
+    template <typename ResultType> void compute(ResultType &result)
+    {
+      // Compute Schur decomposition of m_A
+      const RealSchur<MatrixType> schurOfA(m_A);  
+      const MatrixType& T = schurOfA.matrixT();
+      const MatrixType& U = schurOfA.matrixU();
+    
+      // Compute square root of T
+      MatrixType sqrtT = MatrixType::Zero(m_A.rows(), m_A.cols());
+      MatrixSquareRootQuasiTriangular<MatrixType>(T).compute(sqrtT);
+    
+      // Compute square root of m_A
+      result = U * sqrtT * U.adjoint();
+    }
+    
+  private:
+    const MatrixType& m_A;
+};
+
+
+// ********** Partial specialization for complex matrices **********
+
+template <typename MatrixType>
+class MatrixSquareRoot<MatrixType, 1>
+{
+  public:
+
+    MatrixSquareRoot(const MatrixType& A) 
+      : m_A(A) 
+    {  
+      eigen_assert(A.rows() == A.cols());
+    }
+  
+    template <typename ResultType> void compute(ResultType &result)
+    {
+      // Compute Schur decomposition of m_A
+      const ComplexSchur<MatrixType> schurOfA(m_A);  
+      const MatrixType& T = schurOfA.matrixT();
+      const MatrixType& U = schurOfA.matrixU();
+    
+      // Compute square root of T
+      MatrixType sqrtT;
+      MatrixSquareRootTriangular<MatrixType>(T).compute(sqrtT);
+    
+      // Compute square root of m_A
+      result = U * (sqrtT.template triangularView<Upper>() * U.adjoint());
+    }
+    
+  private:
+    const MatrixType& m_A;
+};
+
+
+/** \ingroup MatrixFunctions_Module
+  *
+  * \brief Proxy for the matrix square root of some matrix (expression).
+  *
+  * \tparam Derived  Type of the argument to the matrix square root.
+  *
+  * This class holds the argument to the matrix square root until it
+  * is assigned or evaluated for some other reason (so the argument
+  * should not be changed in the meantime). It is the return type of
+  * MatrixBase::sqrt() and most of the time this is the only way it is
+  * used.
+  */
+template<typename Derived> class MatrixSquareRootReturnValue
+: public ReturnByValue<MatrixSquareRootReturnValue<Derived> >
+{
+    typedef typename Derived::Index Index;
+  public:
+    /** \brief Constructor.
+      *
+      * \param[in]  src  %Matrix (expression) forming the argument of the
+      * matrix square root.
+      */
+    MatrixSquareRootReturnValue(const Derived& src) : m_src(src) { }
+
+    /** \brief Compute the matrix square root.
+      *
+      * \param[out]  result  the matrix square root of \p src in the
+      * constructor.
+      */
+    template <typename ResultType>
+    inline void evalTo(ResultType& result) const
+    {
+      const typename Derived::PlainObject srcEvaluated = m_src.eval();
+      MatrixSquareRoot<typename Derived::PlainObject> me(srcEvaluated);
+      me.compute(result);
+    }
+
+    Index rows() const { return m_src.rows(); }
+    Index cols() const { return m_src.cols(); }
+
+  protected:
+    const Derived& m_src;
+  private:
+    MatrixSquareRootReturnValue& operator=(const MatrixSquareRootReturnValue&);
+};
+
+namespace internal {
+template<typename Derived>
+struct traits<MatrixSquareRootReturnValue<Derived> >
+{
+  typedef typename Derived::PlainObject ReturnType;
+};
+}
+
+template <typename Derived>
+const MatrixSquareRootReturnValue<Derived> MatrixBase<Derived>::sqrt() const
+{
+  eigen_assert(rows() == cols());
+  return MatrixSquareRootReturnValue<Derived>(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_FUNCTION
diff --git a/include/eigen3/unsupported/Eigen/src/MatrixFunctions/StemFunction.h b/include/eigen3/unsupported/Eigen/src/MatrixFunctions/StemFunction.h
new file mode 100644
index 0000000..724e55c
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/MatrixFunctions/StemFunction.h
@@ -0,0 +1,105 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Jitse Niesen <jitse at maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STEM_FUNCTION
+#define EIGEN_STEM_FUNCTION
+
+namespace Eigen { 
+
+/** \ingroup MatrixFunctions_Module 
+  * \brief Stem functions corresponding to standard mathematical functions.
+  */
+template <typename Scalar>
+class StdStemFunctions
+{
+  public:
+
+    /** \brief The exponential function (and its derivatives). */
+    static Scalar exp(Scalar x, int)
+    {
+      return std::exp(x);
+    }
+
+    /** \brief Cosine (and its derivatives). */
+    static Scalar cos(Scalar x, int n)
+    {
+      Scalar res;
+      switch (n % 4) {
+      case 0: 
+	res = std::cos(x);
+	break;
+      case 1:
+	res = -std::sin(x);
+	break;
+      case 2:
+	res = -std::cos(x);
+	break;
+      case 3:
+	res = std::sin(x);
+	break;
+      }
+      return res;
+    }
+
+    /** \brief Sine (and its derivatives). */
+    static Scalar sin(Scalar x, int n)
+    {
+      Scalar res;
+      switch (n % 4) {
+      case 0:
+	res = std::sin(x);
+	break;
+      case 1:
+	res = std::cos(x);
+	break;
+      case 2:
+	res = -std::sin(x);
+	break;
+      case 3:
+	res = -std::cos(x);
+	break;
+      }
+      return res;
+    }
+
+    /** \brief Hyperbolic cosine (and its derivatives). */
+    static Scalar cosh(Scalar x, int n)
+    {
+      Scalar res;
+      switch (n % 2) {
+      case 0:
+	res = std::cosh(x);
+	break;
+      case 1:
+	res = std::sinh(x);
+	break;
+      }
+      return res;
+    }
+	
+    /** \brief Hyperbolic sine (and its derivatives). */
+    static Scalar sinh(Scalar x, int n)
+    {
+      Scalar res;
+      switch (n % 2) {
+      case 0:
+	res = std::sinh(x);
+	break;
+      case 1:
+	res = std::cosh(x);
+	break;
+      }
+      return res;
+    }
+
+}; // end of class StdStemFunctions
+
+} // end namespace Eigen
+
+#endif // EIGEN_STEM_FUNCTION
diff --git a/include/eigen3/unsupported/Eigen/src/MoreVectorization/MathFunctions.h b/include/eigen3/unsupported/Eigen/src/MoreVectorization/MathFunctions.h
new file mode 100644
index 0000000..63cb28d
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/MoreVectorization/MathFunctions.h
@@ -0,0 +1,95 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Rohit Garg <rpg.314 at gmail.com>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1 at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H
+#define EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal \returns the arcsin of \a a (coeff-wise) */
+template<typename Packet> inline static Packet pasin(Packet a) { return std::asin(a); }
+
+#ifdef EIGEN_VECTORIZE_SSE
+
+template<> EIGEN_DONT_INLINE Packet4f pasin(Packet4f x)
+{
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5);
+  _EIGEN_DECLARE_CONST_Packet4f(3half, 1.5);
+
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
+
+  _EIGEN_DECLARE_CONST_Packet4f(pi, 3.141592654);
+  _EIGEN_DECLARE_CONST_Packet4f(pi_over_2, 3.141592654*0.5);
+
+  _EIGEN_DECLARE_CONST_Packet4f(asin1, 4.2163199048E-2);
+  _EIGEN_DECLARE_CONST_Packet4f(asin2, 2.4181311049E-2);
+  _EIGEN_DECLARE_CONST_Packet4f(asin3, 4.5470025998E-2);
+  _EIGEN_DECLARE_CONST_Packet4f(asin4, 7.4953002686E-2);
+  _EIGEN_DECLARE_CONST_Packet4f(asin5, 1.6666752422E-1);
+
+  Packet4f a = pabs(x);//got the absolute value
+
+  Packet4f sign_bit= _mm_and_ps(x, p4f_sign_mask);//extracted the sign bit
+
+  Packet4f z1,z2;//will need them during computation    
+
+
+//will compute the two branches for asin
+//so first compare with half
+
+  Packet4f branch_mask= _mm_cmpgt_ps(a, p4f_half);//this is to select which branch to take
+//both will be taken, and finally results will be merged
+//the branch for values >0.5
+
+    {
+//the core series expansion 
+    z1=pmadd(p4f_minus_half,a,p4f_half);
+    Packet4f x1=psqrt(z1);
+    Packet4f s1=pmadd(p4f_asin1, z1, p4f_asin2);
+    Packet4f s2=pmadd(s1, z1, p4f_asin3);
+    Packet4f s3=pmadd(s2,z1, p4f_asin4);
+    Packet4f s4=pmadd(s3,z1, p4f_asin5);
+    Packet4f temp=pmul(s4,z1);//not really a madd but a mul by z so that the next term can be a madd
+    z1=pmadd(temp,x1,x1);
+    z1=padd(z1,z1);
+    z1=psub(p4f_pi_over_2,z1);
+    }
+
+    {
+//the core series expansion 
+    Packet4f x2=a;
+    z2=pmul(x2,x2);
+    Packet4f s1=pmadd(p4f_asin1, z2, p4f_asin2);
+    Packet4f s2=pmadd(s1, z2, p4f_asin3);
+    Packet4f s3=pmadd(s2,z2, p4f_asin4);
+    Packet4f s4=pmadd(s3,z2, p4f_asin5);
+    Packet4f temp=pmul(s4,z2);//not really a madd but a mul by z so that the next term can be a madd
+    z2=pmadd(temp,x2,x2);
+    }
+
+/* select the correct result from the two branch evaluations */
+  z1  = _mm_and_ps(branch_mask, z1);
+  z2  = _mm_andnot_ps(branch_mask, z2);
+  Packet4f z  = _mm_or_ps(z1,z2);
+
+/* update the sign */
+  return _mm_xor_ps(z, sign_bit);
+}
+
+#endif // EIGEN_VECTORIZE_SSE
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H
diff --git a/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
new file mode 100644
index 0000000..b8ba6dd
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
@@ -0,0 +1,601 @@
+// -*- coding: utf-8
+// vim: set fileencoding=utf-8
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel at freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_HYBRIDNONLINEARSOLVER_H
+#define EIGEN_HYBRIDNONLINEARSOLVER_H
+
+namespace Eigen { 
+
+namespace HybridNonLinearSolverSpace { 
+    enum Status {
+        Running = -1,
+        ImproperInputParameters = 0,
+        RelativeErrorTooSmall = 1,
+        TooManyFunctionEvaluation = 2,
+        TolTooSmall = 3,
+        NotMakingProgressJacobian = 4,
+        NotMakingProgressIterations = 5,
+        UserAsked = 6
+    };
+}
+
+/**
+  * \ingroup NonLinearOptimization_Module
+  * \brief Finds a zero of a system of n
+  * nonlinear functions in n variables by a modification of the Powell
+  * hybrid method ("dogleg").
+  *
+  * The user must provide a subroutine which calculates the
+  * functions. The Jacobian is either provided by the user, or approximated
+  * using a forward-difference method.
+  *
+  */
+template<typename FunctorType, typename Scalar=double>
+class HybridNonLinearSolver
+{
+public:
+    typedef DenseIndex Index;
+
+    HybridNonLinearSolver(FunctorType &_functor)
+        : functor(_functor) { nfev=njev=iter = 0;  fnorm= 0.; useExternalScaling=false;}
+
+    struct Parameters {
+        Parameters()
+            : factor(Scalar(100.))
+            , maxfev(1000)
+            , xtol(std::sqrt(NumTraits<Scalar>::epsilon()))
+            , nb_of_subdiagonals(-1)
+            , nb_of_superdiagonals(-1)
+            , epsfcn(Scalar(0.)) {}
+        Scalar factor;
+        Index maxfev;   // maximum number of function evaluation
+        Scalar xtol;
+        Index nb_of_subdiagonals;
+        Index nb_of_superdiagonals;
+        Scalar epsfcn;
+    };
+    typedef Matrix< Scalar, Dynamic, 1 > FVectorType;
+    typedef Matrix< Scalar, Dynamic, Dynamic > JacobianType;
+    /* TODO: if eigen provides a triangular storage, use it here */
+    typedef Matrix< Scalar, Dynamic, Dynamic > UpperTriangularType;
+
+    HybridNonLinearSolverSpace::Status hybrj1(
+            FVectorType  &x,
+            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            );
+
+    HybridNonLinearSolverSpace::Status solveInit(FVectorType  &x);
+    HybridNonLinearSolverSpace::Status solveOneStep(FVectorType  &x);
+    HybridNonLinearSolverSpace::Status solve(FVectorType  &x);
+
+    HybridNonLinearSolverSpace::Status hybrd1(
+            FVectorType  &x,
+            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            );
+
+    HybridNonLinearSolverSpace::Status solveNumericalDiffInit(FVectorType  &x);
+    HybridNonLinearSolverSpace::Status solveNumericalDiffOneStep(FVectorType  &x);
+    HybridNonLinearSolverSpace::Status solveNumericalDiff(FVectorType  &x);
+
+    void resetParameters(void) { parameters = Parameters(); }
+    Parameters parameters;
+    FVectorType  fvec, qtf, diag;
+    JacobianType fjac;
+    UpperTriangularType R;
+    Index nfev;
+    Index njev;
+    Index iter;
+    Scalar fnorm;
+    bool useExternalScaling; 
+private:
+    FunctorType &functor;
+    Index n;
+    Scalar sum;
+    bool sing;
+    Scalar temp;
+    Scalar delta;
+    bool jeval;
+    Index ncsuc;
+    Scalar ratio;
+    Scalar pnorm, xnorm, fnorm1;
+    Index nslow1, nslow2;
+    Index ncfail;
+    Scalar actred, prered;
+    FVectorType wa1, wa2, wa3, wa4;
+
+    HybridNonLinearSolver& operator=(const HybridNonLinearSolver&);
+};
+
+
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::hybrj1(
+        FVectorType  &x,
+        const Scalar tol
+        )
+{
+    n = x.size();
+
+    /* check the input parameters for errors. */
+    if (n <= 0 || tol < 0.)
+        return HybridNonLinearSolverSpace::ImproperInputParameters;
+
+    resetParameters();
+    parameters.maxfev = 100*(n+1);
+    parameters.xtol = tol;
+    diag.setConstant(n, 1.);
+    useExternalScaling = true;
+    return solve(x);
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveInit(FVectorType  &x)
+{
+    n = x.size();
+
+    wa1.resize(n); wa2.resize(n); wa3.resize(n); wa4.resize(n);
+    fvec.resize(n);
+    qtf.resize(n);
+    fjac.resize(n, n);
+    if (!useExternalScaling)
+        diag.resize(n);
+    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+
+    /* Function Body */
+    nfev = 0;
+    njev = 0;
+
+    /*     check the input parameters for errors. */
+    if (n <= 0 || parameters.xtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0. )
+        return HybridNonLinearSolverSpace::ImproperInputParameters;
+    if (useExternalScaling)
+        for (Index j = 0; j < n; ++j)
+            if (diag[j] <= 0.)
+                return HybridNonLinearSolverSpace::ImproperInputParameters;
+
+    /*     evaluate the function at the starting point */
+    /*     and calculate its norm. */
+    nfev = 1;
+    if ( functor(x, fvec) < 0)
+        return HybridNonLinearSolverSpace::UserAsked;
+    fnorm = fvec.stableNorm();
+
+    /*     initialize iteration counter and monitors. */
+    iter = 1;
+    ncsuc = 0;
+    ncfail = 0;
+    nslow1 = 0;
+    nslow2 = 0;
+
+    return HybridNonLinearSolverSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveOneStep(FVectorType  &x)
+{
+    using std::abs;
+    
+    eigen_assert(x.size()==n); // check the caller is not cheating us
+
+    Index j;
+    std::vector<JacobiRotation<Scalar> > v_givens(n), w_givens(n);
+
+    jeval = true;
+
+    /* calculate the jacobian matrix. */
+    if ( functor.df(x, fjac) < 0)
+        return HybridNonLinearSolverSpace::UserAsked;
+    ++njev;
+
+    wa2 = fjac.colwise().blueNorm();
+
+    /* on the first iteration and if external scaling is not used, scale according */
+    /* to the norms of the columns of the initial jacobian. */
+    if (iter == 1) {
+        if (!useExternalScaling)
+            for (j = 0; j < n; ++j)
+                diag[j] = (wa2[j]==0.) ? 1. : wa2[j];
+
+        /* on the first iteration, calculate the norm of the scaled x */
+        /* and initialize the step bound delta. */
+        xnorm = diag.cwiseProduct(x).stableNorm();
+        delta = parameters.factor * xnorm;
+        if (delta == 0.)
+            delta = parameters.factor;
+    }
+
+    /* compute the qr factorization of the jacobian. */
+    HouseholderQR<JacobianType> qrfac(fjac); // no pivoting:
+
+    /* copy the triangular factor of the qr factorization into r. */
+    R = qrfac.matrixQR();
+
+    /* accumulate the orthogonal factor in fjac. */
+    fjac = qrfac.householderQ();
+
+    /* form (q transpose)*fvec and store in qtf. */
+    qtf = fjac.transpose() * fvec;
+
+    /* rescale if necessary. */
+    if (!useExternalScaling)
+        diag = diag.cwiseMax(wa2);
+
+    while (true) {
+        /* determine the direction p. */
+        internal::dogleg<Scalar>(R, diag, qtf, delta, wa1);
+
+        /* store the direction p and x + p. calculate the norm of p. */
+        wa1 = -wa1;
+        wa2 = x + wa1;
+        pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+        /* on the first iteration, adjust the initial step bound. */
+        if (iter == 1)
+            delta = (std::min)(delta,pnorm);
+
+        /* evaluate the function at x + p and calculate its norm. */
+        if ( functor(wa2, wa4) < 0)
+            return HybridNonLinearSolverSpace::UserAsked;
+        ++nfev;
+        fnorm1 = wa4.stableNorm();
+
+        /* compute the scaled actual reduction. */
+        actred = -1.;
+        if (fnorm1 < fnorm) /* Computing 2nd power */
+            actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+        /* compute the scaled predicted reduction. */
+        wa3 = R.template triangularView<Upper>()*wa1 + qtf;
+        temp = wa3.stableNorm();
+        prered = 0.;
+        if (temp < fnorm) /* Computing 2nd power */
+            prered = 1. - numext::abs2(temp / fnorm);
+
+        /* compute the ratio of the actual to the predicted reduction. */
+        ratio = 0.;
+        if (prered > 0.)
+            ratio = actred / prered;
+
+        /* update the step bound. */
+        if (ratio < Scalar(.1)) {
+            ncsuc = 0;
+            ++ncfail;
+            delta = Scalar(.5) * delta;
+        } else {
+            ncfail = 0;
+            ++ncsuc;
+            if (ratio >= Scalar(.5) || ncsuc > 1)
+                delta = (std::max)(delta, pnorm / Scalar(.5));
+            if (abs(ratio - 1.) <= Scalar(.1)) {
+                delta = pnorm / Scalar(.5);
+            }
+        }
+
+        /* test for successful iteration. */
+        if (ratio >= Scalar(1e-4)) {
+            /* successful iteration. update x, fvec, and their norms. */
+            x = wa2;
+            wa2 = diag.cwiseProduct(x);
+            fvec = wa4;
+            xnorm = wa2.stableNorm();
+            fnorm = fnorm1;
+            ++iter;
+        }
+
+        /* determine the progress of the iteration. */
+        ++nslow1;
+        if (actred >= Scalar(.001))
+            nslow1 = 0;
+        if (jeval)
+            ++nslow2;
+        if (actred >= Scalar(.1))
+            nslow2 = 0;
+
+        /* test for convergence. */
+        if (delta <= parameters.xtol * xnorm || fnorm == 0.)
+            return HybridNonLinearSolverSpace::RelativeErrorTooSmall;
+
+        /* tests for termination and stringent tolerances. */
+        if (nfev >= parameters.maxfev)
+            return HybridNonLinearSolverSpace::TooManyFunctionEvaluation;
+        if (Scalar(.1) * (std::max)(Scalar(.1) * delta, pnorm) <= NumTraits<Scalar>::epsilon() * xnorm)
+            return HybridNonLinearSolverSpace::TolTooSmall;
+        if (nslow2 == 5)
+            return HybridNonLinearSolverSpace::NotMakingProgressJacobian;
+        if (nslow1 == 10)
+            return HybridNonLinearSolverSpace::NotMakingProgressIterations;
+
+        /* criterion for recalculating jacobian. */
+        if (ncfail == 2)
+            break; // leave inner loop and go for the next outer loop iteration
+
+        /* calculate the rank one modification to the jacobian */
+        /* and update qtf if necessary. */
+        wa1 = diag.cwiseProduct( diag.cwiseProduct(wa1)/pnorm );
+        wa2 = fjac.transpose() * wa4;
+        if (ratio >= Scalar(1e-4))
+            qtf = wa2;
+        wa2 = (wa2-wa3)/pnorm;
+
+        /* compute the qr factorization of the updated jacobian. */
+        internal::r1updt<Scalar>(R, wa1, v_givens, w_givens, wa2, wa3, &sing);
+        internal::r1mpyq<Scalar>(n, n, fjac.data(), v_givens, w_givens);
+        internal::r1mpyq<Scalar>(1, n, qtf.data(), v_givens, w_givens);
+
+        jeval = false;
+    }
+    return HybridNonLinearSolverSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solve(FVectorType  &x)
+{
+    HybridNonLinearSolverSpace::Status status = solveInit(x);
+    if (status==HybridNonLinearSolverSpace::ImproperInputParameters)
+        return status;
+    while (status==HybridNonLinearSolverSpace::Running)
+        status = solveOneStep(x);
+    return status;
+}
+
+
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::hybrd1(
+        FVectorType  &x,
+        const Scalar tol
+        )
+{
+    n = x.size();
+
+    /* check the input parameters for errors. */
+    if (n <= 0 || tol < 0.)
+        return HybridNonLinearSolverSpace::ImproperInputParameters;
+
+    resetParameters();
+    parameters.maxfev = 200*(n+1);
+    parameters.xtol = tol;
+
+    diag.setConstant(n, 1.);
+    useExternalScaling = true;
+    return solveNumericalDiff(x);
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiffInit(FVectorType  &x)
+{
+    n = x.size();
+
+    if (parameters.nb_of_subdiagonals<0) parameters.nb_of_subdiagonals= n-1;
+    if (parameters.nb_of_superdiagonals<0) parameters.nb_of_superdiagonals= n-1;
+
+    wa1.resize(n); wa2.resize(n); wa3.resize(n); wa4.resize(n);
+    qtf.resize(n);
+    fjac.resize(n, n);
+    fvec.resize(n);
+    if (!useExternalScaling)
+        diag.resize(n);
+    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+
+    /* Function Body */
+    nfev = 0;
+    njev = 0;
+
+    /*     check the input parameters for errors. */
+    if (n <= 0 || parameters.xtol < 0. || parameters.maxfev <= 0 || parameters.nb_of_subdiagonals< 0 || parameters.nb_of_superdiagonals< 0 || parameters.factor <= 0. )
+        return HybridNonLinearSolverSpace::ImproperInputParameters;
+    if (useExternalScaling)
+        for (Index j = 0; j < n; ++j)
+            if (diag[j] <= 0.)
+                return HybridNonLinearSolverSpace::ImproperInputParameters;
+
+    /*     evaluate the function at the starting point */
+    /*     and calculate its norm. */
+    nfev = 1;
+    if ( functor(x, fvec) < 0)
+        return HybridNonLinearSolverSpace::UserAsked;
+    fnorm = fvec.stableNorm();
+
+    /*     initialize iteration counter and monitors. */
+    iter = 1;
+    ncsuc = 0;
+    ncfail = 0;
+    nslow1 = 0;
+    nslow2 = 0;
+
+    return HybridNonLinearSolverSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiffOneStep(FVectorType  &x)
+{
+    using std::sqrt;
+    using std::abs;
+    
+    assert(x.size()==n); // check the caller is not cheating us
+
+    Index j;
+    std::vector<JacobiRotation<Scalar> > v_givens(n), w_givens(n);
+
+    jeval = true;
+    if (parameters.nb_of_subdiagonals<0) parameters.nb_of_subdiagonals= n-1;
+    if (parameters.nb_of_superdiagonals<0) parameters.nb_of_superdiagonals= n-1;
+
+    /* calculate the jacobian matrix. */
+    if (internal::fdjac1(functor, x, fvec, fjac, parameters.nb_of_subdiagonals, parameters.nb_of_superdiagonals, parameters.epsfcn) <0)
+        return HybridNonLinearSolverSpace::UserAsked;
+    nfev += (std::min)(parameters.nb_of_subdiagonals+parameters.nb_of_superdiagonals+ 1, n);
+
+    wa2 = fjac.colwise().blueNorm();
+
+    /* on the first iteration and if external scaling is not used, scale according */
+    /* to the norms of the columns of the initial jacobian. */
+    if (iter == 1) {
+        if (!useExternalScaling)
+            for (j = 0; j < n; ++j)
+                diag[j] = (wa2[j]==0.) ? 1. : wa2[j];
+
+        /* on the first iteration, calculate the norm of the scaled x */
+        /* and initialize the step bound delta. */
+        xnorm = diag.cwiseProduct(x).stableNorm();
+        delta = parameters.factor * xnorm;
+        if (delta == 0.)
+            delta = parameters.factor;
+    }
+
+    /* compute the qr factorization of the jacobian. */
+    HouseholderQR<JacobianType> qrfac(fjac); // no pivoting:
+
+    /* copy the triangular factor of the qr factorization into r. */
+    R = qrfac.matrixQR();
+
+    /* accumulate the orthogonal factor in fjac. */
+    fjac = qrfac.householderQ();
+
+    /* form (q transpose)*fvec and store in qtf. */
+    qtf = fjac.transpose() * fvec;
+
+    /* rescale if necessary. */
+    if (!useExternalScaling)
+        diag = diag.cwiseMax(wa2);
+
+    while (true) {
+        /* determine the direction p. */
+        internal::dogleg<Scalar>(R, diag, qtf, delta, wa1);
+
+        /* store the direction p and x + p. calculate the norm of p. */
+        wa1 = -wa1;
+        wa2 = x + wa1;
+        pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+        /* on the first iteration, adjust the initial step bound. */
+        if (iter == 1)
+            delta = (std::min)(delta,pnorm);
+
+        /* evaluate the function at x + p and calculate its norm. */
+        if ( functor(wa2, wa4) < 0)
+            return HybridNonLinearSolverSpace::UserAsked;
+        ++nfev;
+        fnorm1 = wa4.stableNorm();
+
+        /* compute the scaled actual reduction. */
+        actred = -1.;
+        if (fnorm1 < fnorm) /* Computing 2nd power */
+            actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+        /* compute the scaled predicted reduction. */
+        wa3 = R.template triangularView<Upper>()*wa1 + qtf;
+        temp = wa3.stableNorm();
+        prered = 0.;
+        if (temp < fnorm) /* Computing 2nd power */
+            prered = 1. - numext::abs2(temp / fnorm);
+
+        /* compute the ratio of the actual to the predicted reduction. */
+        ratio = 0.;
+        if (prered > 0.)
+            ratio = actred / prered;
+
+        /* update the step bound. */
+        if (ratio < Scalar(.1)) {
+            ncsuc = 0;
+            ++ncfail;
+            delta = Scalar(.5) * delta;
+        } else {
+            ncfail = 0;
+            ++ncsuc;
+            if (ratio >= Scalar(.5) || ncsuc > 1)
+                delta = (std::max)(delta, pnorm / Scalar(.5));
+            if (abs(ratio - 1.) <= Scalar(.1)) {
+                delta = pnorm / Scalar(.5);
+            }
+        }
+
+        /* test for successful iteration. */
+        if (ratio >= Scalar(1e-4)) {
+            /* successful iteration. update x, fvec, and their norms. */
+            x = wa2;
+            wa2 = diag.cwiseProduct(x);
+            fvec = wa4;
+            xnorm = wa2.stableNorm();
+            fnorm = fnorm1;
+            ++iter;
+        }
+
+        /* determine the progress of the iteration. */
+        ++nslow1;
+        if (actred >= Scalar(.001))
+            nslow1 = 0;
+        if (jeval)
+            ++nslow2;
+        if (actred >= Scalar(.1))
+            nslow2 = 0;
+
+        /* test for convergence. */
+        if (delta <= parameters.xtol * xnorm || fnorm == 0.)
+            return HybridNonLinearSolverSpace::RelativeErrorTooSmall;
+
+        /* tests for termination and stringent tolerances. */
+        if (nfev >= parameters.maxfev)
+            return HybridNonLinearSolverSpace::TooManyFunctionEvaluation;
+        if (Scalar(.1) * (std::max)(Scalar(.1) * delta, pnorm) <= NumTraits<Scalar>::epsilon() * xnorm)
+            return HybridNonLinearSolverSpace::TolTooSmall;
+        if (nslow2 == 5)
+            return HybridNonLinearSolverSpace::NotMakingProgressJacobian;
+        if (nslow1 == 10)
+            return HybridNonLinearSolverSpace::NotMakingProgressIterations;
+
+        /* criterion for recalculating jacobian. */
+        if (ncfail == 2)
+            break; // leave inner loop and go for the next outer loop iteration
+
+        /* calculate the rank one modification to the jacobian */
+        /* and update qtf if necessary. */
+        wa1 = diag.cwiseProduct( diag.cwiseProduct(wa1)/pnorm );
+        wa2 = fjac.transpose() * wa4;
+        if (ratio >= Scalar(1e-4))
+            qtf = wa2;
+        wa2 = (wa2-wa3)/pnorm;
+
+        /* compute the qr factorization of the updated jacobian. */
+        internal::r1updt<Scalar>(R, wa1, v_givens, w_givens, wa2, wa3, &sing);
+        internal::r1mpyq<Scalar>(n, n, fjac.data(), v_givens, w_givens);
+        internal::r1mpyq<Scalar>(1, n, qtf.data(), v_givens, w_givens);
+
+        jeval = false;
+    }
+    return HybridNonLinearSolverSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiff(FVectorType  &x)
+{
+    HybridNonLinearSolverSpace::Status status = solveNumericalDiffInit(x);
+    if (status==HybridNonLinearSolverSpace::ImproperInputParameters)
+        return status;
+    while (status==HybridNonLinearSolverSpace::Running)
+        status = solveNumericalDiffOneStep(x);
+    return status;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_HYBRIDNONLINEARSOLVER_H
+
+//vim: ai ts=4 sts=4 et sw=4
diff --git a/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
new file mode 100644
index 0000000..bfeb26f
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
@@ -0,0 +1,650 @@
+// -*- coding: utf-8
+// vim: set fileencoding=utf-8
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel at freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LEVENBERGMARQUARDT__H
+#define EIGEN_LEVENBERGMARQUARDT__H
+
+namespace Eigen { 
+
+namespace LevenbergMarquardtSpace {
+    enum Status {
+        NotStarted = -2,
+        Running = -1,
+        ImproperInputParameters = 0,
+        RelativeReductionTooSmall = 1,
+        RelativeErrorTooSmall = 2,
+        RelativeErrorAndReductionTooSmall = 3,
+        CosinusTooSmall = 4,
+        TooManyFunctionEvaluation = 5,
+        FtolTooSmall = 6,
+        XtolTooSmall = 7,
+        GtolTooSmall = 8,
+        UserAsked = 9
+    };
+}
+
+
+
+/**
+  * \ingroup NonLinearOptimization_Module
+  * \brief Performs non linear optimization over a non-linear function,
+  * using a variant of the Levenberg Marquardt algorithm.
+  *
+  * Check wikipedia for more information.
+  * http://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm
+  */
+template<typename FunctorType, typename Scalar=double>
+class LevenbergMarquardt
+{
+public:
+    LevenbergMarquardt(FunctorType &_functor)
+        : functor(_functor) { nfev = njev = iter = 0;  fnorm = gnorm = 0.; useExternalScaling=false; }
+
+    typedef DenseIndex Index;
+
+    struct Parameters {
+        Parameters()
+            : factor(Scalar(100.))
+            , maxfev(400)
+            , ftol(std::sqrt(NumTraits<Scalar>::epsilon()))
+            , xtol(std::sqrt(NumTraits<Scalar>::epsilon()))
+            , gtol(Scalar(0.))
+            , epsfcn(Scalar(0.)) {}
+        Scalar factor;
+        Index maxfev;   // maximum number of function evaluation
+        Scalar ftol;
+        Scalar xtol;
+        Scalar gtol;
+        Scalar epsfcn;
+    };
+
+    typedef Matrix< Scalar, Dynamic, 1 > FVectorType;
+    typedef Matrix< Scalar, Dynamic, Dynamic > JacobianType;
+
+    LevenbergMarquardtSpace::Status lmder1(
+            FVectorType &x,
+            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            );
+
+    LevenbergMarquardtSpace::Status minimize(FVectorType &x);
+    LevenbergMarquardtSpace::Status minimizeInit(FVectorType &x);
+    LevenbergMarquardtSpace::Status minimizeOneStep(FVectorType &x);
+
+    static LevenbergMarquardtSpace::Status lmdif1(
+            FunctorType &functor,
+            FVectorType &x,
+            Index *nfev,
+            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            );
+
+    LevenbergMarquardtSpace::Status lmstr1(
+            FVectorType  &x,
+            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            );
+
+    LevenbergMarquardtSpace::Status minimizeOptimumStorage(FVectorType  &x);
+    LevenbergMarquardtSpace::Status minimizeOptimumStorageInit(FVectorType  &x);
+    LevenbergMarquardtSpace::Status minimizeOptimumStorageOneStep(FVectorType  &x);
+
+    void resetParameters(void) { parameters = Parameters(); }
+
+    Parameters parameters;
+    FVectorType  fvec, qtf, diag;
+    JacobianType fjac;
+    PermutationMatrix<Dynamic,Dynamic> permutation;
+    Index nfev;
+    Index njev;
+    Index iter;
+    Scalar fnorm, gnorm;
+    bool useExternalScaling; 
+
+    Scalar lm_param(void) { return par; }
+private:
+    FunctorType &functor;
+    Index n;
+    Index m;
+    FVectorType wa1, wa2, wa3, wa4;
+
+    Scalar par, sum;
+    Scalar temp, temp1, temp2;
+    Scalar delta;
+    Scalar ratio;
+    Scalar pnorm, xnorm, fnorm1, actred, dirder, prered;
+
+    LevenbergMarquardt& operator=(const LevenbergMarquardt&);
+};
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::lmder1(
+        FVectorType  &x,
+        const Scalar tol
+        )
+{
+    n = x.size();
+    m = functor.values();
+
+    /* check the input parameters for errors. */
+    if (n <= 0 || m < n || tol < 0.)
+        return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    resetParameters();
+    parameters.ftol = tol;
+    parameters.xtol = tol;
+    parameters.maxfev = 100*(n+1);
+
+    return minimize(x);
+}
+
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimize(FVectorType  &x)
+{
+    LevenbergMarquardtSpace::Status status = minimizeInit(x);
+    if (status==LevenbergMarquardtSpace::ImproperInputParameters)
+        return status;
+    do {
+        status = minimizeOneStep(x);
+    } while (status==LevenbergMarquardtSpace::Running);
+    return status;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeInit(FVectorType  &x)
+{
+    n = x.size();
+    m = functor.values();
+
+    wa1.resize(n); wa2.resize(n); wa3.resize(n);
+    wa4.resize(m);
+    fvec.resize(m);
+    fjac.resize(m, n);
+    if (!useExternalScaling)
+        diag.resize(n);
+    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+    qtf.resize(n);
+
+    /* Function Body */
+    nfev = 0;
+    njev = 0;
+
+    /*     check the input parameters for errors. */
+    if (n <= 0 || m < n || parameters.ftol < 0. || parameters.xtol < 0. || parameters.gtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0.)
+        return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    if (useExternalScaling)
+        for (Index j = 0; j < n; ++j)
+            if (diag[j] <= 0.)
+                return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    /*     evaluate the function at the starting point */
+    /*     and calculate its norm. */
+    nfev = 1;
+    if ( functor(x, fvec) < 0)
+        return LevenbergMarquardtSpace::UserAsked;
+    fnorm = fvec.stableNorm();
+
+    /*     initialize levenberg-marquardt parameter and iteration counter. */
+    par = 0.;
+    iter = 1;
+
+    return LevenbergMarquardtSpace::NotStarted;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeOneStep(FVectorType  &x)
+{
+    using std::abs;
+    using std::sqrt;
+    
+    eigen_assert(x.size()==n); // check the caller is not cheating us
+
+    /* calculate the jacobian matrix. */
+    Index df_ret = functor.df(x, fjac);
+    if (df_ret<0)
+        return LevenbergMarquardtSpace::UserAsked;
+    if (df_ret>0)
+        // numerical diff, we evaluated the function df_ret times
+        nfev += df_ret;
+    else njev++;
+
+    /* compute the qr factorization of the jacobian. */
+    wa2 = fjac.colwise().blueNorm();
+    ColPivHouseholderQR<JacobianType> qrfac(fjac);
+    fjac = qrfac.matrixQR();
+    permutation = qrfac.colsPermutation();
+
+    /* on the first iteration and if external scaling is not used, scale according */
+    /* to the norms of the columns of the initial jacobian. */
+    if (iter == 1) {
+        if (!useExternalScaling)
+            for (Index j = 0; j < n; ++j)
+                diag[j] = (wa2[j]==0.)? 1. : wa2[j];
+
+        /* on the first iteration, calculate the norm of the scaled x */
+        /* and initialize the step bound delta. */
+        xnorm = diag.cwiseProduct(x).stableNorm();
+        delta = parameters.factor * xnorm;
+        if (delta == 0.)
+            delta = parameters.factor;
+    }
+
+    /* form (q transpose)*fvec and store the first n components in */
+    /* qtf. */
+    wa4 = fvec;
+    wa4.applyOnTheLeft(qrfac.householderQ().adjoint());
+    qtf = wa4.head(n);
+
+    /* compute the norm of the scaled gradient. */
+    gnorm = 0.;
+    if (fnorm != 0.)
+        for (Index j = 0; j < n; ++j)
+            if (wa2[permutation.indices()[j]] != 0.)
+                gnorm = (std::max)(gnorm, abs( fjac.col(j).head(j+1).dot(qtf.head(j+1)/fnorm) / wa2[permutation.indices()[j]]));
+
+    /* test for convergence of the gradient norm. */
+    if (gnorm <= parameters.gtol)
+        return LevenbergMarquardtSpace::CosinusTooSmall;
+
+    /* rescale if necessary. */
+    if (!useExternalScaling)
+        diag = diag.cwiseMax(wa2);
+
+    do {
+
+        /* determine the levenberg-marquardt parameter. */
+        internal::lmpar2<Scalar>(qrfac, diag, qtf, delta, par, wa1);
+
+        /* store the direction p and x + p. calculate the norm of p. */
+        wa1 = -wa1;
+        wa2 = x + wa1;
+        pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+        /* on the first iteration, adjust the initial step bound. */
+        if (iter == 1)
+            delta = (std::min)(delta,pnorm);
+
+        /* evaluate the function at x + p and calculate its norm. */
+        if ( functor(wa2, wa4) < 0)
+            return LevenbergMarquardtSpace::UserAsked;
+        ++nfev;
+        fnorm1 = wa4.stableNorm();
+
+        /* compute the scaled actual reduction. */
+        actred = -1.;
+        if (Scalar(.1) * fnorm1 < fnorm)
+            actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+        /* compute the scaled predicted reduction and */
+        /* the scaled directional derivative. */
+        wa3 = fjac.template triangularView<Upper>() * (qrfac.colsPermutation().inverse() *wa1);
+        temp1 = numext::abs2(wa3.stableNorm() / fnorm);
+        temp2 = numext::abs2(sqrt(par) * pnorm / fnorm);
+        prered = temp1 + temp2 / Scalar(.5);
+        dirder = -(temp1 + temp2);
+
+        /* compute the ratio of the actual to the predicted */
+        /* reduction. */
+        ratio = 0.;
+        if (prered != 0.)
+            ratio = actred / prered;
+
+        /* update the step bound. */
+        if (ratio <= Scalar(.25)) {
+            if (actred >= 0.)
+                temp = Scalar(.5);
+            if (actred < 0.)
+                temp = Scalar(.5) * dirder / (dirder + Scalar(.5) * actred);
+            if (Scalar(.1) * fnorm1 >= fnorm || temp < Scalar(.1))
+                temp = Scalar(.1);
+            /* Computing MIN */
+            delta = temp * (std::min)(delta, pnorm / Scalar(.1));
+            par /= temp;
+        } else if (!(par != 0. && ratio < Scalar(.75))) {
+            delta = pnorm / Scalar(.5);
+            par = Scalar(.5) * par;
+        }
+
+        /* test for successful iteration. */
+        if (ratio >= Scalar(1e-4)) {
+            /* successful iteration. update x, fvec, and their norms. */
+            x = wa2;
+            wa2 = diag.cwiseProduct(x);
+            fvec = wa4;
+            xnorm = wa2.stableNorm();
+            fnorm = fnorm1;
+            ++iter;
+        }
+
+        /* tests for convergence. */
+        if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1. && delta <= parameters.xtol * xnorm)
+            return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall;
+        if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1.)
+            return LevenbergMarquardtSpace::RelativeReductionTooSmall;
+        if (delta <= parameters.xtol * xnorm)
+            return LevenbergMarquardtSpace::RelativeErrorTooSmall;
+
+        /* tests for termination and stringent tolerances. */
+        if (nfev >= parameters.maxfev)
+            return LevenbergMarquardtSpace::TooManyFunctionEvaluation;
+        if (abs(actred) <= NumTraits<Scalar>::epsilon() && prered <= NumTraits<Scalar>::epsilon() && Scalar(.5) * ratio <= 1.)
+            return LevenbergMarquardtSpace::FtolTooSmall;
+        if (delta <= NumTraits<Scalar>::epsilon() * xnorm)
+            return LevenbergMarquardtSpace::XtolTooSmall;
+        if (gnorm <= NumTraits<Scalar>::epsilon())
+            return LevenbergMarquardtSpace::GtolTooSmall;
+
+    } while (ratio < Scalar(1e-4));
+
+    return LevenbergMarquardtSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::lmstr1(
+        FVectorType  &x,
+        const Scalar tol
+        )
+{
+    n = x.size();
+    m = functor.values();
+
+    /* check the input parameters for errors. */
+    if (n <= 0 || m < n || tol < 0.)
+        return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    resetParameters();
+    parameters.ftol = tol;
+    parameters.xtol = tol;
+    parameters.maxfev = 100*(n+1);
+
+    return minimizeOptimumStorage(x);
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorageInit(FVectorType  &x)
+{
+    n = x.size();
+    m = functor.values();
+
+    wa1.resize(n); wa2.resize(n); wa3.resize(n);
+    wa4.resize(m);
+    fvec.resize(m);
+    // Only R is stored in fjac. Q is only used to compute 'qtf', which is
+    // Q.transpose()*rhs. qtf will be updated using givens rotation,
+    // instead of storing them in Q.
+    // The purpose it to only use a nxn matrix, instead of mxn here, so
+    // that we can handle cases where m>>n :
+    fjac.resize(n, n);
+    if (!useExternalScaling)
+        diag.resize(n);
+    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+    qtf.resize(n);
+
+    /* Function Body */
+    nfev = 0;
+    njev = 0;
+
+    /*     check the input parameters for errors. */
+    if (n <= 0 || m < n || parameters.ftol < 0. || parameters.xtol < 0. || parameters.gtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0.)
+        return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    if (useExternalScaling)
+        for (Index j = 0; j < n; ++j)
+            if (diag[j] <= 0.)
+                return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    /*     evaluate the function at the starting point */
+    /*     and calculate its norm. */
+    nfev = 1;
+    if ( functor(x, fvec) < 0)
+        return LevenbergMarquardtSpace::UserAsked;
+    fnorm = fvec.stableNorm();
+
+    /*     initialize levenberg-marquardt parameter and iteration counter. */
+    par = 0.;
+    iter = 1;
+
+    return LevenbergMarquardtSpace::NotStarted;
+}
+
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorageOneStep(FVectorType  &x)
+{
+    using std::abs;
+    using std::sqrt;
+    
+    eigen_assert(x.size()==n); // check the caller is not cheating us
+
+    Index i, j;
+    bool sing;
+
+    /* compute the qr factorization of the jacobian matrix */
+    /* calculated one row at a time, while simultaneously */
+    /* forming (q transpose)*fvec and storing the first */
+    /* n components in qtf. */
+    qtf.fill(0.);
+    fjac.fill(0.);
+    Index rownb = 2;
+    for (i = 0; i < m; ++i) {
+        if (functor.df(x, wa3, rownb) < 0) return LevenbergMarquardtSpace::UserAsked;
+        internal::rwupdt<Scalar>(fjac, wa3, qtf, fvec[i]);
+        ++rownb;
+    }
+    ++njev;
+
+    /* if the jacobian is rank deficient, call qrfac to */
+    /* reorder its columns and update the components of qtf. */
+    sing = false;
+    for (j = 0; j < n; ++j) {
+        if (fjac(j,j) == 0.)
+            sing = true;
+        wa2[j] = fjac.col(j).head(j).stableNorm();
+    }
+    permutation.setIdentity(n);
+    if (sing) {
+        wa2 = fjac.colwise().blueNorm();
+        // TODO We have no unit test covering this code path, do not modify
+        // until it is carefully tested
+        ColPivHouseholderQR<JacobianType> qrfac(fjac);
+        fjac = qrfac.matrixQR();
+        wa1 = fjac.diagonal();
+        fjac.diagonal() = qrfac.hCoeffs();
+        permutation = qrfac.colsPermutation();
+        // TODO : avoid this:
+        for(Index ii=0; ii< fjac.cols(); ii++) fjac.col(ii).segment(ii+1, fjac.rows()-ii-1) *= fjac(ii,ii); // rescale vectors
+
+        for (j = 0; j < n; ++j) {
+            if (fjac(j,j) != 0.) {
+                sum = 0.;
+                for (i = j; i < n; ++i)
+                    sum += fjac(i,j) * qtf[i];
+                temp = -sum / fjac(j,j);
+                for (i = j; i < n; ++i)
+                    qtf[i] += fjac(i,j) * temp;
+            }
+            fjac(j,j) = wa1[j];
+        }
+    }
+
+    /* on the first iteration and if external scaling is not used, scale according */
+    /* to the norms of the columns of the initial jacobian. */
+    if (iter == 1) {
+        if (!useExternalScaling)
+            for (j = 0; j < n; ++j)
+                diag[j] = (wa2[j]==0.)? 1. : wa2[j];
+
+        /* on the first iteration, calculate the norm of the scaled x */
+        /* and initialize the step bound delta. */
+        xnorm = diag.cwiseProduct(x).stableNorm();
+        delta = parameters.factor * xnorm;
+        if (delta == 0.)
+            delta = parameters.factor;
+    }
+
+    /* compute the norm of the scaled gradient. */
+    gnorm = 0.;
+    if (fnorm != 0.)
+        for (j = 0; j < n; ++j)
+            if (wa2[permutation.indices()[j]] != 0.)
+                gnorm = (std::max)(gnorm, abs( fjac.col(j).head(j+1).dot(qtf.head(j+1)/fnorm) / wa2[permutation.indices()[j]]));
+
+    /* test for convergence of the gradient norm. */
+    if (gnorm <= parameters.gtol)
+        return LevenbergMarquardtSpace::CosinusTooSmall;
+
+    /* rescale if necessary. */
+    if (!useExternalScaling)
+        diag = diag.cwiseMax(wa2);
+
+    do {
+
+        /* determine the levenberg-marquardt parameter. */
+        internal::lmpar<Scalar>(fjac, permutation.indices(), diag, qtf, delta, par, wa1);
+
+        /* store the direction p and x + p. calculate the norm of p. */
+        wa1 = -wa1;
+        wa2 = x + wa1;
+        pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+        /* on the first iteration, adjust the initial step bound. */
+        if (iter == 1)
+            delta = (std::min)(delta,pnorm);
+
+        /* evaluate the function at x + p and calculate its norm. */
+        if ( functor(wa2, wa4) < 0)
+            return LevenbergMarquardtSpace::UserAsked;
+        ++nfev;
+        fnorm1 = wa4.stableNorm();
+
+        /* compute the scaled actual reduction. */
+        actred = -1.;
+        if (Scalar(.1) * fnorm1 < fnorm)
+            actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+        /* compute the scaled predicted reduction and */
+        /* the scaled directional derivative. */
+        wa3 = fjac.topLeftCorner(n,n).template triangularView<Upper>() * (permutation.inverse() * wa1);
+        temp1 = numext::abs2(wa3.stableNorm() / fnorm);
+        temp2 = numext::abs2(sqrt(par) * pnorm / fnorm);
+        prered = temp1 + temp2 / Scalar(.5);
+        dirder = -(temp1 + temp2);
+
+        /* compute the ratio of the actual to the predicted */
+        /* reduction. */
+        ratio = 0.;
+        if (prered != 0.)
+            ratio = actred / prered;
+
+        /* update the step bound. */
+        if (ratio <= Scalar(.25)) {
+            if (actred >= 0.)
+                temp = Scalar(.5);
+            if (actred < 0.)
+                temp = Scalar(.5) * dirder / (dirder + Scalar(.5) * actred);
+            if (Scalar(.1) * fnorm1 >= fnorm || temp < Scalar(.1))
+                temp = Scalar(.1);
+            /* Computing MIN */
+            delta = temp * (std::min)(delta, pnorm / Scalar(.1));
+            par /= temp;
+        } else if (!(par != 0. && ratio < Scalar(.75))) {
+            delta = pnorm / Scalar(.5);
+            par = Scalar(.5) * par;
+        }
+
+        /* test for successful iteration. */
+        if (ratio >= Scalar(1e-4)) {
+            /* successful iteration. update x, fvec, and their norms. */
+            x = wa2;
+            wa2 = diag.cwiseProduct(x);
+            fvec = wa4;
+            xnorm = wa2.stableNorm();
+            fnorm = fnorm1;
+            ++iter;
+        }
+
+        /* tests for convergence. */
+        if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1. && delta <= parameters.xtol * xnorm)
+            return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall;
+        if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1.)
+            return LevenbergMarquardtSpace::RelativeReductionTooSmall;
+        if (delta <= parameters.xtol * xnorm)
+            return LevenbergMarquardtSpace::RelativeErrorTooSmall;
+
+        /* tests for termination and stringent tolerances. */
+        if (nfev >= parameters.maxfev)
+            return LevenbergMarquardtSpace::TooManyFunctionEvaluation;
+        if (abs(actred) <= NumTraits<Scalar>::epsilon() && prered <= NumTraits<Scalar>::epsilon() && Scalar(.5) * ratio <= 1.)
+            return LevenbergMarquardtSpace::FtolTooSmall;
+        if (delta <= NumTraits<Scalar>::epsilon() * xnorm)
+            return LevenbergMarquardtSpace::XtolTooSmall;
+        if (gnorm <= NumTraits<Scalar>::epsilon())
+            return LevenbergMarquardtSpace::GtolTooSmall;
+
+    } while (ratio < Scalar(1e-4));
+
+    return LevenbergMarquardtSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorage(FVectorType  &x)
+{
+    LevenbergMarquardtSpace::Status status = minimizeOptimumStorageInit(x);
+    if (status==LevenbergMarquardtSpace::ImproperInputParameters)
+        return status;
+    do {
+        status = minimizeOptimumStorageOneStep(x);
+    } while (status==LevenbergMarquardtSpace::Running);
+    return status;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::lmdif1(
+        FunctorType &functor,
+        FVectorType  &x,
+        Index *nfev,
+        const Scalar tol
+        )
+{
+    Index n = x.size();
+    Index m = functor.values();
+
+    /* check the input parameters for errors. */
+    if (n <= 0 || m < n || tol < 0.)
+        return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    NumericalDiff<FunctorType> numDiff(functor);
+    // embedded LevenbergMarquardt
+    LevenbergMarquardt<NumericalDiff<FunctorType>, Scalar > lm(numDiff);
+    lm.parameters.ftol = tol;
+    lm.parameters.xtol = tol;
+    lm.parameters.maxfev = 200*(n+1);
+
+    LevenbergMarquardtSpace::Status info = LevenbergMarquardtSpace::Status(lm.minimize(x));
+    if (nfev)
+        * nfev = lm.nfev;
+    return info;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_LEVENBERGMARQUARDT__H
+
+//vim: ai ts=4 sts=4 et sw=4
diff --git a/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/chkder.h b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/chkder.h
new file mode 100644
index 0000000..db8ff7d
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/chkder.h
@@ -0,0 +1,66 @@
+#define chkder_log10e 0.43429448190325182765
+#define chkder_factor 100.
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Scalar>
+void chkder(
+        const Matrix< Scalar, Dynamic, 1 >  &x,
+        const Matrix< Scalar, Dynamic, 1 >  &fvec,
+        const Matrix< Scalar, Dynamic, Dynamic > &fjac,
+        Matrix< Scalar, Dynamic, 1 >  &xp,
+        const Matrix< Scalar, Dynamic, 1 >  &fvecp,
+        int mode,
+        Matrix< Scalar, Dynamic, 1 >  &err
+        )
+{
+    using std::sqrt;
+    using std::abs;
+    using std::log;
+    
+    typedef DenseIndex Index;
+
+    const Scalar eps = sqrt(NumTraits<Scalar>::epsilon());
+    const Scalar epsf = chkder_factor * NumTraits<Scalar>::epsilon();
+    const Scalar epslog = chkder_log10e * log(eps);
+    Scalar temp;
+
+    const Index m = fvec.size(), n = x.size();
+
+    if (mode != 2) {
+        /* mode = 1. */
+        xp.resize(n);
+        for (Index j = 0; j < n; ++j) {
+            temp = eps * abs(x[j]);
+            if (temp == 0.)
+                temp = eps;
+            xp[j] = x[j] + temp;
+        }
+    }
+    else {
+        /* mode = 2. */
+        err.setZero(m); 
+        for (Index j = 0; j < n; ++j) {
+            temp = abs(x[j]);
+            if (temp == 0.)
+                temp = 1.;
+            err += temp * fjac.col(j);
+        }
+        for (Index i = 0; i < m; ++i) {
+            temp = 1.;
+            if (fvec[i] != 0. && fvecp[i] != 0. && abs(fvecp[i] - fvec[i]) >= epsf * abs(fvec[i]))
+                temp = eps * abs((fvecp[i] - fvec[i]) / eps - err[i]) / (abs(fvec[i]) + abs(fvecp[i]));
+            err[i] = 1.;
+            if (temp > NumTraits<Scalar>::epsilon() && temp < eps)
+                err[i] = (chkder_log10e * log(temp) - epslog) / epslog;
+            if (temp >= eps)
+                err[i] = 0.;
+        }
+    }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/covar.h b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/covar.h
new file mode 100644
index 0000000..68260d1
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/covar.h
@@ -0,0 +1,70 @@
+namespace Eigen { 
+
+namespace internal {
+
+template <typename Scalar>
+void covar(
+        Matrix< Scalar, Dynamic, Dynamic > &r,
+        const VectorXi &ipvt,
+        Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon()) )
+{
+    using std::abs;
+    typedef DenseIndex Index;
+
+    /* Local variables */
+    Index i, j, k, l, ii, jj;
+    bool sing;
+    Scalar temp;
+
+    /* Function Body */
+    const Index n = r.cols();
+    const Scalar tolr = tol * abs(r(0,0));
+    Matrix< Scalar, Dynamic, 1 > wa(n);
+    eigen_assert(ipvt.size()==n);
+
+    /* form the inverse of r in the full upper triangle of r. */
+    l = -1;
+    for (k = 0; k < n; ++k)
+        if (abs(r(k,k)) > tolr) {
+            r(k,k) = 1. / r(k,k);
+            for (j = 0; j <= k-1; ++j) {
+                temp = r(k,k) * r(j,k);
+                r(j,k) = 0.;
+                r.col(k).head(j+1) -= r.col(j).head(j+1) * temp;
+            }
+            l = k;
+        }
+
+    /* form the full upper triangle of the inverse of (r transpose)*r */
+    /* in the full upper triangle of r. */
+    for (k = 0; k <= l; ++k) {
+        for (j = 0; j <= k-1; ++j)
+            r.col(j).head(j+1) += r.col(k).head(j+1) * r(j,k);
+        r.col(k).head(k+1) *= r(k,k);
+    }
+
+    /* form the full lower triangle of the covariance matrix */
+    /* in the strict lower triangle of r and in wa. */
+    for (j = 0; j < n; ++j) {
+        jj = ipvt[j];
+        sing = j > l;
+        for (i = 0; i <= j; ++i) {
+            if (sing)
+                r(i,j) = 0.;
+            ii = ipvt[i];
+            if (ii > jj)
+                r(ii,jj) = r(i,j);
+            if (ii < jj)
+                r(jj,ii) = r(i,j);
+        }
+        wa[jj] = r(j,j);
+    }
+
+    /* symmetrize the covariance matrix in r. */
+    r.topLeftCorner(n,n).template triangularView<StrictlyUpper>() = r.topLeftCorner(n,n).transpose();
+    r.diagonal() = wa;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/dogleg.h b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/dogleg.h
new file mode 100644
index 0000000..80c5d27
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/dogleg.h
@@ -0,0 +1,107 @@
+namespace Eigen { 
+
+namespace internal {
+
+template <typename Scalar>
+void dogleg(
+        const Matrix< Scalar, Dynamic, Dynamic >  &qrfac,
+        const Matrix< Scalar, Dynamic, 1 >  &diag,
+        const Matrix< Scalar, Dynamic, 1 >  &qtb,
+        Scalar delta,
+        Matrix< Scalar, Dynamic, 1 >  &x)
+{
+    using std::abs;
+    using std::sqrt;
+    
+    typedef DenseIndex Index;
+
+    /* Local variables */
+    Index i, j;
+    Scalar sum, temp, alpha, bnorm;
+    Scalar gnorm, qnorm;
+    Scalar sgnorm;
+
+    /* Function Body */
+    const Scalar epsmch = NumTraits<Scalar>::epsilon();
+    const Index n = qrfac.cols();
+    eigen_assert(n==qtb.size());
+    eigen_assert(n==x.size());
+    eigen_assert(n==diag.size());
+    Matrix< Scalar, Dynamic, 1 >  wa1(n), wa2(n);
+
+    /* first, calculate the gauss-newton direction. */
+    for (j = n-1; j >=0; --j) {
+        temp = qrfac(j,j);
+        if (temp == 0.) {
+            temp = epsmch * qrfac.col(j).head(j+1).maxCoeff();
+            if (temp == 0.)
+                temp = epsmch;
+        }
+        if (j==n-1)
+            x[j] = qtb[j] / temp;
+        else
+            x[j] = (qtb[j] - qrfac.row(j).tail(n-j-1).dot(x.tail(n-j-1))) / temp;
+    }
+
+    /* test whether the gauss-newton direction is acceptable. */
+    qnorm = diag.cwiseProduct(x).stableNorm();
+    if (qnorm <= delta)
+        return;
+
+    // TODO : this path is not tested by Eigen unit tests
+
+    /* the gauss-newton direction is not acceptable. */
+    /* next, calculate the scaled gradient direction. */
+
+    wa1.fill(0.);
+    for (j = 0; j < n; ++j) {
+        wa1.tail(n-j) += qrfac.row(j).tail(n-j) * qtb[j];
+        wa1[j] /= diag[j];
+    }
+
+    /* calculate the norm of the scaled gradient and test for */
+    /* the special case in which the scaled gradient is zero. */
+    gnorm = wa1.stableNorm();
+    sgnorm = 0.;
+    alpha = delta / qnorm;
+    if (gnorm == 0.)
+        goto algo_end;
+
+    /* calculate the point along the scaled gradient */
+    /* at which the quadratic is minimized. */
+    wa1.array() /= (diag*gnorm).array();
+    // TODO : once unit tests cover this part,:
+    // wa2 = qrfac.template triangularView<Upper>() * wa1;
+    for (j = 0; j < n; ++j) {
+        sum = 0.;
+        for (i = j; i < n; ++i) {
+            sum += qrfac(j,i) * wa1[i];
+        }
+        wa2[j] = sum;
+    }
+    temp = wa2.stableNorm();
+    sgnorm = gnorm / temp / temp;
+
+    /* test whether the scaled gradient direction is acceptable. */
+    alpha = 0.;
+    if (sgnorm >= delta)
+        goto algo_end;
+
+    /* the scaled gradient direction is not acceptable. */
+    /* finally, calculate the point along the dogleg */
+    /* at which the quadratic is minimized. */
+    bnorm = qtb.stableNorm();
+    temp = bnorm / gnorm * (bnorm / qnorm) * (sgnorm / delta);
+    temp = temp - delta / qnorm * numext::abs2(sgnorm / delta) + sqrt(numext::abs2(temp - delta / qnorm) + (1.-numext::abs2(delta / qnorm)) * (1.-numext::abs2(sgnorm / delta)));
+    alpha = delta / qnorm * (1. - numext::abs2(sgnorm / delta)) / temp;
+algo_end:
+
+    /* form appropriate convex combination of the gauss-newton */
+    /* direction and the scaled gradient direction. */
+    temp = (1.-alpha) * (std::min)(sgnorm,delta);
+    x = temp * wa1 + alpha * x;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h
new file mode 100644
index 0000000..bb7cf26
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h
@@ -0,0 +1,79 @@
+namespace Eigen { 
+
+namespace internal {
+
+template<typename FunctorType, typename Scalar>
+DenseIndex fdjac1(
+        const FunctorType &Functor,
+        Matrix< Scalar, Dynamic, 1 >  &x,
+        Matrix< Scalar, Dynamic, 1 >  &fvec,
+        Matrix< Scalar, Dynamic, Dynamic > &fjac,
+        DenseIndex ml, DenseIndex mu,
+        Scalar epsfcn)
+{
+    using std::sqrt;
+    using std::abs;
+    
+    typedef DenseIndex Index;
+
+    /* Local variables */
+    Scalar h;
+    Index j, k;
+    Scalar eps, temp;
+    Index msum;
+    int iflag;
+    Index start, length;
+
+    /* Function Body */
+    const Scalar epsmch = NumTraits<Scalar>::epsilon();
+    const Index n = x.size();
+    eigen_assert(fvec.size()==n);
+    Matrix< Scalar, Dynamic, 1 >  wa1(n);
+    Matrix< Scalar, Dynamic, 1 >  wa2(n);
+
+    eps = sqrt((std::max)(epsfcn,epsmch));
+    msum = ml + mu + 1;
+    if (msum >= n) {
+        /* computation of dense approximate jacobian. */
+        for (j = 0; j < n; ++j) {
+            temp = x[j];
+            h = eps * abs(temp);
+            if (h == 0.)
+                h = eps;
+            x[j] = temp + h;
+            iflag = Functor(x, wa1);
+            if (iflag < 0)
+                return iflag;
+            x[j] = temp;
+            fjac.col(j) = (wa1-fvec)/h;
+        }
+
+    }else {
+        /* computation of banded approximate jacobian. */
+        for (k = 0; k < msum; ++k) {
+            for (j = k; (msum<0) ? (j>n): (j<n); j += msum) {
+                wa2[j] = x[j];
+                h = eps * abs(wa2[j]);
+                if (h == 0.) h = eps;
+                x[j] = wa2[j] + h;
+            }
+            iflag = Functor(x, wa1);
+            if (iflag < 0)
+                return iflag;
+            for (j = k; (msum<0) ? (j>n): (j<n); j += msum) {
+                x[j] = wa2[j];
+                h = eps * abs(wa2[j]);
+                if (h == 0.) h = eps;
+                fjac.col(j).setZero();
+                start = std::max<Index>(0,j-mu);
+                length = (std::min)(n-1, j+ml) - start + 1;
+                fjac.col(j).segment(start, length) = ( wa1.segment(start, length)-fvec.segment(start, length))/h;
+            }
+        }
+    }
+    return 0;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/lmpar.h b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/lmpar.h
new file mode 100644
index 0000000..4c17d4c
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/lmpar.h
@@ -0,0 +1,298 @@
+namespace Eigen { 
+
+namespace internal {
+
+template <typename Scalar>
+void lmpar(
+        Matrix< Scalar, Dynamic, Dynamic > &r,
+        const VectorXi &ipvt,
+        const Matrix< Scalar, Dynamic, 1 >  &diag,
+        const Matrix< Scalar, Dynamic, 1 >  &qtb,
+        Scalar delta,
+        Scalar &par,
+        Matrix< Scalar, Dynamic, 1 >  &x)
+{
+    using std::abs;
+    using std::sqrt;
+    typedef DenseIndex Index;
+
+    /* Local variables */
+    Index i, j, l;
+    Scalar fp;
+    Scalar parc, parl;
+    Index iter;
+    Scalar temp, paru;
+    Scalar gnorm;
+    Scalar dxnorm;
+
+
+    /* Function Body */
+    const Scalar dwarf = (std::numeric_limits<Scalar>::min)();
+    const Index n = r.cols();
+    eigen_assert(n==diag.size());
+    eigen_assert(n==qtb.size());
+    eigen_assert(n==x.size());
+
+    Matrix< Scalar, Dynamic, 1 >  wa1, wa2;
+
+    /* compute and store in x the gauss-newton direction. if the */
+    /* jacobian is rank-deficient, obtain a least squares solution. */
+    Index nsing = n-1;
+    wa1 = qtb;
+    for (j = 0; j < n; ++j) {
+        if (r(j,j) == 0. && nsing == n-1)
+            nsing = j - 1;
+        if (nsing < n-1)
+            wa1[j] = 0.;
+    }
+    for (j = nsing; j>=0; --j) {
+        wa1[j] /= r(j,j);
+        temp = wa1[j];
+        for (i = 0; i < j ; ++i)
+            wa1[i] -= r(i,j) * temp;
+    }
+
+    for (j = 0; j < n; ++j)
+        x[ipvt[j]] = wa1[j];
+
+    /* initialize the iteration counter. */
+    /* evaluate the function at the origin, and test */
+    /* for acceptance of the gauss-newton direction. */
+    iter = 0;
+    wa2 = diag.cwiseProduct(x);
+    dxnorm = wa2.blueNorm();
+    fp = dxnorm - delta;
+    if (fp <= Scalar(0.1) * delta) {
+        par = 0;
+        return;
+    }
+
+    /* if the jacobian is not rank deficient, the newton */
+    /* step provides a lower bound, parl, for the zero of */
+    /* the function. otherwise set this bound to zero. */
+    parl = 0.;
+    if (nsing >= n-1) {
+        for (j = 0; j < n; ++j) {
+            l = ipvt[j];
+            wa1[j] = diag[l] * (wa2[l] / dxnorm);
+        }
+        // it's actually a triangularView.solveInplace(), though in a weird
+        // way:
+        for (j = 0; j < n; ++j) {
+            Scalar sum = 0.;
+            for (i = 0; i < j; ++i)
+                sum += r(i,j) * wa1[i];
+            wa1[j] = (wa1[j] - sum) / r(j,j);
+        }
+        temp = wa1.blueNorm();
+        parl = fp / delta / temp / temp;
+    }
+
+    /* calculate an upper bound, paru, for the zero of the function. */
+    for (j = 0; j < n; ++j)
+        wa1[j] = r.col(j).head(j+1).dot(qtb.head(j+1)) / diag[ipvt[j]];
+
+    gnorm = wa1.stableNorm();
+    paru = gnorm / delta;
+    if (paru == 0.)
+        paru = dwarf / (std::min)(delta,Scalar(0.1));
+
+    /* if the input par lies outside of the interval (parl,paru), */
+    /* set par to the closer endpoint. */
+    par = (std::max)(par,parl);
+    par = (std::min)(par,paru);
+    if (par == 0.)
+        par = gnorm / dxnorm;
+
+    /* beginning of an iteration. */
+    while (true) {
+        ++iter;
+
+        /* evaluate the function at the current value of par. */
+        if (par == 0.)
+            par = (std::max)(dwarf,Scalar(.001) * paru); /* Computing MAX */
+        wa1 = sqrt(par)* diag;
+
+        Matrix< Scalar, Dynamic, 1 > sdiag(n);
+        qrsolv<Scalar>(r, ipvt, wa1, qtb, x, sdiag);
+
+        wa2 = diag.cwiseProduct(x);
+        dxnorm = wa2.blueNorm();
+        temp = fp;
+        fp = dxnorm - delta;
+
+        /* if the function is small enough, accept the current value */
+        /* of par. also test for the exceptional cases where parl */
+        /* is zero or the number of iterations has reached 10. */
+        if (abs(fp) <= Scalar(0.1) * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10)
+            break;
+
+        /* compute the newton correction. */
+        for (j = 0; j < n; ++j) {
+            l = ipvt[j];
+            wa1[j] = diag[l] * (wa2[l] / dxnorm);
+        }
+        for (j = 0; j < n; ++j) {
+            wa1[j] /= sdiag[j];
+            temp = wa1[j];
+            for (i = j+1; i < n; ++i)
+                wa1[i] -= r(i,j) * temp;
+        }
+        temp = wa1.blueNorm();
+        parc = fp / delta / temp / temp;
+
+        /* depending on the sign of the function, update parl or paru. */
+        if (fp > 0.)
+            parl = (std::max)(parl,par);
+        if (fp < 0.)
+            paru = (std::min)(paru,par);
+
+        /* compute an improved estimate for par. */
+        /* Computing MAX */
+        par = (std::max)(parl,par+parc);
+
+        /* end of an iteration. */
+    }
+
+    /* termination. */
+    if (iter == 0)
+        par = 0.;
+    return;
+}
+
+template <typename Scalar>
+void lmpar2(
+        const ColPivHouseholderQR<Matrix< Scalar, Dynamic, Dynamic> > &qr,
+        const Matrix< Scalar, Dynamic, 1 >  &diag,
+        const Matrix< Scalar, Dynamic, 1 >  &qtb,
+        Scalar delta,
+        Scalar &par,
+        Matrix< Scalar, Dynamic, 1 >  &x)
+
+{
+    using std::sqrt;
+    using std::abs;
+    typedef DenseIndex Index;
+
+    /* Local variables */
+    Index j;
+    Scalar fp;
+    Scalar parc, parl;
+    Index iter;
+    Scalar temp, paru;
+    Scalar gnorm;
+    Scalar dxnorm;
+
+
+    /* Function Body */
+    const Scalar dwarf = (std::numeric_limits<Scalar>::min)();
+    const Index n = qr.matrixQR().cols();
+    eigen_assert(n==diag.size());
+    eigen_assert(n==qtb.size());
+
+    Matrix< Scalar, Dynamic, 1 >  wa1, wa2;
+
+    /* compute and store in x the gauss-newton direction. if the */
+    /* jacobian is rank-deficient, obtain a least squares solution. */
+
+//    const Index rank = qr.nonzeroPivots(); // exactly double(0.)
+    const Index rank = qr.rank(); // use a threshold
+    wa1 = qtb;
+    wa1.tail(n-rank).setZero();
+    qr.matrixQR().topLeftCorner(rank, rank).template triangularView<Upper>().solveInPlace(wa1.head(rank));
+
+    x = qr.colsPermutation()*wa1;
+
+    /* initialize the iteration counter. */
+    /* evaluate the function at the origin, and test */
+    /* for acceptance of the gauss-newton direction. */
+    iter = 0;
+    wa2 = diag.cwiseProduct(x);
+    dxnorm = wa2.blueNorm();
+    fp = dxnorm - delta;
+    if (fp <= Scalar(0.1) * delta) {
+        par = 0;
+        return;
+    }
+
+    /* if the jacobian is not rank deficient, the newton */
+    /* step provides a lower bound, parl, for the zero of */
+    /* the function. otherwise set this bound to zero. */
+    parl = 0.;
+    if (rank==n) {
+        wa1 = qr.colsPermutation().inverse() *  diag.cwiseProduct(wa2)/dxnorm;
+        qr.matrixQR().topLeftCorner(n, n).transpose().template triangularView<Lower>().solveInPlace(wa1);
+        temp = wa1.blueNorm();
+        parl = fp / delta / temp / temp;
+    }
+
+    /* calculate an upper bound, paru, for the zero of the function. */
+    for (j = 0; j < n; ++j)
+        wa1[j] = qr.matrixQR().col(j).head(j+1).dot(qtb.head(j+1)) / diag[qr.colsPermutation().indices()(j)];
+
+    gnorm = wa1.stableNorm();
+    paru = gnorm / delta;
+    if (paru == 0.)
+        paru = dwarf / (std::min)(delta,Scalar(0.1));
+
+    /* if the input par lies outside of the interval (parl,paru), */
+    /* set par to the closer endpoint. */
+    par = (std::max)(par,parl);
+    par = (std::min)(par,paru);
+    if (par == 0.)
+        par = gnorm / dxnorm;
+
+    /* beginning of an iteration. */
+    Matrix< Scalar, Dynamic, Dynamic > s = qr.matrixQR();
+    while (true) {
+        ++iter;
+
+        /* evaluate the function at the current value of par. */
+        if (par == 0.)
+            par = (std::max)(dwarf,Scalar(.001) * paru); /* Computing MAX */
+        wa1 = sqrt(par)* diag;
+
+        Matrix< Scalar, Dynamic, 1 > sdiag(n);
+        qrsolv<Scalar>(s, qr.colsPermutation().indices(), wa1, qtb, x, sdiag);
+
+        wa2 = diag.cwiseProduct(x);
+        dxnorm = wa2.blueNorm();
+        temp = fp;
+        fp = dxnorm - delta;
+
+        /* if the function is small enough, accept the current value */
+        /* of par. also test for the exceptional cases where parl */
+        /* is zero or the number of iterations has reached 10. */
+        if (abs(fp) <= Scalar(0.1) * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10)
+            break;
+
+        /* compute the newton correction. */
+        wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2/dxnorm);
+        // we could almost use this here, but the diagonal is outside qr, in sdiag[]
+        // qr.matrixQR().topLeftCorner(n, n).transpose().template triangularView<Lower>().solveInPlace(wa1);
+        for (j = 0; j < n; ++j) {
+            wa1[j] /= sdiag[j];
+            temp = wa1[j];
+            for (Index i = j+1; i < n; ++i)
+                wa1[i] -= s(i,j) * temp;
+        }
+        temp = wa1.blueNorm();
+        parc = fp / delta / temp / temp;
+
+        /* depending on the sign of the function, update parl or paru. */
+        if (fp > 0.)
+            parl = (std::max)(parl,par);
+        if (fp < 0.)
+            paru = (std::min)(paru,par);
+
+        /* compute an improved estimate for par. */
+        par = (std::max)(parl,par+parc);
+    }
+    if (iter == 0)
+        par = 0.;
+    return;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
new file mode 100644
index 0000000..feafd62
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
@@ -0,0 +1,91 @@
+namespace Eigen { 
+
+namespace internal {
+
+// TODO : once qrsolv2 is removed, use ColPivHouseholderQR or PermutationMatrix instead of ipvt
+template <typename Scalar>
+void qrsolv(
+        Matrix< Scalar, Dynamic, Dynamic > &s,
+        // TODO : use a PermutationMatrix once lmpar is no more:
+        const VectorXi &ipvt,
+        const Matrix< Scalar, Dynamic, 1 >  &diag,
+        const Matrix< Scalar, Dynamic, 1 >  &qtb,
+        Matrix< Scalar, Dynamic, 1 >  &x,
+        Matrix< Scalar, Dynamic, 1 >  &sdiag)
+
+{
+    typedef DenseIndex Index;
+
+    /* Local variables */
+    Index i, j, k, l;
+    Scalar temp;
+    Index n = s.cols();
+    Matrix< Scalar, Dynamic, 1 >  wa(n);
+    JacobiRotation<Scalar> givens;
+
+    /* Function Body */
+    // the following will only change the lower triangular part of s, including
+    // the diagonal, though the diagonal is restored afterward
+
+    /*     copy r and (q transpose)*b to preserve input and initialize s. */
+    /*     in particular, save the diagonal elements of r in x. */
+    x = s.diagonal();
+    wa = qtb;
+
+    s.topLeftCorner(n,n).template triangularView<StrictlyLower>() = s.topLeftCorner(n,n).transpose();
+
+    /*     eliminate the diagonal matrix d using a givens rotation. */
+    for (j = 0; j < n; ++j) {
+
+        /*        prepare the row of d to be eliminated, locating the */
+        /*        diagonal element using p from the qr factorization. */
+        l = ipvt[j];
+        if (diag[l] == 0.)
+            break;
+        sdiag.tail(n-j).setZero();
+        sdiag[j] = diag[l];
+
+        /*        the transformations to eliminate the row of d */
+        /*        modify only a single element of (q transpose)*b */
+        /*        beyond the first n, which is initially zero. */
+        Scalar qtbpj = 0.;
+        for (k = j; k < n; ++k) {
+            /*           determine a givens rotation which eliminates the */
+            /*           appropriate element in the current row of d. */
+            givens.makeGivens(-s(k,k), sdiag[k]);
+
+            /*           compute the modified diagonal element of r and */
+            /*           the modified element of ((q transpose)*b,0). */
+            s(k,k) = givens.c() * s(k,k) + givens.s() * sdiag[k];
+            temp = givens.c() * wa[k] + givens.s() * qtbpj;
+            qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
+            wa[k] = temp;
+
+            /*           accumulate the tranformation in the row of s. */
+            for (i = k+1; i<n; ++i) {
+                temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
+                sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
+                s(i,k) = temp;
+            }
+        }
+    }
+
+    /*     solve the triangular system for z. if the system is */
+    /*     singular, then obtain a least squares solution. */
+    Index nsing;
+    for(nsing=0; nsing<n && sdiag[nsing]!=0; nsing++) {}
+
+    wa.tail(n-nsing).setZero();
+    s.topLeftCorner(nsing, nsing).transpose().template triangularView<Upper>().solveInPlace(wa.head(nsing));
+
+    // restore
+    sdiag = s.diagonal();
+    s.diagonal() = x;
+
+    /*     permute the components of z back to components of x. */
+    for (j = 0; j < n; ++j) x[ipvt[j]] = wa[j];
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h
new file mode 100644
index 0000000..36ff700
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h
@@ -0,0 +1,30 @@
+namespace Eigen { 
+
+namespace internal {
+
+// TODO : move this to GivensQR once there's such a thing in Eigen
+
+template <typename Scalar>
+void r1mpyq(DenseIndex m, DenseIndex n, Scalar *a, const std::vector<JacobiRotation<Scalar> > &v_givens, const std::vector<JacobiRotation<Scalar> > &w_givens)
+{
+    typedef DenseIndex Index;
+
+    /*     apply the first set of givens rotations to a. */
+    for (Index j = n-2; j>=0; --j)
+        for (Index i = 0; i<m; ++i) {
+            Scalar temp = v_givens[j].c() * a[i+m*j] - v_givens[j].s() * a[i+m*(n-1)];
+            a[i+m*(n-1)] = v_givens[j].s() * a[i+m*j] + v_givens[j].c() * a[i+m*(n-1)];
+            a[i+m*j] = temp;
+        }
+    /*     apply the second set of givens rotations to a. */
+    for (Index j = 0; j<n-1; ++j)
+        for (Index i = 0; i<m; ++i) {
+            Scalar temp = w_givens[j].c() * a[i+m*j] + w_givens[j].s() * a[i+m*(n-1)];
+            a[i+m*(n-1)] = -w_givens[j].s() * a[i+m*j] + w_givens[j].c() * a[i+m*(n-1)];
+            a[i+m*j] = temp;
+        }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/r1updt.h b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
new file mode 100644
index 0000000..f287660
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
@@ -0,0 +1,99 @@
+namespace Eigen { 
+
+namespace internal {
+
+template <typename Scalar>
+void r1updt(
+        Matrix< Scalar, Dynamic, Dynamic > &s,
+        const Matrix< Scalar, Dynamic, 1> &u,
+        std::vector<JacobiRotation<Scalar> > &v_givens,
+        std::vector<JacobiRotation<Scalar> > &w_givens,
+        Matrix< Scalar, Dynamic, 1> &v,
+        Matrix< Scalar, Dynamic, 1> &w,
+        bool *sing)
+{
+    typedef DenseIndex Index;
+    const JacobiRotation<Scalar> IdentityRotation = JacobiRotation<Scalar>(1,0);
+
+    /* Local variables */
+    const Index m = s.rows();
+    const Index n = s.cols();
+    Index i, j=1;
+    Scalar temp;
+    JacobiRotation<Scalar> givens;
+
+    // r1updt had a broader usecase, but we dont use it here. And, more
+    // importantly, we can not test it.
+    eigen_assert(m==n);
+    eigen_assert(u.size()==m);
+    eigen_assert(v.size()==n);
+    eigen_assert(w.size()==n);
+
+    /* move the nontrivial part of the last column of s into w. */
+    w[n-1] = s(n-1,n-1);
+
+    /* rotate the vector v into a multiple of the n-th unit vector */
+    /* in such a way that a spike is introduced into w. */
+    for (j=n-2; j>=0; --j) {
+        w[j] = 0.;
+        if (v[j] != 0.) {
+            /* determine a givens rotation which eliminates the */
+            /* j-th element of v. */
+            givens.makeGivens(-v[n-1], v[j]);
+
+            /* apply the transformation to v and store the information */
+            /* necessary to recover the givens rotation. */
+            v[n-1] = givens.s() * v[j] + givens.c() * v[n-1];
+            v_givens[j] = givens;
+
+            /* apply the transformation to s and extend the spike in w. */
+            for (i = j; i < m; ++i) {
+                temp = givens.c() * s(j,i) - givens.s() * w[i];
+                w[i] = givens.s() * s(j,i) + givens.c() * w[i];
+                s(j,i) = temp;
+            }
+        } else
+            v_givens[j] = IdentityRotation;
+    }
+
+    /* add the spike from the rank 1 update to w. */
+    w += v[n-1] * u;
+
+    /* eliminate the spike. */
+    *sing = false;
+    for (j = 0; j < n-1; ++j) {
+        if (w[j] != 0.) {
+            /* determine a givens rotation which eliminates the */
+            /* j-th element of the spike. */
+            givens.makeGivens(-s(j,j), w[j]);
+
+            /* apply the transformation to s and reduce the spike in w. */
+            for (i = j; i < m; ++i) {
+                temp = givens.c() * s(j,i) + givens.s() * w[i];
+                w[i] = -givens.s() * s(j,i) + givens.c() * w[i];
+                s(j,i) = temp;
+            }
+
+            /* store the information necessary to recover the */
+            /* givens rotation. */
+            w_givens[j] = givens;
+        } else
+            v_givens[j] = IdentityRotation;
+
+        /* test for zero diagonal elements in the output s. */
+        if (s(j,j) == 0.) {
+            *sing = true;
+        }
+    }
+    /* move w back into the last column of the output s. */
+    s(n-1,n-1) = w[n-1];
+
+    if (s(j,j) == 0.) {
+        *sing = true;
+    }
+    return;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h
new file mode 100644
index 0000000..6ebf856
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h
@@ -0,0 +1,49 @@
+namespace Eigen { 
+
+namespace internal {
+
+template <typename Scalar>
+void rwupdt(
+        Matrix< Scalar, Dynamic, Dynamic >  &r,
+        const Matrix< Scalar, Dynamic, 1>  &w,
+        Matrix< Scalar, Dynamic, 1>  &b,
+        Scalar alpha)
+{
+    typedef DenseIndex Index;
+
+    const Index n = r.cols();
+    eigen_assert(r.rows()>=n);
+    std::vector<JacobiRotation<Scalar> > givens(n);
+
+    /* Local variables */
+    Scalar temp, rowj;
+
+    /* Function Body */
+    for (Index j = 0; j < n; ++j) {
+        rowj = w[j];
+
+        /* apply the previous transformations to */
+        /* r(i,j), i=0,1,...,j-1, and to w(j). */
+        for (Index i = 0; i < j; ++i) {
+            temp = givens[i].c() * r(i,j) + givens[i].s() * rowj;
+            rowj = -givens[i].s() * r(i,j) + givens[i].c() * rowj;
+            r(i,j) = temp;
+        }
+
+        /* determine a givens rotation which eliminates w(j). */
+        givens[j].makeGivens(-r(j,j), rowj);
+
+        if (rowj == 0.)
+            continue; // givens[j] is identity
+
+        /* apply the current transformation to r(j,j), b(j), and alpha. */
+        r(j,j) = givens[j].c() * r(j,j) + givens[j].s() * rowj;
+        temp = givens[j].c() * b[j] + givens[j].s() * alpha;
+        alpha = -givens[j].s() * b[j] + givens[j].c() * alpha;
+        b[j] = temp;
+    }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/include/eigen3/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h b/include/eigen3/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h
new file mode 100644
index 0000000..ea5d8bc
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h
@@ -0,0 +1,130 @@
+// -*- coding: utf-8
+// vim: set fileencoding=utf-8
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel at freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NUMERICAL_DIFF_H
+#define EIGEN_NUMERICAL_DIFF_H
+
+namespace Eigen { 
+
+enum NumericalDiffMode {
+    Forward,
+    Central
+};
+
+
+/**
+  * This class allows you to add a method df() to your functor, which will 
+  * use numerical differentiation to compute an approximate of the
+  * derivative for the functor. Of course, if you have an analytical form
+  * for the derivative, you should rather implement df() by yourself.
+  *
+  * More information on
+  * http://en.wikipedia.org/wiki/Numerical_differentiation
+  *
+  * Currently only "Forward" and "Central" scheme are implemented.
+  */
+template<typename _Functor, NumericalDiffMode mode=Forward>
+class NumericalDiff : public _Functor
+{
+public:
+    typedef _Functor Functor;
+    typedef typename Functor::Scalar Scalar;
+    typedef typename Functor::InputType InputType;
+    typedef typename Functor::ValueType ValueType;
+    typedef typename Functor::JacobianType JacobianType;
+
+    NumericalDiff(Scalar _epsfcn=0.) : Functor(), epsfcn(_epsfcn) {}
+    NumericalDiff(const Functor& f, Scalar _epsfcn=0.) : Functor(f), epsfcn(_epsfcn) {}
+
+    // forward constructors
+    template<typename T0>
+        NumericalDiff(const T0& a0) : Functor(a0), epsfcn(0) {}
+    template<typename T0, typename T1>
+        NumericalDiff(const T0& a0, const T1& a1) : Functor(a0, a1), epsfcn(0) {}
+    template<typename T0, typename T1, typename T2>
+        NumericalDiff(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2), epsfcn(0) {}
+
+    enum {
+        InputsAtCompileTime = Functor::InputsAtCompileTime,
+        ValuesAtCompileTime = Functor::ValuesAtCompileTime
+    };
+
+    /**
+      * return the number of evaluation of functor
+     */
+    int df(const InputType& _x, JacobianType &jac) const
+    {
+        using std::sqrt;
+        using std::abs;
+        /* Local variables */
+        Scalar h;
+        int nfev=0;
+        const typename InputType::Index n = _x.size();
+        const Scalar eps = sqrt(((std::max)(epsfcn,NumTraits<Scalar>::epsilon() )));
+        ValueType val1, val2;
+        InputType x = _x;
+        // TODO : we should do this only if the size is not already known
+        val1.resize(Functor::values());
+        val2.resize(Functor::values());
+
+        // initialization
+        switch(mode) {
+            case Forward:
+                // compute f(x)
+                Functor::operator()(x, val1); nfev++;
+                break;
+            case Central:
+                // do nothing
+                break;
+            default:
+                eigen_assert(false);
+        };
+
+        // Function Body
+        for (int j = 0; j < n; ++j) {
+            h = eps * abs(x[j]);
+            if (h == 0.) {
+                h = eps;
+            }
+            switch(mode) {
+                case Forward:
+                    x[j] += h;
+                    Functor::operator()(x, val2);
+                    nfev++;
+                    x[j] = _x[j];
+                    jac.col(j) = (val2-val1)/h;
+                    break;
+                case Central:
+                    x[j] += h;
+                    Functor::operator()(x, val2); nfev++;
+                    x[j] -= 2*h;
+                    Functor::operator()(x, val1); nfev++;
+                    x[j] = _x[j];
+                    jac.col(j) = (val2-val1)/(2*h);
+                    break;
+                default:
+                    eigen_assert(false);
+            };
+        }
+        return nfev;
+    }
+private:
+    Scalar epsfcn;
+
+    NumericalDiff& operator=(const NumericalDiff&);
+};
+
+} // end namespace Eigen
+
+//vim: ai ts=4 sts=4 et sw=4
+#endif // EIGEN_NUMERICAL_DIFF_H
+
diff --git a/include/eigen3/unsupported/Eigen/src/Polynomials/Companion.h b/include/eigen3/unsupported/Eigen/src/Polynomials/Companion.h
new file mode 100644
index 0000000..b515c29
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/Polynomials/Companion.h
@@ -0,0 +1,276 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Manuel Yguel <manuel.yguel at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPANION_H
+#define EIGEN_COMPANION_H
+
+// This file requires the user to include
+// * Eigen/Core
+// * Eigen/src/PolynomialSolver.h
+
+namespace Eigen { 
+
+namespace internal {
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+template <typename T>
+T radix(){ return 2; }
+
+template <typename T>
+T radix2(){ return radix<T>()*radix<T>(); }
+
+template<int Size>
+struct decrement_if_fixed_size
+{
+  enum {
+    ret = (Size == Dynamic) ? Dynamic : Size-1 };
+};
+
+#endif
+
+template< typename _Scalar, int _Deg >
+class companion
+{
+  public:
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? Dynamic : _Deg)
+
+    enum {
+      Deg = _Deg,
+      Deg_1=decrement_if_fixed_size<Deg>::ret
+    };
+
+    typedef _Scalar                                Scalar;
+    typedef typename NumTraits<Scalar>::Real       RealScalar;
+    typedef Matrix<Scalar, Deg, 1>                 RightColumn;
+    //typedef DiagonalMatrix< Scalar, Deg_1, Deg_1 > BottomLeftDiagonal;
+    typedef Matrix<Scalar, Deg_1, 1>               BottomLeftDiagonal;
+
+    typedef Matrix<Scalar, Deg, Deg>               DenseCompanionMatrixType;
+    typedef Matrix< Scalar, _Deg, Deg_1 >          LeftBlock;
+    typedef Matrix< Scalar, Deg_1, Deg_1 >         BottomLeftBlock;
+    typedef Matrix< Scalar, 1, Deg_1 >             LeftBlockFirstRow;
+
+    typedef DenseIndex Index;
+
+  public:
+    EIGEN_STRONG_INLINE const _Scalar operator()(Index row, Index col ) const
+    {
+      if( m_bl_diag.rows() > col )
+      {
+        if( 0 < row ){ return m_bl_diag[col]; }
+        else{ return 0; }
+      }
+      else{ return m_monic[row]; }
+    }
+
+  public:
+    template<typename VectorType>
+    void setPolynomial( const VectorType& poly )
+    {
+      const Index deg = poly.size()-1;
+      m_monic = -1/poly[deg] * poly.head(deg);
+      //m_bl_diag.setIdentity( deg-1 );
+      m_bl_diag.setOnes(deg-1);
+    }
+
+    template<typename VectorType>
+    companion( const VectorType& poly ){
+      setPolynomial( poly ); }
+
+  public:
+    DenseCompanionMatrixType denseMatrix() const
+    {
+      const Index deg   = m_monic.size();
+      const Index deg_1 = deg-1;
+      DenseCompanionMatrixType companion(deg,deg);
+      companion <<
+        ( LeftBlock(deg,deg_1)
+          << LeftBlockFirstRow::Zero(1,deg_1),
+          BottomLeftBlock::Identity(deg-1,deg-1)*m_bl_diag.asDiagonal() ).finished()
+        , m_monic;
+      return companion;
+    }
+
+
+
+  protected:
+    /** Helper function for the balancing algorithm.
+     * \returns true if the row and the column, having colNorm and rowNorm
+     * as norms, are balanced, false otherwise.
+     * colB and rowB are repectively the multipliers for
+     * the column and the row in order to balance them.
+     * */
+    bool balanced( Scalar colNorm, Scalar rowNorm,
+        bool& isBalanced, Scalar& colB, Scalar& rowB );
+
+    /** Helper function for the balancing algorithm.
+     * \returns true if the row and the column, having colNorm and rowNorm
+     * as norms, are balanced, false otherwise.
+     * colB and rowB are repectively the multipliers for
+     * the column and the row in order to balance them.
+     * */
+    bool balancedR( Scalar colNorm, Scalar rowNorm,
+        bool& isBalanced, Scalar& colB, Scalar& rowB );
+
+  public:
+    /**
+     * Balancing algorithm from B. N. PARLETT and C. REINSCH (1969)
+     * "Balancing a matrix for calculation of eigenvalues and eigenvectors"
+     * adapted to the case of companion matrices.
+     * A matrix with non zero row and non zero column is balanced
+     * for a certain norm if the i-th row and the i-th column
+     * have same norm for all i.
+     */
+    void balance();
+
+  protected:
+      RightColumn                m_monic;
+      BottomLeftDiagonal         m_bl_diag;
+};
+
+
+
+template< typename _Scalar, int _Deg >
+inline
+bool companion<_Scalar,_Deg>::balanced( Scalar colNorm, Scalar rowNorm,
+    bool& isBalanced, Scalar& colB, Scalar& rowB )
+{
+  if( Scalar(0) == colNorm || Scalar(0) == rowNorm ){ return true; }
+  else
+  {
+    //To find the balancing coefficients, if the radix is 2,
+    //one finds \f$ \sigma \f$ such that
+    // \f$ 2^{2\sigma-1} < rowNorm / colNorm \le 2^{2\sigma+1} \f$
+    // then the balancing coefficient for the row is \f$ 1/2^{\sigma} \f$
+    // and the balancing coefficient for the column is \f$ 2^{\sigma} \f$
+    rowB = rowNorm / radix<Scalar>();
+    colB = Scalar(1);
+    const Scalar s = colNorm + rowNorm;
+
+    while (colNorm < rowB)
+    {
+      colB *= radix<Scalar>();
+      colNorm *= radix2<Scalar>();
+    }
+
+    rowB = rowNorm * radix<Scalar>();
+
+    while (colNorm >= rowB)
+    {
+      colB /= radix<Scalar>();
+      colNorm /= radix2<Scalar>();
+    }
+
+    //This line is used to avoid insubstantial balancing
+    if ((rowNorm + colNorm) < Scalar(0.95) * s * colB)
+    {
+      isBalanced = false;
+      rowB = Scalar(1) / colB;
+      return false;
+    }
+    else{
+      return true; }
+  }
+}
+
+template< typename _Scalar, int _Deg >
+inline
+bool companion<_Scalar,_Deg>::balancedR( Scalar colNorm, Scalar rowNorm,
+    bool& isBalanced, Scalar& colB, Scalar& rowB )
+{
+  if( Scalar(0) == colNorm || Scalar(0) == rowNorm ){ return true; }
+  else
+  {
+    /**
+     * Set the norm of the column and the row to the geometric mean
+     * of the row and column norm
+     */
+    const _Scalar q = colNorm/rowNorm;
+    if( !isApprox( q, _Scalar(1) ) )
+    {
+      rowB = sqrt( colNorm/rowNorm );
+      colB = Scalar(1)/rowB;
+
+      isBalanced = false;
+      return false;
+    }
+    else{
+      return true; }
+  }
+}
+
+
+template< typename _Scalar, int _Deg >
+void companion<_Scalar,_Deg>::balance()
+{
+  using std::abs;
+  EIGEN_STATIC_ASSERT( Deg == Dynamic || 1 < Deg, YOU_MADE_A_PROGRAMMING_MISTAKE );
+  const Index deg   = m_monic.size();
+  const Index deg_1 = deg-1;
+
+  bool hasConverged=false;
+  while( !hasConverged )
+  {
+    hasConverged = true;
+    Scalar colNorm,rowNorm;
+    Scalar colB,rowB;
+
+    //First row, first column excluding the diagonal
+    //==============================================
+    colNorm = abs(m_bl_diag[0]);
+    rowNorm = abs(m_monic[0]);
+
+    //Compute balancing of the row and the column
+    if( !balanced( colNorm, rowNorm, hasConverged, colB, rowB ) )
+    {
+      m_bl_diag[0] *= colB;
+      m_monic[0] *= rowB;
+    }
+
+    //Middle rows and columns excluding the diagonal
+    //==============================================
+    for( Index i=1; i<deg_1; ++i )
+    {
+      // column norm, excluding the diagonal
+      colNorm = abs(m_bl_diag[i]);
+
+      // row norm, excluding the diagonal
+      rowNorm = abs(m_bl_diag[i-1]) + abs(m_monic[i]);
+
+      //Compute balancing of the row and the column
+      if( !balanced( colNorm, rowNorm, hasConverged, colB, rowB ) )
+      {
+        m_bl_diag[i]   *= colB;
+        m_bl_diag[i-1] *= rowB;
+        m_monic[i]     *= rowB;
+      }
+    }
+
+    //Last row, last column excluding the diagonal
+    //============================================
+    const Index ebl = m_bl_diag.size()-1;
+    VectorBlock<RightColumn,Deg_1> headMonic( m_monic, 0, deg_1 );
+    colNorm = headMonic.array().abs().sum();
+    rowNorm = abs( m_bl_diag[ebl] );
+
+    //Compute balancing of the row and the column
+    if( !balanced( colNorm, rowNorm, hasConverged, colB, rowB ) )
+    {
+      headMonic      *= colB;
+      m_bl_diag[ebl] *= rowB;
+    }
+  }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPANION_H
diff --git a/include/eigen3/unsupported/Eigen/src/Polynomials/PolynomialSolver.h b/include/eigen3/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
new file mode 100644
index 0000000..cd5c04b
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
@@ -0,0 +1,389 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Manuel Yguel <manuel.yguel at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_POLYNOMIAL_SOLVER_H
+#define EIGEN_POLYNOMIAL_SOLVER_H
+
+namespace Eigen { 
+
+/** \ingroup Polynomials_Module
+ *  \class PolynomialSolverBase.
+ *
+ * \brief Defined to be inherited by polynomial solvers: it provides
+ * convenient methods such as
+ *  - real roots,
+ *  - greatest, smallest complex roots,
+ *  - real roots with greatest, smallest absolute real value,
+ *  - greatest, smallest real roots.
+ *
+ * It stores the set of roots as a vector of complexes.
+ *
+ */
+template< typename _Scalar, int _Deg >
+class PolynomialSolverBase
+{
+  public:
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? Dynamic : _Deg)
+
+    typedef _Scalar                             Scalar;
+    typedef typename NumTraits<Scalar>::Real    RealScalar;
+    typedef std::complex<RealScalar>            RootType;
+    typedef Matrix<RootType,_Deg,1>             RootsType;
+
+    typedef DenseIndex Index;
+
+  protected:
+    template< typename OtherPolynomial >
+    inline void setPolynomial( const OtherPolynomial& poly ){
+      m_roots.resize(poly.size()); }
+
+  public:
+    template< typename OtherPolynomial >
+    inline PolynomialSolverBase( const OtherPolynomial& poly ){
+      setPolynomial( poly() ); }
+
+    inline PolynomialSolverBase(){}
+
+  public:
+    /** \returns the complex roots of the polynomial */
+    inline const RootsType& roots() const { return m_roots; }
+
+  public:
+    /** Clear and fills the back insertion sequence with the real roots of the polynomial
+     * i.e. the real part of the complex roots that have an imaginary part which
+     * absolute value is smaller than absImaginaryThreshold.
+     * absImaginaryThreshold takes the dummy_precision associated
+     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+     *
+     * \param[out] bi_seq : the back insertion sequence (stl concept)
+     * \param[in]  absImaginaryThreshold : the maximum bound of the imaginary part of a complex
+     *  number that is considered as real.
+     * */
+    template<typename Stl_back_insertion_sequence>
+    inline void realRoots( Stl_back_insertion_sequence& bi_seq,
+        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+    {
+      using std::abs;
+      bi_seq.clear();
+      for(Index i=0; i<m_roots.size(); ++i )
+      {
+        if( abs( m_roots[i].imag() ) < absImaginaryThreshold ){
+          bi_seq.push_back( m_roots[i].real() ); }
+      }
+    }
+
+  protected:
+    template<typename squaredNormBinaryPredicate>
+    inline const RootType& selectComplexRoot_withRespectToNorm( squaredNormBinaryPredicate& pred ) const
+    {
+      Index res=0;
+      RealScalar norm2 = numext::abs2( m_roots[0] );
+      for( Index i=1; i<m_roots.size(); ++i )
+      {
+        const RealScalar currNorm2 = numext::abs2( m_roots[i] );
+        if( pred( currNorm2, norm2 ) ){
+          res=i; norm2=currNorm2; }
+      }
+      return m_roots[res];
+    }
+
+  public:
+    /**
+     * \returns the complex root with greatest norm.
+     */
+    inline const RootType& greatestRoot() const
+    {
+      std::greater<Scalar> greater;
+      return selectComplexRoot_withRespectToNorm( greater );
+    }
+
+    /**
+     * \returns the complex root with smallest norm.
+     */
+    inline const RootType& smallestRoot() const
+    {
+      std::less<Scalar> less;
+      return selectComplexRoot_withRespectToNorm( less );
+    }
+
+  protected:
+    template<typename squaredRealPartBinaryPredicate>
+    inline const RealScalar& selectRealRoot_withRespectToAbsRealPart(
+        squaredRealPartBinaryPredicate& pred,
+        bool& hasArealRoot,
+        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+    {
+      using std::abs;
+      hasArealRoot = false;
+      Index res=0;
+      RealScalar abs2(0);
+
+      for( Index i=0; i<m_roots.size(); ++i )
+      {
+        if( abs( m_roots[i].imag() ) < absImaginaryThreshold )
+        {
+          if( !hasArealRoot )
+          {
+            hasArealRoot = true;
+            res = i;
+            abs2 = m_roots[i].real() * m_roots[i].real();
+          }
+          else
+          {
+            const RealScalar currAbs2 = m_roots[i].real() * m_roots[i].real();
+            if( pred( currAbs2, abs2 ) )
+            {
+              abs2 = currAbs2;
+              res = i;
+            }
+          }
+        }
+        else
+        {
+          if( abs( m_roots[i].imag() ) < abs( m_roots[res].imag() ) ){
+            res = i; }
+        }
+      }
+      return numext::real_ref(m_roots[res]);
+    }
+
+
+    template<typename RealPartBinaryPredicate>
+    inline const RealScalar& selectRealRoot_withRespectToRealPart(
+        RealPartBinaryPredicate& pred,
+        bool& hasArealRoot,
+        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+    {
+      using std::abs;
+      hasArealRoot = false;
+      Index res=0;
+      RealScalar val(0);
+
+      for( Index i=0; i<m_roots.size(); ++i )
+      {
+        if( abs( m_roots[i].imag() ) < absImaginaryThreshold )
+        {
+          if( !hasArealRoot )
+          {
+            hasArealRoot = true;
+            res = i;
+            val = m_roots[i].real();
+          }
+          else
+          {
+            const RealScalar curr = m_roots[i].real();
+            if( pred( curr, val ) )
+            {
+              val = curr;
+              res = i;
+            }
+          }
+        }
+        else
+        {
+          if( abs( m_roots[i].imag() ) < abs( m_roots[res].imag() ) ){
+            res = i; }
+        }
+      }
+      return numext::real_ref(m_roots[res]);
+    }
+
+  public:
+    /**
+     * \returns a real root with greatest absolute magnitude.
+     * A real root is defined as the real part of a complex root with absolute imaginary
+     * part smallest than absImaginaryThreshold.
+     * absImaginaryThreshold takes the dummy_precision associated
+     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+     * If no real root is found the boolean hasArealRoot is set to false and the real part of
+     * the root with smallest absolute imaginary part is returned instead.
+     *
+     * \param[out] hasArealRoot : boolean true if a real root is found according to the
+     *  absImaginaryThreshold criterion, false otherwise.
+     * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+     *  whether or not a root is real.
+     */
+    inline const RealScalar& absGreatestRealRoot(
+        bool& hasArealRoot,
+        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+    {
+      std::greater<Scalar> greater;
+      return selectRealRoot_withRespectToAbsRealPart( greater, hasArealRoot, absImaginaryThreshold );
+    }
+
+
+    /**
+     * \returns a real root with smallest absolute magnitude.
+     * A real root is defined as the real part of a complex root with absolute imaginary
+     * part smallest than absImaginaryThreshold.
+     * absImaginaryThreshold takes the dummy_precision associated
+     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+     * If no real root is found the boolean hasArealRoot is set to false and the real part of
+     * the root with smallest absolute imaginary part is returned instead.
+     *
+     * \param[out] hasArealRoot : boolean true if a real root is found according to the
+     *  absImaginaryThreshold criterion, false otherwise.
+     * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+     *  whether or not a root is real.
+     */
+    inline const RealScalar& absSmallestRealRoot(
+        bool& hasArealRoot,
+        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+    {
+      std::less<Scalar> less;
+      return selectRealRoot_withRespectToAbsRealPart( less, hasArealRoot, absImaginaryThreshold );
+    }
+
+
+    /**
+     * \returns the real root with greatest value.
+     * A real root is defined as the real part of a complex root with absolute imaginary
+     * part smallest than absImaginaryThreshold.
+     * absImaginaryThreshold takes the dummy_precision associated
+     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+     * If no real root is found the boolean hasArealRoot is set to false and the real part of
+     * the root with smallest absolute imaginary part is returned instead.
+     *
+     * \param[out] hasArealRoot : boolean true if a real root is found according to the
+     *  absImaginaryThreshold criterion, false otherwise.
+     * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+     *  whether or not a root is real.
+     */
+    inline const RealScalar& greatestRealRoot(
+        bool& hasArealRoot,
+        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+    {
+      std::greater<Scalar> greater;
+      return selectRealRoot_withRespectToRealPart( greater, hasArealRoot, absImaginaryThreshold );
+    }
+
+
+    /**
+     * \returns the real root with smallest value.
+     * A real root is defined as the real part of a complex root with absolute imaginary
+     * part smallest than absImaginaryThreshold.
+     * absImaginaryThreshold takes the dummy_precision associated
+     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+     * If no real root is found the boolean hasArealRoot is set to false and the real part of
+     * the root with smallest absolute imaginary part is returned instead.
+     *
+     * \param[out] hasArealRoot : boolean true if a real root is found according to the
+     *  absImaginaryThreshold criterion, false otherwise.
+     * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+     *  whether or not a root is real.
+     */
+    inline const RealScalar& smallestRealRoot(
+        bool& hasArealRoot,
+        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+    {
+      std::less<Scalar> less;
+      return selectRealRoot_withRespectToRealPart( less, hasArealRoot, absImaginaryThreshold );
+    }
+
+  protected:
+    RootsType               m_roots;
+};
+
+#define EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( BASE )  \
+  typedef typename BASE::Scalar                 Scalar;       \
+  typedef typename BASE::RealScalar             RealScalar;   \
+  typedef typename BASE::RootType               RootType;     \
+  typedef typename BASE::RootsType              RootsType;
+
+
+
+/** \ingroup Polynomials_Module
+  *
+  * \class PolynomialSolver
+  *
+  * \brief A polynomial solver
+  *
+  * Computes the complex roots of a real polynomial.
+  *
+  * \param _Scalar the scalar type, i.e., the type of the polynomial coefficients
+  * \param _Deg the degree of the polynomial, can be a compile time value or Dynamic.
+  *             Notice that the number of polynomial coefficients is _Deg+1.
+  *
+  * This class implements a polynomial solver and provides convenient methods such as
+  * - real roots,
+  * - greatest, smallest complex roots,
+  * - real roots with greatest, smallest absolute real value.
+  * - greatest, smallest real roots.
+  *
+  * WARNING: this polynomial solver is experimental, part of the unsuported Eigen modules.
+  *
+  *
+  * Currently a QR algorithm is used to compute the eigenvalues of the companion matrix of
+  * the polynomial to compute its roots.
+  * This supposes that the complex moduli of the roots are all distinct: e.g. there should
+  * be no multiple roots or conjugate roots for instance.
+  * With 32bit (float) floating types this problem shows up frequently.
+  * However, almost always, correct accuracy is reached even in these cases for 64bit
+  * (double) floating types and small polynomial degree (<20).
+  */
+template< typename _Scalar, int _Deg >
+class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg>
+{
+  public:
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? Dynamic : _Deg)
+
+    typedef PolynomialSolverBase<_Scalar,_Deg>    PS_Base;
+    EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( PS_Base )
+
+    typedef Matrix<Scalar,_Deg,_Deg>                 CompanionMatrixType;
+    typedef EigenSolver<CompanionMatrixType>         EigenSolverType;
+
+  public:
+    /** Computes the complex roots of a new polynomial. */
+    template< typename OtherPolynomial >
+    void compute( const OtherPolynomial& poly )
+    {
+      eigen_assert( Scalar(0) != poly[poly.size()-1] );
+      internal::companion<Scalar,_Deg> companion( poly );
+      companion.balance();
+      m_eigenSolver.compute( companion.denseMatrix() );
+      m_roots = m_eigenSolver.eigenvalues();
+    }
+
+  public:
+    template< typename OtherPolynomial >
+    inline PolynomialSolver( const OtherPolynomial& poly ){
+      compute( poly ); }
+
+    inline PolynomialSolver(){}
+
+  protected:
+    using                   PS_Base::m_roots;
+    EigenSolverType         m_eigenSolver;
+};
+
+
+template< typename _Scalar >
+class PolynomialSolver<_Scalar,1> : public PolynomialSolverBase<_Scalar,1>
+{
+  public:
+    typedef PolynomialSolverBase<_Scalar,1>    PS_Base;
+    EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( PS_Base )
+
+  public:
+    /** Computes the complex roots of a new polynomial. */
+    template< typename OtherPolynomial >
+    void compute( const OtherPolynomial& poly )
+    {
+      eigen_assert( Scalar(0) != poly[poly.size()-1] );
+      m_roots[0] = -poly[0]/poly[poly.size()-1];
+    }
+
+  protected:
+    using                   PS_Base::m_roots;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_POLYNOMIAL_SOLVER_H
diff --git a/include/eigen3/unsupported/Eigen/src/Polynomials/PolynomialUtils.h b/include/eigen3/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
new file mode 100644
index 0000000..2bb8bc8
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
@@ -0,0 +1,143 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Manuel Yguel <manuel.yguel at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_POLYNOMIAL_UTILS_H
+#define EIGEN_POLYNOMIAL_UTILS_H
+
+namespace Eigen { 
+
+/** \ingroup Polynomials_Module
+ * \returns the evaluation of the polynomial at x using Horner algorithm.
+ *
+ * \param[in] poly : the vector of coefficients of the polynomial ordered
+ *  by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ *  e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
+ * \param[in] x : the value to evaluate the polynomial at.
+ *
+ * <i><b>Note for stability:</b></i>
+ *  <dd> \f$ |x| \le 1 \f$ </dd>
+ */
+template <typename Polynomials, typename T>
+inline
+T poly_eval_horner( const Polynomials& poly, const T& x )
+{
+  T val=poly[poly.size()-1];
+  for(DenseIndex i=poly.size()-2; i>=0; --i ){
+    val = val*x + poly[i]; }
+  return val;
+}
+
+/** \ingroup Polynomials_Module
+ * \returns the evaluation of the polynomial at x using stabilized Horner algorithm.
+ *
+ * \param[in] poly : the vector of coefficients of the polynomial ordered
+ *  by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ *  e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
+ * \param[in] x : the value to evaluate the polynomial at.
+ */
+template <typename Polynomials, typename T>
+inline
+T poly_eval( const Polynomials& poly, const T& x )
+{
+  typedef typename NumTraits<T>::Real Real;
+
+  if( numext::abs2( x ) <= Real(1) ){
+    return poly_eval_horner( poly, x ); }
+  else
+  {
+    T val=poly[0];
+    T inv_x = T(1)/x;
+    for( DenseIndex i=1; i<poly.size(); ++i ){
+      val = val*inv_x + poly[i]; }
+
+    return std::pow(x,(T)(poly.size()-1)) * val;
+  }
+}
+
+/** \ingroup Polynomials_Module
+ * \returns a maximum bound for the absolute value of any root of the polynomial.
+ *
+ * \param[in] poly : the vector of coefficients of the polynomial ordered
+ *  by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ *  e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
+ *
+ *  <i><b>Precondition:</b></i>
+ *  <dd> the leading coefficient of the input polynomial poly must be non zero </dd>
+ */
+template <typename Polynomial>
+inline
+typename NumTraits<typename Polynomial::Scalar>::Real cauchy_max_bound( const Polynomial& poly )
+{
+  using std::abs;
+  typedef typename Polynomial::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real Real;
+
+  eigen_assert( Scalar(0) != poly[poly.size()-1] );
+  const Scalar inv_leading_coeff = Scalar(1)/poly[poly.size()-1];
+  Real cb(0);
+
+  for( DenseIndex i=0; i<poly.size()-1; ++i ){
+    cb += abs(poly[i]*inv_leading_coeff); }
+  return cb + Real(1);
+}
+
+/** \ingroup Polynomials_Module
+ * \returns a minimum bound for the absolute value of any non zero root of the polynomial.
+ * \param[in] poly : the vector of coefficients of the polynomial ordered
+ *  by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ *  e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
+ */
+template <typename Polynomial>
+inline
+typename NumTraits<typename Polynomial::Scalar>::Real cauchy_min_bound( const Polynomial& poly )
+{
+  using std::abs;
+  typedef typename Polynomial::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real Real;
+
+  DenseIndex i=0;
+  while( i<poly.size()-1 && Scalar(0) == poly(i) ){ ++i; }
+  if( poly.size()-1 == i ){
+    return Real(1); }
+
+  const Scalar inv_min_coeff = Scalar(1)/poly[i];
+  Real cb(1);
+  for( DenseIndex j=i+1; j<poly.size(); ++j ){
+    cb += abs(poly[j]*inv_min_coeff); }
+  return Real(1)/cb;
+}
+
+/** \ingroup Polynomials_Module
+ * Given the roots of a polynomial compute the coefficients in the
+ * monomial basis of the monic polynomial with same roots and minimal degree.
+ * If RootVector is a vector of complexes, Polynomial should also be a vector
+ * of complexes.
+ * \param[in] rv : a vector containing the roots of a polynomial.
+ * \param[out] poly : the vector of coefficients of the polynomial ordered
+ *  by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ *  e.g. \f$ 3 + x^2 \f$ is stored as a vector \f$ [ 3, 0, 1 ] \f$.
+ */
+template <typename RootVector, typename Polynomial>
+void roots_to_monicPolynomial( const RootVector& rv, Polynomial& poly )
+{
+
+  typedef typename Polynomial::Scalar Scalar;
+
+  poly.setZero( rv.size()+1 );
+  poly[0] = -rv[0]; poly[1] = Scalar(1);
+  for( DenseIndex i=1; i< rv.size(); ++i )
+  {
+    for( DenseIndex j=i+1; j>0; --j ){ poly[j] = poly[j-1] - rv[i]*poly[j]; }
+    poly[0] = -rv[i]*poly[0];
+  }
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_POLYNOMIAL_UTILS_H
diff --git a/include/eigen3/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h b/include/eigen3/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
new file mode 100644
index 0000000..a1f54ed
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
@@ -0,0 +1,352 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Guillaume Saupin <guillaume.saupin at cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEINPLACELU_H
+#define EIGEN_SKYLINEINPLACELU_H
+
+namespace Eigen { 
+
+/** \ingroup Skyline_Module
+ *
+ * \class SkylineInplaceLU
+ *
+ * \brief Inplace LU decomposition of a skyline matrix and associated features
+ *
+ * \param MatrixType the type of the matrix of which we are computing the LU factorization
+ *
+ */
+template<typename MatrixType>
+class SkylineInplaceLU {
+protected:
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::Index Index;
+    
+    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+
+public:
+
+    /** Creates a LU object and compute the respective factorization of \a matrix using
+     * flags \a flags. */
+    SkylineInplaceLU(MatrixType& matrix, int flags = 0)
+    : /*m_matrix(matrix.rows(), matrix.cols()),*/ m_flags(flags), m_status(0), m_lu(matrix) {
+        m_precision = RealScalar(0.1) * Eigen::dummy_precision<RealScalar > ();
+        m_lu.IsRowMajor ? computeRowMajor() : compute();
+    }
+
+    /** Sets the relative threshold value used to prune zero coefficients during the decomposition.
+     *
+     * Setting a value greater than zero speeds up computation, and yields to an imcomplete
+     * factorization with fewer non zero coefficients. Such approximate factors are especially
+     * useful to initialize an iterative solver.
+     *
+     * Note that the exact meaning of this parameter might depends on the actual
+     * backend. Moreover, not all backends support this feature.
+     *
+     * \sa precision() */
+    void setPrecision(RealScalar v) {
+        m_precision = v;
+    }
+
+    /** \returns the current precision.
+     *
+     * \sa setPrecision() */
+    RealScalar precision() const {
+        return m_precision;
+    }
+
+    /** Sets the flags. Possible values are:
+     *  - CompleteFactorization
+     *  - IncompleteFactorization
+     *  - MemoryEfficient
+     *  - one of the ordering methods
+     *  - etc...
+     *
+     * \sa flags() */
+    void setFlags(int f) {
+        m_flags = f;
+    }
+
+    /** \returns the current flags */
+    int flags() const {
+        return m_flags;
+    }
+
+    void setOrderingMethod(int m) {
+        m_flags = m;
+    }
+
+    int orderingMethod() const {
+        return m_flags;
+    }
+
+    /** Computes/re-computes the LU factorization */
+    void compute();
+    void computeRowMajor();
+
+    /** \returns the lower triangular matrix L */
+    //inline const MatrixType& matrixL() const { return m_matrixL; }
+
+    /** \returns the upper triangular matrix U */
+    //inline const MatrixType& matrixU() const { return m_matrixU; }
+
+    template<typename BDerived, typename XDerived>
+    bool solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>* x,
+            const int transposed = 0) const;
+
+    /** \returns true if the factorization succeeded */
+    inline bool succeeded(void) const {
+        return m_succeeded;
+    }
+
+protected:
+    RealScalar m_precision;
+    int m_flags;
+    mutable int m_status;
+    bool m_succeeded;
+    MatrixType& m_lu;
+};
+
+/** Computes / recomputes the in place LU decomposition of the SkylineInplaceLU.
+ * using the default algorithm.
+ */
+template<typename MatrixType>
+//template<typename _Scalar>
+void SkylineInplaceLU<MatrixType>::compute() {
+    const size_t rows = m_lu.rows();
+    const size_t cols = m_lu.cols();
+
+    eigen_assert(rows == cols && "We do not (yet) support rectangular LU.");
+    eigen_assert(!m_lu.IsRowMajor && "LU decomposition does not work with rowMajor Storage");
+
+    for (Index row = 0; row < rows; row++) {
+        const double pivot = m_lu.coeffDiag(row);
+
+        //Lower matrix Columns update
+        const Index& col = row;
+        for (typename MatrixType::InnerLowerIterator lIt(m_lu, col); lIt; ++lIt) {
+            lIt.valueRef() /= pivot;
+        }
+
+        //Upper matrix update -> contiguous memory access
+        typename MatrixType::InnerLowerIterator lIt(m_lu, col);
+        for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {
+            typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
+            typename MatrixType::InnerUpperIterator uIt(m_lu, rrow);
+            const double coef = lIt.value();
+
+            uItPivot += (rrow - row - 1);
+
+            //update upper part  -> contiguous memory access
+            for (++uItPivot; uIt && uItPivot;) {
+                uIt.valueRef() -= uItPivot.value() * coef;
+
+                ++uIt;
+                ++uItPivot;
+            }
+            ++lIt;
+        }
+
+        //Upper matrix update -> non contiguous memory access
+        typename MatrixType::InnerLowerIterator lIt3(m_lu, col);
+        for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {
+            typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
+            const double coef = lIt3.value();
+
+            //update lower part ->  non contiguous memory access
+            for (Index i = 0; i < rrow - row - 1; i++) {
+                m_lu.coeffRefLower(rrow, row + i + 1) -= uItPivot.value() * coef;
+                ++uItPivot;
+            }
+            ++lIt3;
+        }
+        //update diag -> contiguous
+        typename MatrixType::InnerLowerIterator lIt2(m_lu, col);
+        for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {
+
+            typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
+            typename MatrixType::InnerUpperIterator uIt(m_lu, rrow);
+            const double coef = lIt2.value();
+
+            uItPivot += (rrow - row - 1);
+            m_lu.coeffRefDiag(rrow) -= uItPivot.value() * coef;
+            ++lIt2;
+        }
+    }
+}
+
+template<typename MatrixType>
+void SkylineInplaceLU<MatrixType>::computeRowMajor() {
+    const size_t rows = m_lu.rows();
+    const size_t cols = m_lu.cols();
+
+    eigen_assert(rows == cols && "We do not (yet) support rectangular LU.");
+    eigen_assert(m_lu.IsRowMajor && "You're trying to apply rowMajor decomposition on a ColMajor matrix !");
+
+    for (Index row = 0; row < rows; row++) {
+        typename MatrixType::InnerLowerIterator llIt(m_lu, row);
+
+
+        for (Index col = llIt.col(); col < row; col++) {
+            if (m_lu.coeffExistLower(row, col)) {
+                const double diag = m_lu.coeffDiag(col);
+
+                typename MatrixType::InnerLowerIterator lIt(m_lu, row);
+                typename MatrixType::InnerUpperIterator uIt(m_lu, col);
+
+
+                const Index offset = lIt.col() - uIt.row();
+
+
+                Index stop = offset > 0 ? col - lIt.col() : col - uIt.row();
+
+                //#define VECTORIZE
+#ifdef VECTORIZE
+                Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
+                Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);
+
+
+                Scalar newCoeff = m_lu.coeffLower(row, col) - rowVal.dot(colVal);
+#else
+                if (offset > 0) //Skip zero value of lIt
+                    uIt += offset;
+                else //Skip zero values of uIt
+                    lIt += -offset;
+                Scalar newCoeff = m_lu.coeffLower(row, col);
+
+                for (Index k = 0; k < stop; ++k) {
+                    const Scalar tmp = newCoeff;
+                    newCoeff = tmp - lIt.value() * uIt.value();
+                    ++lIt;
+                    ++uIt;
+                }
+#endif
+
+                m_lu.coeffRefLower(row, col) = newCoeff / diag;
+            }
+        }
+
+        //Upper matrix update
+        const Index col = row;
+        typename MatrixType::InnerUpperIterator uuIt(m_lu, col);
+        for (Index rrow = uuIt.row(); rrow < col; rrow++) {
+
+            typename MatrixType::InnerLowerIterator lIt(m_lu, rrow);
+            typename MatrixType::InnerUpperIterator uIt(m_lu, col);
+            const Index offset = lIt.col() - uIt.row();
+
+            Index stop = offset > 0 ? rrow - lIt.col() : rrow - uIt.row();
+
+#ifdef VECTORIZE
+            Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
+            Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);
+
+            Scalar newCoeff = m_lu.coeffUpper(rrow, col) - rowVal.dot(colVal);
+#else
+            if (offset > 0) //Skip zero value of lIt
+                uIt += offset;
+            else //Skip zero values of uIt
+                lIt += -offset;
+            Scalar newCoeff = m_lu.coeffUpper(rrow, col);
+            for (Index k = 0; k < stop; ++k) {
+                const Scalar tmp = newCoeff;
+                newCoeff = tmp - lIt.value() * uIt.value();
+
+                ++lIt;
+                ++uIt;
+            }
+#endif
+            m_lu.coeffRefUpper(rrow, col) = newCoeff;
+        }
+
+
+        //Diag matrix update
+        typename MatrixType::InnerLowerIterator lIt(m_lu, row);
+        typename MatrixType::InnerUpperIterator uIt(m_lu, row);
+
+        const Index offset = lIt.col() - uIt.row();
+
+
+        Index stop = offset > 0 ? lIt.size() : uIt.size();
+#ifdef VECTORIZE
+        Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
+        Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);
+        Scalar newCoeff = m_lu.coeffDiag(row) - rowVal.dot(colVal);
+#else
+        if (offset > 0) //Skip zero value of lIt
+            uIt += offset;
+        else //Skip zero values of uIt
+            lIt += -offset;
+        Scalar newCoeff = m_lu.coeffDiag(row);
+        for (Index k = 0; k < stop; ++k) {
+            const Scalar tmp = newCoeff;
+            newCoeff = tmp - lIt.value() * uIt.value();
+            ++lIt;
+            ++uIt;
+        }
+#endif
+        m_lu.coeffRefDiag(row) = newCoeff;
+    }
+}
+
+/** Computes *x = U^-1 L^-1 b
+ *
+ * If \a transpose is set to SvTranspose or SvAdjoint, the solution
+ * of the transposed/adjoint system is computed instead.
+ *
+ * Not all backends implement the solution of the transposed or
+ * adjoint system.
+ */
+template<typename MatrixType>
+template<typename BDerived, typename XDerived>
+bool SkylineInplaceLU<MatrixType>::solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>* x, const int transposed) const {
+    const size_t rows = m_lu.rows();
+    const size_t cols = m_lu.cols();
+
+
+    for (Index row = 0; row < rows; row++) {
+        x->coeffRef(row) = b.coeff(row);
+        Scalar newVal = x->coeff(row);
+        typename MatrixType::InnerLowerIterator lIt(m_lu, row);
+
+        Index col = lIt.col();
+        while (lIt.col() < row) {
+
+            newVal -= x->coeff(col++) * lIt.value();
+            ++lIt;
+        }
+
+        x->coeffRef(row) = newVal;
+    }
+
+
+    for (Index col = rows - 1; col > 0; col--) {
+        x->coeffRef(col) = x->coeff(col) / m_lu.coeffDiag(col);
+
+        const Scalar x_col = x->coeff(col);
+
+        typename MatrixType::InnerUpperIterator uIt(m_lu, col);
+        uIt += uIt.size()-1;
+
+
+        while (uIt) {
+            x->coeffRef(uIt.row()) -= x_col * uIt.value();
+            //TODO : introduce --operator
+            uIt += -1;
+        }
+
+
+    }
+    x->coeffRef(0) = x->coeff(0) / m_lu.coeffDiag(0);
+
+    return true;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINELU_H
diff --git a/include/eigen3/unsupported/Eigen/src/Skyline/SkylineMatrix.h b/include/eigen3/unsupported/Eigen/src/Skyline/SkylineMatrix.h
new file mode 100644
index 0000000..a2a8933
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/Skyline/SkylineMatrix.h
@@ -0,0 +1,862 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin at cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEMATRIX_H
+#define EIGEN_SKYLINEMATRIX_H
+
+#include "SkylineStorage.h"
+#include "SkylineMatrixBase.h"
+
+namespace Eigen { 
+
+/** \ingroup Skyline_Module
+ *
+ * \class SkylineMatrix
+ *
+ * \brief The main skyline matrix class
+ *
+ * This class implements a skyline matrix using the very uncommon storage
+ * scheme.
+ *
+ * \param _Scalar the scalar type, i.e. the type of the coefficients
+ * \param _Options Union of bit flags controlling the storage scheme. Currently the only possibility
+ *                 is RowMajor. The default is 0 which means column-major.
+ *
+ *
+ */
+namespace internal {
+template<typename _Scalar, int _Options>
+struct traits<SkylineMatrix<_Scalar, _Options> > {
+    typedef _Scalar Scalar;
+    typedef Sparse StorageKind;
+
+    enum {
+        RowsAtCompileTime = Dynamic,
+        ColsAtCompileTime = Dynamic,
+        MaxRowsAtCompileTime = Dynamic,
+        MaxColsAtCompileTime = Dynamic,
+        Flags = SkylineBit | _Options,
+        CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    };
+};
+}
+
+template<typename _Scalar, int _Options>
+class SkylineMatrix
+: public SkylineMatrixBase<SkylineMatrix<_Scalar, _Options> > {
+public:
+    EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(SkylineMatrix)
+    EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(SkylineMatrix, +=)
+    EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(SkylineMatrix, -=)
+
+    using Base::IsRowMajor;
+
+protected:
+
+    typedef SkylineMatrix<Scalar, (Flags&~RowMajorBit) | (IsRowMajor ? RowMajorBit : 0) > TransposedSkylineMatrix;
+
+    Index m_outerSize;
+    Index m_innerSize;
+
+public:
+    Index* m_colStartIndex;
+    Index* m_rowStartIndex;
+    SkylineStorage<Scalar> m_data;
+
+public:
+
+    inline Index rows() const {
+        return IsRowMajor ? m_outerSize : m_innerSize;
+    }
+
+    inline Index cols() const {
+        return IsRowMajor ? m_innerSize : m_outerSize;
+    }
+
+    inline Index innerSize() const {
+        return m_innerSize;
+    }
+
+    inline Index outerSize() const {
+        return m_outerSize;
+    }
+
+    inline Index upperNonZeros() const {
+        return m_data.upperSize();
+    }
+
+    inline Index lowerNonZeros() const {
+        return m_data.lowerSize();
+    }
+
+    inline Index upperNonZeros(Index j) const {
+        return m_colStartIndex[j + 1] - m_colStartIndex[j];
+    }
+
+    inline Index lowerNonZeros(Index j) const {
+        return m_rowStartIndex[j + 1] - m_rowStartIndex[j];
+    }
+
+    inline const Scalar* _diagPtr() const {
+        return &m_data.diag(0);
+    }
+
+    inline Scalar* _diagPtr() {
+        return &m_data.diag(0);
+    }
+
+    inline const Scalar* _upperPtr() const {
+        return &m_data.upper(0);
+    }
+
+    inline Scalar* _upperPtr() {
+        return &m_data.upper(0);
+    }
+
+    inline const Scalar* _lowerPtr() const {
+        return &m_data.lower(0);
+    }
+
+    inline Scalar* _lowerPtr() {
+        return &m_data.lower(0);
+    }
+
+    inline const Index* _upperProfilePtr() const {
+        return &m_data.upperProfile(0);
+    }
+
+    inline Index* _upperProfilePtr() {
+        return &m_data.upperProfile(0);
+    }
+
+    inline const Index* _lowerProfilePtr() const {
+        return &m_data.lowerProfile(0);
+    }
+
+    inline Index* _lowerProfilePtr() {
+        return &m_data.lowerProfile(0);
+    }
+
+    inline Scalar coeff(Index row, Index col) const {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+
+        if (outer == inner)
+            return this->m_data.diag(outer);
+
+        if (IsRowMajor) {
+            if (inner > outer) //upper matrix
+            {
+                const Index minOuterIndex = inner - m_data.upperProfile(inner);
+                if (outer >= minOuterIndex)
+                    return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+                else
+                    return Scalar(0);
+            }
+            if (inner < outer) //lower matrix
+            {
+                const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+                if (inner >= minInnerIndex)
+                    return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+                else
+                    return Scalar(0);
+            }
+            return m_data.upper(m_colStartIndex[inner] + outer - inner);
+        } else {
+            if (outer > inner) //upper matrix
+            {
+                const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+                if (outer <= maxOuterIndex)
+                    return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
+                else
+                    return Scalar(0);
+            }
+            if (outer < inner) //lower matrix
+            {
+                const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+
+                if (inner <= maxInnerIndex)
+                    return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
+                else
+                    return Scalar(0);
+            }
+        }
+    }
+
+    inline Scalar& coeffRef(Index row, Index col) {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+
+        if (outer == inner)
+            return this->m_data.diag(outer);
+
+        if (IsRowMajor) {
+            if (col > row) //upper matrix
+            {
+                const Index minOuterIndex = inner - m_data.upperProfile(inner);
+                eigen_assert(outer >= minOuterIndex && "you try to acces a coeff that do not exist in the storage");
+                return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+            }
+            if (col < row) //lower matrix
+            {
+                const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+                eigen_assert(inner >= minInnerIndex && "you try to acces a coeff that do not exist in the storage");
+                return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+            }
+        } else {
+            if (outer > inner) //upper matrix
+            {
+                const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+                eigen_assert(outer <= maxOuterIndex && "you try to acces a coeff that do not exist in the storage");
+                return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
+            }
+            if (outer < inner) //lower matrix
+            {
+                const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+                eigen_assert(inner <= maxInnerIndex && "you try to acces a coeff that do not exist in the storage");
+                return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
+            }
+        }
+    }
+
+    inline Scalar coeffDiag(Index idx) const {
+        eigen_assert(idx < outerSize());
+        eigen_assert(idx < innerSize());
+        return this->m_data.diag(idx);
+    }
+
+    inline Scalar coeffLower(Index row, Index col) const {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+        eigen_assert(inner != outer);
+
+        if (IsRowMajor) {
+            const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+            if (inner >= minInnerIndex)
+                return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+            else
+                return Scalar(0);
+
+        } else {
+            const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+            if (inner <= maxInnerIndex)
+                return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
+            else
+                return Scalar(0);
+        }
+    }
+
+    inline Scalar coeffUpper(Index row, Index col) const {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+        eigen_assert(inner != outer);
+
+        if (IsRowMajor) {
+            const Index minOuterIndex = inner - m_data.upperProfile(inner);
+            if (outer >= minOuterIndex)
+                return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+            else
+                return Scalar(0);
+        } else {
+            const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+            if (outer <= maxOuterIndex)
+                return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
+            else
+                return Scalar(0);
+        }
+    }
+
+    inline Scalar& coeffRefDiag(Index idx) {
+        eigen_assert(idx < outerSize());
+        eigen_assert(idx < innerSize());
+        return this->m_data.diag(idx);
+    }
+
+    inline Scalar& coeffRefLower(Index row, Index col) {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+        eigen_assert(inner != outer);
+
+        if (IsRowMajor) {
+            const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+            eigen_assert(inner >= minInnerIndex && "you try to acces a coeff that do not exist in the storage");
+            return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+        } else {
+            const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+            eigen_assert(inner <= maxInnerIndex && "you try to acces a coeff that do not exist in the storage");
+            return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
+        }
+    }
+
+    inline bool coeffExistLower(Index row, Index col) {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+        eigen_assert(inner != outer);
+
+        if (IsRowMajor) {
+            const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+            return inner >= minInnerIndex;
+        } else {
+            const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+            return inner <= maxInnerIndex;
+        }
+    }
+
+    inline Scalar& coeffRefUpper(Index row, Index col) {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+        eigen_assert(inner != outer);
+
+        if (IsRowMajor) {
+            const Index minOuterIndex = inner - m_data.upperProfile(inner);
+            eigen_assert(outer >= minOuterIndex && "you try to acces a coeff that do not exist in the storage");
+            return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+        } else {
+            const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+            eigen_assert(outer <= maxOuterIndex && "you try to acces a coeff that do not exist in the storage");
+            return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
+        }
+    }
+
+    inline bool coeffExistUpper(Index row, Index col) {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+        eigen_assert(inner != outer);
+
+        if (IsRowMajor) {
+            const Index minOuterIndex = inner - m_data.upperProfile(inner);
+            return outer >= minOuterIndex;
+        } else {
+            const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+            return outer <= maxOuterIndex;
+        }
+    }
+
+
+protected:
+
+public:
+    class InnerUpperIterator;
+    class InnerLowerIterator;
+
+    class OuterUpperIterator;
+    class OuterLowerIterator;
+
+    /** Removes all non zeros */
+    inline void setZero() {
+        m_data.clear();
+        memset(m_colStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
+        memset(m_rowStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
+    }
+
+    /** \returns the number of non zero coefficients */
+    inline Index nonZeros() const {
+        return m_data.diagSize() + m_data.upperSize() + m_data.lowerSize();
+    }
+
+    /** Preallocates \a reserveSize non zeros */
+    inline void reserve(Index reserveSize, Index reserveUpperSize, Index reserveLowerSize) {
+        m_data.reserve(reserveSize, reserveUpperSize, reserveLowerSize);
+    }
+
+    /** \returns a reference to a novel non zero coefficient with coordinates \a row x \a col.
+
+     *
+     * \warning This function can be extremely slow if the non zero coefficients
+     * are not inserted in a coherent order.
+     *
+     * After an insertion session, you should call the finalize() function.
+     */
+    EIGEN_DONT_INLINE Scalar & insert(Index row, Index col) {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+
+        if (outer == inner)
+            return m_data.diag(col);
+
+        if (IsRowMajor) {
+            if (outer < inner) //upper matrix
+            {
+                Index minOuterIndex = 0;
+                minOuterIndex = inner - m_data.upperProfile(inner);
+
+                if (outer < minOuterIndex) //The value does not yet exist
+                {
+                    const Index previousProfile = m_data.upperProfile(inner);
+
+                    m_data.upperProfile(inner) = inner - outer;
+
+
+                    const Index bandIncrement = m_data.upperProfile(inner) - previousProfile;
+                    //shift data stored after this new one
+                    const Index stop = m_colStartIndex[cols()];
+                    const Index start = m_colStartIndex[inner];
+
+
+                    for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
+                        m_data.upper(innerIdx + bandIncrement) = m_data.upper(innerIdx);
+                    }
+
+                    for (Index innerIdx = cols(); innerIdx > inner; innerIdx--) {
+                        m_colStartIndex[innerIdx] += bandIncrement;
+                    }
+
+                    //zeros new data
+                    memset(this->_upperPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+
+                    return m_data.upper(m_colStartIndex[inner]);
+                } else {
+                    return m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+                }
+            }
+
+            if (outer > inner) //lower matrix
+            {
+                const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+                if (inner < minInnerIndex) //The value does not yet exist
+                {
+                    const Index previousProfile = m_data.lowerProfile(outer);
+                    m_data.lowerProfile(outer) = outer - inner;
+
+                    const Index bandIncrement = m_data.lowerProfile(outer) - previousProfile;
+                    //shift data stored after this new one
+                    const Index stop = m_rowStartIndex[rows()];
+                    const Index start = m_rowStartIndex[outer];
+
+
+                    for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
+                        m_data.lower(innerIdx + bandIncrement) = m_data.lower(innerIdx);
+                    }
+
+                    for (Index innerIdx = rows(); innerIdx > outer; innerIdx--) {
+                        m_rowStartIndex[innerIdx] += bandIncrement;
+                    }
+
+                    //zeros new data
+                    memset(this->_lowerPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+                    return m_data.lower(m_rowStartIndex[outer]);
+                } else {
+                    return m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+                }
+            }
+        } else {
+            if (outer > inner) //upper matrix
+            {
+                const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+                if (outer > maxOuterIndex) //The value does not yet exist
+                {
+                    const Index previousProfile = m_data.upperProfile(inner);
+                    m_data.upperProfile(inner) = outer - inner;
+
+                    const Index bandIncrement = m_data.upperProfile(inner) - previousProfile;
+                    //shift data stored after this new one
+                    const Index stop = m_rowStartIndex[rows()];
+                    const Index start = m_rowStartIndex[inner + 1];
+
+                    for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
+                        m_data.upper(innerIdx + bandIncrement) = m_data.upper(innerIdx);
+                    }
+
+                    for (Index innerIdx = inner + 1; innerIdx < outerSize() + 1; innerIdx++) {
+                        m_rowStartIndex[innerIdx] += bandIncrement;
+                    }
+                    memset(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+                    return m_data.upper(m_rowStartIndex[inner] + m_data.upperProfile(inner));
+                } else {
+                    return m_data.upper(m_rowStartIndex[inner] + (outer - inner));
+                }
+            }
+
+            if (outer < inner) //lower matrix
+            {
+                const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+                if (inner > maxInnerIndex) //The value does not yet exist
+                {
+                    const Index previousProfile = m_data.lowerProfile(outer);
+                    m_data.lowerProfile(outer) = inner - outer;
+
+                    const Index bandIncrement = m_data.lowerProfile(outer) - previousProfile;
+                    //shift data stored after this new one
+                    const Index stop = m_colStartIndex[cols()];
+                    const Index start = m_colStartIndex[outer + 1];
+
+                    for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
+                        m_data.lower(innerIdx + bandIncrement) = m_data.lower(innerIdx);
+                    }
+
+                    for (Index innerIdx = outer + 1; innerIdx < outerSize() + 1; innerIdx++) {
+                        m_colStartIndex[innerIdx] += bandIncrement;
+                    }
+                    memset(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+                    return m_data.lower(m_colStartIndex[outer] + m_data.lowerProfile(outer));
+                } else {
+                    return m_data.lower(m_colStartIndex[outer] + (inner - outer));
+                }
+            }
+        }
+    }
+
+    /** Must be called after inserting a set of non zero entries.
+     */
+    inline void finalize() {
+        if (IsRowMajor) {
+            if (rows() > cols())
+                m_data.resize(cols(), cols(), rows(), m_colStartIndex[cols()] + 1, m_rowStartIndex[rows()] + 1);
+            else
+                m_data.resize(rows(), cols(), rows(), m_colStartIndex[cols()] + 1, m_rowStartIndex[rows()] + 1);
+
+            //            eigen_assert(rows() == cols() && "memory reorganisatrion only works with suare matrix");
+            //
+            //            Scalar* newArray = new Scalar[m_colStartIndex[cols()] + 1 + m_rowStartIndex[rows()] + 1];
+            //            Index dataIdx = 0;
+            //            for (Index row = 0; row < rows(); row++) {
+            //
+            //                const Index nbLowerElts = m_rowStartIndex[row + 1] - m_rowStartIndex[row];
+            //                //                std::cout << "nbLowerElts" << nbLowerElts << std::endl;
+            //                memcpy(newArray + dataIdx, m_data.m_lower + m_rowStartIndex[row], nbLowerElts * sizeof (Scalar));
+            //                m_rowStartIndex[row] = dataIdx;
+            //                dataIdx += nbLowerElts;
+            //
+            //                const Index nbUpperElts = m_colStartIndex[row + 1] - m_colStartIndex[row];
+            //                memcpy(newArray + dataIdx, m_data.m_upper + m_colStartIndex[row], nbUpperElts * sizeof (Scalar));
+            //                m_colStartIndex[row] = dataIdx;
+            //                dataIdx += nbUpperElts;
+            //
+            //
+            //            }
+            //            //todo : don't access m_data profile directly : add an accessor from SkylineMatrix
+            //            m_rowStartIndex[rows()] = m_rowStartIndex[rows()-1] + m_data.lowerProfile(rows()-1);
+            //            m_colStartIndex[cols()] = m_colStartIndex[cols()-1] + m_data.upperProfile(cols()-1);
+            //
+            //            delete[] m_data.m_lower;
+            //            delete[] m_data.m_upper;
+            //
+            //            m_data.m_lower = newArray;
+            //            m_data.m_upper = newArray;
+        } else {
+            if (rows() > cols())
+                m_data.resize(cols(), rows(), cols(), m_rowStartIndex[cols()] + 1, m_colStartIndex[cols()] + 1);
+            else
+                m_data.resize(rows(), rows(), cols(), m_rowStartIndex[rows()] + 1, m_colStartIndex[rows()] + 1);
+        }
+    }
+
+    inline void squeeze() {
+        finalize();
+        m_data.squeeze();
+    }
+
+    void prune(Scalar reference, RealScalar epsilon = dummy_precision<RealScalar > ()) {
+        //TODO
+    }
+
+    /** Resizes the matrix to a \a rows x \a cols matrix and initializes it to zero
+     * \sa resizeNonZeros(Index), reserve(), setZero()
+     */
+    void resize(size_t rows, size_t cols) {
+        const Index diagSize = rows > cols ? cols : rows;
+        m_innerSize = IsRowMajor ? cols : rows;
+
+        eigen_assert(rows == cols && "Skyline matrix must be square matrix");
+
+        if (diagSize % 2) { // diagSize is odd
+            const Index k = (diagSize - 1) / 2;
+
+            m_data.resize(diagSize, IsRowMajor ? cols : rows, IsRowMajor ? rows : cols,
+                    2 * k * k + k + 1,
+                    2 * k * k + k + 1);
+
+        } else // diagSize is even
+        {
+            const Index k = diagSize / 2;
+            m_data.resize(diagSize, IsRowMajor ? cols : rows, IsRowMajor ? rows : cols,
+                    2 * k * k - k + 1,
+                    2 * k * k - k + 1);
+        }
+
+        if (m_colStartIndex && m_rowStartIndex) {
+            delete[] m_colStartIndex;
+            delete[] m_rowStartIndex;
+        }
+        m_colStartIndex = new Index [cols + 1];
+        m_rowStartIndex = new Index [rows + 1];
+        m_outerSize = diagSize;
+
+        m_data.reset();
+        m_data.clear();
+
+        m_outerSize = diagSize;
+        memset(m_colStartIndex, 0, (cols + 1) * sizeof (Index));
+        memset(m_rowStartIndex, 0, (rows + 1) * sizeof (Index));
+    }
+
+    void resizeNonZeros(Index size) {
+        m_data.resize(size);
+    }
+
+    inline SkylineMatrix()
+    : m_outerSize(-1), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
+        resize(0, 0);
+    }
+
+    inline SkylineMatrix(size_t rows, size_t cols)
+    : m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
+        resize(rows, cols);
+    }
+
+    template<typename OtherDerived>
+    inline SkylineMatrix(const SkylineMatrixBase<OtherDerived>& other)
+    : m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
+        *this = other.derived();
+    }
+
+    inline SkylineMatrix(const SkylineMatrix & other)
+    : Base(), m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
+        *this = other.derived();
+    }
+
+    inline void swap(SkylineMatrix & other) {
+        //EIGEN_DBG_SKYLINE(std::cout << "SkylineMatrix:: swap\n");
+        std::swap(m_colStartIndex, other.m_colStartIndex);
+        std::swap(m_rowStartIndex, other.m_rowStartIndex);
+        std::swap(m_innerSize, other.m_innerSize);
+        std::swap(m_outerSize, other.m_outerSize);
+        m_data.swap(other.m_data);
+    }
+
+    inline SkylineMatrix & operator=(const SkylineMatrix & other) {
+        std::cout << "SkylineMatrix& operator=(const SkylineMatrix& other)\n";
+        if (other.isRValue()) {
+            swap(other.const_cast_derived());
+        } else {
+            resize(other.rows(), other.cols());
+            memcpy(m_colStartIndex, other.m_colStartIndex, (m_outerSize + 1) * sizeof (Index));
+            memcpy(m_rowStartIndex, other.m_rowStartIndex, (m_outerSize + 1) * sizeof (Index));
+            m_data = other.m_data;
+        }
+        return *this;
+    }
+
+    template<typename OtherDerived>
+            inline SkylineMatrix & operator=(const SkylineMatrixBase<OtherDerived>& other) {
+        const bool needToTranspose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
+        if (needToTranspose) {
+            //         TODO
+            //            return *this;
+        } else {
+            // there is no special optimization
+            return SkylineMatrixBase<SkylineMatrix>::operator=(other.derived());
+        }
+    }
+
+    friend std::ostream & operator <<(std::ostream & s, const SkylineMatrix & m) {
+
+        EIGEN_DBG_SKYLINE(
+        std::cout << "upper elements : " << std::endl;
+        for (Index i = 0; i < m.m_data.upperSize(); i++)
+            std::cout << m.m_data.upper(i) << "\t";
+        std::cout << std::endl;
+        std::cout << "upper profile : " << std::endl;
+        for (Index i = 0; i < m.m_data.upperProfileSize(); i++)
+            std::cout << m.m_data.upperProfile(i) << "\t";
+        std::cout << std::endl;
+        std::cout << "lower startIdx : " << std::endl;
+        for (Index i = 0; i < m.m_data.upperProfileSize(); i++)
+            std::cout << (IsRowMajor ? m.m_colStartIndex[i] : m.m_rowStartIndex[i]) << "\t";
+        std::cout << std::endl;
+
+
+        std::cout << "lower elements : " << std::endl;
+        for (Index i = 0; i < m.m_data.lowerSize(); i++)
+            std::cout << m.m_data.lower(i) << "\t";
+        std::cout << std::endl;
+        std::cout << "lower profile : " << std::endl;
+        for (Index i = 0; i < m.m_data.lowerProfileSize(); i++)
+            std::cout << m.m_data.lowerProfile(i) << "\t";
+        std::cout << std::endl;
+        std::cout << "lower startIdx : " << std::endl;
+        for (Index i = 0; i < m.m_data.lowerProfileSize(); i++)
+            std::cout << (IsRowMajor ? m.m_rowStartIndex[i] : m.m_colStartIndex[i]) << "\t";
+        std::cout << std::endl;
+        );
+        for (Index rowIdx = 0; rowIdx < m.rows(); rowIdx++) {
+            for (Index colIdx = 0; colIdx < m.cols(); colIdx++) {
+                s << m.coeff(rowIdx, colIdx) << "\t";
+            }
+            s << std::endl;
+        }
+        return s;
+    }
+
+    /** Destructor */
+    inline ~SkylineMatrix() {
+        delete[] m_colStartIndex;
+        delete[] m_rowStartIndex;
+    }
+
+    /** Overloaded for performance */
+    Scalar sum() const;
+};
+
+template<typename Scalar, int _Options>
+class SkylineMatrix<Scalar, _Options>::InnerUpperIterator {
+public:
+
+    InnerUpperIterator(const SkylineMatrix& mat, Index outer)
+    : m_matrix(mat), m_outer(outer),
+    m_id(_Options == RowMajor ? mat.m_colStartIndex[outer] : mat.m_rowStartIndex[outer] + 1),
+    m_start(m_id),
+    m_end(_Options == RowMajor ? mat.m_colStartIndex[outer + 1] : mat.m_rowStartIndex[outer + 1] + 1) {
+    }
+
+    inline InnerUpperIterator & operator++() {
+        m_id++;
+        return *this;
+    }
+
+    inline InnerUpperIterator & operator+=(Index shift) {
+        m_id += shift;
+        return *this;
+    }
+
+    inline Scalar value() const {
+        return m_matrix.m_data.upper(m_id);
+    }
+
+    inline Scalar* valuePtr() {
+        return const_cast<Scalar*> (&(m_matrix.m_data.upper(m_id)));
+    }
+
+    inline Scalar& valueRef() {
+        return const_cast<Scalar&> (m_matrix.m_data.upper(m_id));
+    }
+
+    inline Index index() const {
+        return IsRowMajor ? m_outer - m_matrix.m_data.upperProfile(m_outer) + (m_id - m_start) :
+                m_outer + (m_id - m_start) + 1;
+    }
+
+    inline Index row() const {
+        return IsRowMajor ? index() : m_outer;
+    }
+
+    inline Index col() const {
+        return IsRowMajor ? m_outer : index();
+    }
+
+    inline size_t size() const {
+        return m_matrix.m_data.upperProfile(m_outer);
+    }
+
+    inline operator bool() const {
+        return (m_id < m_end) && (m_id >= m_start);
+    }
+
+protected:
+    const SkylineMatrix& m_matrix;
+    const Index m_outer;
+    Index m_id;
+    const Index m_start;
+    const Index m_end;
+};
+
+template<typename Scalar, int _Options>
+class SkylineMatrix<Scalar, _Options>::InnerLowerIterator {
+public:
+
+    InnerLowerIterator(const SkylineMatrix& mat, Index outer)
+    : m_matrix(mat),
+    m_outer(outer),
+    m_id(_Options == RowMajor ? mat.m_rowStartIndex[outer] : mat.m_colStartIndex[outer] + 1),
+    m_start(m_id),
+    m_end(_Options == RowMajor ? mat.m_rowStartIndex[outer + 1] : mat.m_colStartIndex[outer + 1] + 1) {
+    }
+
+    inline InnerLowerIterator & operator++() {
+        m_id++;
+        return *this;
+    }
+
+    inline InnerLowerIterator & operator+=(Index shift) {
+        m_id += shift;
+        return *this;
+    }
+
+    inline Scalar value() const {
+        return m_matrix.m_data.lower(m_id);
+    }
+
+    inline Scalar* valuePtr() {
+        return const_cast<Scalar*> (&(m_matrix.m_data.lower(m_id)));
+    }
+
+    inline Scalar& valueRef() {
+        return const_cast<Scalar&> (m_matrix.m_data.lower(m_id));
+    }
+
+    inline Index index() const {
+        return IsRowMajor ? m_outer - m_matrix.m_data.lowerProfile(m_outer) + (m_id - m_start) :
+                m_outer + (m_id - m_start) + 1;
+        ;
+    }
+
+    inline Index row() const {
+        return IsRowMajor ? m_outer : index();
+    }
+
+    inline Index col() const {
+        return IsRowMajor ? index() : m_outer;
+    }
+
+    inline size_t size() const {
+        return m_matrix.m_data.lowerProfile(m_outer);
+    }
+
+    inline operator bool() const {
+        return (m_id < m_end) && (m_id >= m_start);
+    }
+
+protected:
+    const SkylineMatrix& m_matrix;
+    const Index m_outer;
+    Index m_id;
+    const Index m_start;
+    const Index m_end;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SkylineMatrix_H
diff --git a/include/eigen3/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h b/include/eigen3/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
new file mode 100644
index 0000000..b3a2372
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
@@ -0,0 +1,212 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin at cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEMATRIXBASE_H
+#define EIGEN_SKYLINEMATRIXBASE_H
+
+#include "SkylineUtil.h"
+
+namespace Eigen { 
+
+/** \ingroup Skyline_Module
+ *
+ * \class SkylineMatrixBase
+ *
+ * \brief Base class of any skyline matrices or skyline expressions
+ *
+ * \param Derived
+ *
+ */
+template<typename Derived> class SkylineMatrixBase : public EigenBase<Derived> {
+public:
+
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::index<StorageKind>::type Index;
+
+    enum {
+        RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+        /**< The number of rows at compile-time. This is just a copy of the value provided
+         * by the \a Derived type. If a value is not known at compile-time,
+         * it is set to the \a Dynamic constant.
+         * \sa MatrixBase::rows(), MatrixBase::cols(), ColsAtCompileTime, SizeAtCompileTime */
+
+        ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+        /**< The number of columns at compile-time. This is just a copy of the value provided
+         * by the \a Derived type. If a value is not known at compile-time,
+         * it is set to the \a Dynamic constant.
+         * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */
+
+
+        SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+        internal::traits<Derived>::ColsAtCompileTime>::ret),
+        /**< This is equal to the number of coefficients, i.e. the number of
+         * rows times the number of columns, or to \a Dynamic if this is not
+         * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
+
+        MaxRowsAtCompileTime = RowsAtCompileTime,
+        MaxColsAtCompileTime = ColsAtCompileTime,
+
+        MaxSizeAtCompileTime = (internal::size_at_compile_time<MaxRowsAtCompileTime,
+        MaxColsAtCompileTime>::ret),
+
+        IsVectorAtCompileTime = RowsAtCompileTime == 1 || ColsAtCompileTime == 1,
+        /**< This is set to true if either the number of rows or the number of
+         * columns is known at compile-time to be equal to 1. Indeed, in that case,
+         * we are dealing with a column-vector (if there is only one column) or with
+         * a row-vector (if there is only one row). */
+
+        Flags = internal::traits<Derived>::Flags,
+        /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
+         * constructed from this one. See the \ref flags "list of flags".
+         */
+
+        CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
+        /**< This is a rough measure of how expensive it is to read one coefficient from
+         * this expression.
+         */
+
+        IsRowMajor = Flags & RowMajorBit ? 1 : 0
+    };
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is the "real scalar" type; if the \a Scalar type is already real numbers
+     * (e.g. int, float or double) then \a RealScalar is just the same as \a Scalar. If
+     * \a Scalar is \a std::complex<T> then RealScalar is \a T.
+     *
+     * \sa class NumTraits
+     */
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    /** type of the equivalent square matrix */
+    typedef Matrix<Scalar, EIGEN_SIZE_MAX(RowsAtCompileTime, ColsAtCompileTime),
+                           EIGEN_SIZE_MAX(RowsAtCompileTime, ColsAtCompileTime) > SquareMatrixType;
+
+    inline const Derived& derived() const {
+        return *static_cast<const Derived*> (this);
+    }
+
+    inline Derived& derived() {
+        return *static_cast<Derived*> (this);
+    }
+
+    inline Derived& const_cast_derived() const {
+        return *static_cast<Derived*> (const_cast<SkylineMatrixBase*> (this));
+    }
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+    /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
+    inline Index rows() const {
+        return derived().rows();
+    }
+
+    /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
+    inline Index cols() const {
+        return derived().cols();
+    }
+
+    /** \returns the number of coefficients, which is \a rows()*cols().
+     * \sa rows(), cols(), SizeAtCompileTime. */
+    inline Index size() const {
+        return rows() * cols();
+    }
+
+    /** \returns the number of nonzero coefficients which is in practice the number
+     * of stored coefficients. */
+    inline Index nonZeros() const {
+        return derived().nonZeros();
+    }
+
+    /** \returns the size of the storage major dimension,
+     * i.e., the number of columns for a columns major matrix, and the number of rows otherwise */
+    Index outerSize() const {
+        return (int(Flags) & RowMajorBit) ? this->rows() : this->cols();
+    }
+
+    /** \returns the size of the inner dimension according to the storage order,
+     * i.e., the number of rows for a columns major matrix, and the number of cols otherwise */
+    Index innerSize() const {
+        return (int(Flags) & RowMajorBit) ? this->cols() : this->rows();
+    }
+
+    bool isRValue() const {
+        return m_isRValue;
+    }
+
+    Derived& markAsRValue() {
+        m_isRValue = true;
+        return derived();
+    }
+
+    SkylineMatrixBase() : m_isRValue(false) {
+        /* TODO check flags */
+    }
+
+    inline Derived & operator=(const Derived& other) {
+        this->operator=<Derived > (other);
+        return derived();
+    }
+
+    template<typename OtherDerived>
+    inline void assignGeneric(const OtherDerived& other) {
+        derived().resize(other.rows(), other.cols());
+        for (Index row = 0; row < rows(); row++)
+            for (Index col = 0; col < cols(); col++) {
+                if (other.coeff(row, col) != Scalar(0))
+                    derived().insert(row, col) = other.coeff(row, col);
+            }
+        derived().finalize();
+    }
+
+    template<typename OtherDerived>
+            inline Derived & operator=(const SkylineMatrixBase<OtherDerived>& other) {
+        //TODO
+    }
+
+    template<typename Lhs, typename Rhs>
+            inline Derived & operator=(const SkylineProduct<Lhs, Rhs, SkylineTimeSkylineProduct>& product);
+
+    friend std::ostream & operator <<(std::ostream & s, const SkylineMatrixBase& m) {
+        s << m.derived();
+        return s;
+    }
+
+    template<typename OtherDerived>
+    const typename SkylineProductReturnType<Derived, OtherDerived>::Type
+    operator*(const MatrixBase<OtherDerived> &other) const;
+
+    /** \internal use operator= */
+    template<typename DenseDerived>
+    void evalTo(MatrixBase<DenseDerived>& dst) const {
+        dst.setZero();
+        for (Index i = 0; i < rows(); i++)
+            for (Index j = 0; j < rows(); j++)
+                dst(i, j) = derived().coeff(i, j);
+    }
+
+    Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime> toDense() const {
+        return derived();
+    }
+
+    /** \returns the matrix or vector obtained by evaluating this expression.
+     *
+     * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
+     * a const reference, in order to avoid a useless copy.
+     */
+    EIGEN_STRONG_INLINE const typename internal::eval<Derived, IsSkyline>::type eval() const {
+        return typename internal::eval<Derived>::type(derived());
+    }
+
+protected:
+    bool m_isRValue;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SkylineMatrixBase_H
diff --git a/include/eigen3/unsupported/Eigen/src/Skyline/SkylineProduct.h b/include/eigen3/unsupported/Eigen/src/Skyline/SkylineProduct.h
new file mode 100644
index 0000000..1ddf455
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/Skyline/SkylineProduct.h
@@ -0,0 +1,295 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin at cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEPRODUCT_H
+#define EIGEN_SKYLINEPRODUCT_H
+
+namespace Eigen { 
+
+template<typename Lhs, typename Rhs, int ProductMode>
+struct SkylineProductReturnType {
+    typedef const typename internal::nested<Lhs, Rhs::RowsAtCompileTime>::type LhsNested;
+    typedef const typename internal::nested<Rhs, Lhs::RowsAtCompileTime>::type RhsNested;
+
+    typedef SkylineProduct<LhsNested, RhsNested, ProductMode> Type;
+};
+
+template<typename LhsNested, typename RhsNested, int ProductMode>
+struct internal::traits<SkylineProduct<LhsNested, RhsNested, ProductMode> > {
+    // clean the nested types:
+    typedef typename internal::remove_all<LhsNested>::type _LhsNested;
+    typedef typename internal::remove_all<RhsNested>::type _RhsNested;
+    typedef typename _LhsNested::Scalar Scalar;
+
+    enum {
+        LhsCoeffReadCost = _LhsNested::CoeffReadCost,
+        RhsCoeffReadCost = _RhsNested::CoeffReadCost,
+        LhsFlags = _LhsNested::Flags,
+        RhsFlags = _RhsNested::Flags,
+
+        RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
+        ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
+        InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
+
+        MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
+        MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
+
+        EvalToRowMajor = (RhsFlags & LhsFlags & RowMajorBit),
+        ResultIsSkyline = ProductMode == SkylineTimeSkylineProduct,
+
+        RemovedBits = ~((EvalToRowMajor ? 0 : RowMajorBit) | (ResultIsSkyline ? 0 : SkylineBit)),
+
+        Flags = (int(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
+        | EvalBeforeAssigningBit
+        | EvalBeforeNestingBit,
+
+        CoeffReadCost = Dynamic
+    };
+
+    typedef typename internal::conditional<ResultIsSkyline,
+            SkylineMatrixBase<SkylineProduct<LhsNested, RhsNested, ProductMode> >,
+            MatrixBase<SkylineProduct<LhsNested, RhsNested, ProductMode> > >::type Base;
+};
+
+namespace internal {
+template<typename LhsNested, typename RhsNested, int ProductMode>
+class SkylineProduct : no_assignment_operator,
+public traits<SkylineProduct<LhsNested, RhsNested, ProductMode> >::Base {
+public:
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(SkylineProduct)
+
+private:
+
+    typedef typename traits<SkylineProduct>::_LhsNested _LhsNested;
+    typedef typename traits<SkylineProduct>::_RhsNested _RhsNested;
+
+public:
+
+    template<typename Lhs, typename Rhs>
+    EIGEN_STRONG_INLINE SkylineProduct(const Lhs& lhs, const Rhs& rhs)
+    : m_lhs(lhs), m_rhs(rhs) {
+        eigen_assert(lhs.cols() == rhs.rows());
+
+        enum {
+            ProductIsValid = _LhsNested::ColsAtCompileTime == Dynamic
+            || _RhsNested::RowsAtCompileTime == Dynamic
+            || int(_LhsNested::ColsAtCompileTime) == int(_RhsNested::RowsAtCompileTime),
+            AreVectors = _LhsNested::IsVectorAtCompileTime && _RhsNested::IsVectorAtCompileTime,
+            SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(_LhsNested, _RhsNested)
+        };
+        // note to the lost user:
+        //    * for a dot product use: v1.dot(v2)
+        //    * for a coeff-wise product use: v1.cwise()*v2
+        EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
+                INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
+                EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
+                INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
+                EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
+    }
+
+    EIGEN_STRONG_INLINE Index rows() const {
+        return m_lhs.rows();
+    }
+
+    EIGEN_STRONG_INLINE Index cols() const {
+        return m_rhs.cols();
+    }
+
+    EIGEN_STRONG_INLINE const _LhsNested& lhs() const {
+        return m_lhs;
+    }
+
+    EIGEN_STRONG_INLINE const _RhsNested& rhs() const {
+        return m_rhs;
+    }
+
+protected:
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+};
+
+// dense = skyline * dense
+// Note that here we force no inlining and separate the setZero() because GCC messes up otherwise
+
+template<typename Lhs, typename Rhs, typename Dest>
+EIGEN_DONT_INLINE void skyline_row_major_time_dense_product(const Lhs& lhs, const Rhs& rhs, Dest& dst) {
+    typedef typename remove_all<Lhs>::type _Lhs;
+    typedef typename remove_all<Rhs>::type _Rhs;
+    typedef typename traits<Lhs>::Scalar Scalar;
+
+    enum {
+        LhsIsRowMajor = (_Lhs::Flags & RowMajorBit) == RowMajorBit,
+        LhsIsSelfAdjoint = (_Lhs::Flags & SelfAdjointBit) == SelfAdjointBit,
+        ProcessFirstHalf = LhsIsSelfAdjoint
+        && (((_Lhs::Flags & (UpperTriangularBit | LowerTriangularBit)) == 0)
+        || ((_Lhs::Flags & UpperTriangularBit) && !LhsIsRowMajor)
+        || ((_Lhs::Flags & LowerTriangularBit) && LhsIsRowMajor)),
+        ProcessSecondHalf = LhsIsSelfAdjoint && (!ProcessFirstHalf)
+    };
+
+    //Use matrix diagonal part <- Improvement : use inner iterator on dense matrix.
+    for (Index col = 0; col < rhs.cols(); col++) {
+        for (Index row = 0; row < lhs.rows(); row++) {
+            dst(row, col) = lhs.coeffDiag(row) * rhs(row, col);
+        }
+    }
+    //Use matrix lower triangular part
+    for (Index row = 0; row < lhs.rows(); row++) {
+        typename _Lhs::InnerLowerIterator lIt(lhs, row);
+        const Index stop = lIt.col() + lIt.size();
+        for (Index col = 0; col < rhs.cols(); col++) {
+
+            Index k = lIt.col();
+            Scalar tmp = 0;
+            while (k < stop) {
+                tmp +=
+                        lIt.value() *
+                        rhs(k++, col);
+                ++lIt;
+            }
+            dst(row, col) += tmp;
+            lIt += -lIt.size();
+        }
+
+    }
+
+    //Use matrix upper triangular part
+    for (Index lhscol = 0; lhscol < lhs.cols(); lhscol++) {
+        typename _Lhs::InnerUpperIterator uIt(lhs, lhscol);
+        const Index stop = uIt.size() + uIt.row();
+        for (Index rhscol = 0; rhscol < rhs.cols(); rhscol++) {
+
+
+            const Scalar rhsCoeff = rhs.coeff(lhscol, rhscol);
+            Index k = uIt.row();
+            while (k < stop) {
+                dst(k++, rhscol) +=
+                        uIt.value() *
+                        rhsCoeff;
+                ++uIt;
+            }
+            uIt += -uIt.size();
+        }
+    }
+
+}
+
+template<typename Lhs, typename Rhs, typename Dest>
+EIGEN_DONT_INLINE void skyline_col_major_time_dense_product(const Lhs& lhs, const Rhs& rhs, Dest& dst) {
+    typedef typename remove_all<Lhs>::type _Lhs;
+    typedef typename remove_all<Rhs>::type _Rhs;
+    typedef typename traits<Lhs>::Scalar Scalar;
+
+    enum {
+        LhsIsRowMajor = (_Lhs::Flags & RowMajorBit) == RowMajorBit,
+        LhsIsSelfAdjoint = (_Lhs::Flags & SelfAdjointBit) == SelfAdjointBit,
+        ProcessFirstHalf = LhsIsSelfAdjoint
+        && (((_Lhs::Flags & (UpperTriangularBit | LowerTriangularBit)) == 0)
+        || ((_Lhs::Flags & UpperTriangularBit) && !LhsIsRowMajor)
+        || ((_Lhs::Flags & LowerTriangularBit) && LhsIsRowMajor)),
+        ProcessSecondHalf = LhsIsSelfAdjoint && (!ProcessFirstHalf)
+    };
+
+    //Use matrix diagonal part <- Improvement : use inner iterator on dense matrix.
+    for (Index col = 0; col < rhs.cols(); col++) {
+        for (Index row = 0; row < lhs.rows(); row++) {
+            dst(row, col) = lhs.coeffDiag(row) * rhs(row, col);
+        }
+    }
+
+    //Use matrix upper triangular part
+    for (Index row = 0; row < lhs.rows(); row++) {
+        typename _Lhs::InnerUpperIterator uIt(lhs, row);
+        const Index stop = uIt.col() + uIt.size();
+        for (Index col = 0; col < rhs.cols(); col++) {
+
+            Index k = uIt.col();
+            Scalar tmp = 0;
+            while (k < stop) {
+                tmp +=
+                        uIt.value() *
+                        rhs(k++, col);
+                ++uIt;
+            }
+
+
+            dst(row, col) += tmp;
+            uIt += -uIt.size();
+        }
+    }
+
+    //Use matrix lower triangular part
+    for (Index lhscol = 0; lhscol < lhs.cols(); lhscol++) {
+        typename _Lhs::InnerLowerIterator lIt(lhs, lhscol);
+        const Index stop = lIt.size() + lIt.row();
+        for (Index rhscol = 0; rhscol < rhs.cols(); rhscol++) {
+
+            const Scalar rhsCoeff = rhs.coeff(lhscol, rhscol);
+            Index k = lIt.row();
+            while (k < stop) {
+                dst(k++, rhscol) +=
+                        lIt.value() *
+                        rhsCoeff;
+                ++lIt;
+            }
+            lIt += -lIt.size();
+        }
+    }
+
+}
+
+template<typename Lhs, typename Rhs, typename ResultType,
+        int LhsStorageOrder = traits<Lhs>::Flags&RowMajorBit>
+        struct skyline_product_selector;
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct skyline_product_selector<Lhs, Rhs, ResultType, RowMajor> {
+    typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
+
+    static void run(const Lhs& lhs, const Rhs& rhs, ResultType & res) {
+        skyline_row_major_time_dense_product<Lhs, Rhs, ResultType > (lhs, rhs, res);
+    }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct skyline_product_selector<Lhs, Rhs, ResultType, ColMajor> {
+    typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
+
+    static void run(const Lhs& lhs, const Rhs& rhs, ResultType & res) {
+        skyline_col_major_time_dense_product<Lhs, Rhs, ResultType > (lhs, rhs, res);
+    }
+};
+
+} // end namespace internal
+
+// template<typename Derived>
+// template<typename Lhs, typename Rhs >
+// Derived & MatrixBase<Derived>::lazyAssign(const SkylineProduct<Lhs, Rhs, SkylineTimeDenseProduct>& product) {
+//     typedef typename internal::remove_all<Lhs>::type _Lhs;
+//     internal::skyline_product_selector<typename internal::remove_all<Lhs>::type,
+//             typename internal::remove_all<Rhs>::type,
+//             Derived>::run(product.lhs(), product.rhs(), derived());
+// 
+//     return derived();
+// }
+
+// skyline * dense
+
+template<typename Derived>
+template<typename OtherDerived >
+EIGEN_STRONG_INLINE const typename SkylineProductReturnType<Derived, OtherDerived>::Type
+SkylineMatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const {
+
+    return typename SkylineProductReturnType<Derived, OtherDerived>::Type(derived(), other.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINEPRODUCT_H
diff --git a/include/eigen3/unsupported/Eigen/src/Skyline/SkylineStorage.h b/include/eigen3/unsupported/Eigen/src/Skyline/SkylineStorage.h
new file mode 100644
index 0000000..378a8de
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/Skyline/SkylineStorage.h
@@ -0,0 +1,259 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin at cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINE_STORAGE_H
+#define EIGEN_SKYLINE_STORAGE_H
+
+namespace Eigen { 
+
+/** Stores a skyline set of values in three structures :
+ * The diagonal elements
+ * The upper elements
+ * The lower elements
+ *
+ */
+template<typename Scalar>
+class SkylineStorage {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef SparseIndex Index;
+public:
+
+    SkylineStorage()
+    : m_diag(0),
+    m_lower(0),
+    m_upper(0),
+    m_lowerProfile(0),
+    m_upperProfile(0),
+    m_diagSize(0),
+    m_upperSize(0),
+    m_lowerSize(0),
+    m_upperProfileSize(0),
+    m_lowerProfileSize(0),
+    m_allocatedSize(0) {
+    }
+
+    SkylineStorage(const SkylineStorage& other)
+    : m_diag(0),
+    m_lower(0),
+    m_upper(0),
+    m_lowerProfile(0),
+    m_upperProfile(0),
+    m_diagSize(0),
+    m_upperSize(0),
+    m_lowerSize(0),
+    m_upperProfileSize(0),
+    m_lowerProfileSize(0),
+    m_allocatedSize(0) {
+        *this = other;
+    }
+
+    SkylineStorage & operator=(const SkylineStorage& other) {
+        resize(other.diagSize(), other.m_upperProfileSize, other.m_lowerProfileSize, other.upperSize(), other.lowerSize());
+        memcpy(m_diag, other.m_diag, m_diagSize * sizeof (Scalar));
+        memcpy(m_upper, other.m_upper, other.upperSize() * sizeof (Scalar));
+        memcpy(m_lower, other.m_lower, other.lowerSize() * sizeof (Scalar));
+        memcpy(m_upperProfile, other.m_upperProfile, m_upperProfileSize * sizeof (Index));
+        memcpy(m_lowerProfile, other.m_lowerProfile, m_lowerProfileSize * sizeof (Index));
+        return *this;
+    }
+
+    void swap(SkylineStorage& other) {
+        std::swap(m_diag, other.m_diag);
+        std::swap(m_upper, other.m_upper);
+        std::swap(m_lower, other.m_lower);
+        std::swap(m_upperProfile, other.m_upperProfile);
+        std::swap(m_lowerProfile, other.m_lowerProfile);
+        std::swap(m_diagSize, other.m_diagSize);
+        std::swap(m_upperSize, other.m_upperSize);
+        std::swap(m_lowerSize, other.m_lowerSize);
+        std::swap(m_allocatedSize, other.m_allocatedSize);
+    }
+
+    ~SkylineStorage() {
+        delete[] m_diag;
+        delete[] m_upper;
+        if (m_upper != m_lower)
+            delete[] m_lower;
+        delete[] m_upperProfile;
+        delete[] m_lowerProfile;
+    }
+
+    void reserve(Index size, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize) {
+        Index newAllocatedSize = size + upperSize + lowerSize;
+        if (newAllocatedSize > m_allocatedSize)
+            reallocate(size, upperProfileSize, lowerProfileSize, upperSize, lowerSize);
+    }
+
+    void squeeze() {
+        if (m_allocatedSize > m_diagSize + m_upperSize + m_lowerSize)
+            reallocate(m_diagSize, m_upperProfileSize, m_lowerProfileSize, m_upperSize, m_lowerSize);
+    }
+
+    void resize(Index diagSize, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize, float reserveSizeFactor = 0) {
+        if (m_allocatedSize < diagSize + upperSize + lowerSize)
+            reallocate(diagSize, upperProfileSize, lowerProfileSize, upperSize + Index(reserveSizeFactor * upperSize), lowerSize + Index(reserveSizeFactor * lowerSize));
+        m_diagSize = diagSize;
+        m_upperSize = upperSize;
+        m_lowerSize = lowerSize;
+        m_upperProfileSize = upperProfileSize;
+        m_lowerProfileSize = lowerProfileSize;
+    }
+
+    inline Index diagSize() const {
+        return m_diagSize;
+    }
+
+    inline Index upperSize() const {
+        return m_upperSize;
+    }
+
+    inline Index lowerSize() const {
+        return m_lowerSize;
+    }
+
+    inline Index upperProfileSize() const {
+        return m_upperProfileSize;
+    }
+
+    inline Index lowerProfileSize() const {
+        return m_lowerProfileSize;
+    }
+
+    inline Index allocatedSize() const {
+        return m_allocatedSize;
+    }
+
+    inline void clear() {
+        m_diagSize = 0;
+    }
+
+    inline Scalar& diag(Index i) {
+        return m_diag[i];
+    }
+
+    inline const Scalar& diag(Index i) const {
+        return m_diag[i];
+    }
+
+    inline Scalar& upper(Index i) {
+        return m_upper[i];
+    }
+
+    inline const Scalar& upper(Index i) const {
+        return m_upper[i];
+    }
+
+    inline Scalar& lower(Index i) {
+        return m_lower[i];
+    }
+
+    inline const Scalar& lower(Index i) const {
+        return m_lower[i];
+    }
+
+    inline Index& upperProfile(Index i) {
+        return m_upperProfile[i];
+    }
+
+    inline const Index& upperProfile(Index i) const {
+        return m_upperProfile[i];
+    }
+
+    inline Index& lowerProfile(Index i) {
+        return m_lowerProfile[i];
+    }
+
+    inline const Index& lowerProfile(Index i) const {
+        return m_lowerProfile[i];
+    }
+
+    static SkylineStorage Map(Index* upperProfile, Index* lowerProfile, Scalar* diag, Scalar* upper, Scalar* lower, Index size, Index upperSize, Index lowerSize) {
+        SkylineStorage res;
+        res.m_upperProfile = upperProfile;
+        res.m_lowerProfile = lowerProfile;
+        res.m_diag = diag;
+        res.m_upper = upper;
+        res.m_lower = lower;
+        res.m_allocatedSize = res.m_diagSize = size;
+        res.m_upperSize = upperSize;
+        res.m_lowerSize = lowerSize;
+        return res;
+    }
+
+    inline void reset() {
+        memset(m_diag, 0, m_diagSize * sizeof (Scalar));
+        memset(m_upper, 0, m_upperSize * sizeof (Scalar));
+        memset(m_lower, 0, m_lowerSize * sizeof (Scalar));
+        memset(m_upperProfile, 0, m_diagSize * sizeof (Index));
+        memset(m_lowerProfile, 0, m_diagSize * sizeof (Index));
+    }
+
+    void prune(Scalar reference, RealScalar epsilon = dummy_precision<RealScalar>()) {
+        //TODO
+    }
+
+protected:
+
+    inline void reallocate(Index diagSize, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize) {
+
+        Scalar* diag = new Scalar[diagSize];
+        Scalar* upper = new Scalar[upperSize];
+        Scalar* lower = new Scalar[lowerSize];
+        Index* upperProfile = new Index[upperProfileSize];
+        Index* lowerProfile = new Index[lowerProfileSize];
+
+        Index copyDiagSize = (std::min)(diagSize, m_diagSize);
+        Index copyUpperSize = (std::min)(upperSize, m_upperSize);
+        Index copyLowerSize = (std::min)(lowerSize, m_lowerSize);
+        Index copyUpperProfileSize = (std::min)(upperProfileSize, m_upperProfileSize);
+        Index copyLowerProfileSize = (std::min)(lowerProfileSize, m_lowerProfileSize);
+
+        // copy
+        memcpy(diag, m_diag, copyDiagSize * sizeof (Scalar));
+        memcpy(upper, m_upper, copyUpperSize * sizeof (Scalar));
+        memcpy(lower, m_lower, copyLowerSize * sizeof (Scalar));
+        memcpy(upperProfile, m_upperProfile, copyUpperProfileSize * sizeof (Index));
+        memcpy(lowerProfile, m_lowerProfile, copyLowerProfileSize * sizeof (Index));
+
+
+
+        // delete old stuff
+        delete[] m_diag;
+        delete[] m_upper;
+        delete[] m_lower;
+        delete[] m_upperProfile;
+        delete[] m_lowerProfile;
+        m_diag = diag;
+        m_upper = upper;
+        m_lower = lower;
+        m_upperProfile = upperProfile;
+        m_lowerProfile = lowerProfile;
+        m_allocatedSize = diagSize + upperSize + lowerSize;
+        m_upperSize = upperSize;
+        m_lowerSize = lowerSize;
+    }
+
+public:
+    Scalar* m_diag;
+    Scalar* m_upper;
+    Scalar* m_lower;
+    Index* m_upperProfile;
+    Index* m_lowerProfile;
+    Index m_diagSize;
+    Index m_upperSize;
+    Index m_lowerSize;
+    Index m_upperProfileSize;
+    Index m_lowerProfileSize;
+    Index m_allocatedSize;
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPRESSED_STORAGE_H
diff --git a/include/eigen3/unsupported/Eigen/src/Skyline/SkylineUtil.h b/include/eigen3/unsupported/Eigen/src/Skyline/SkylineUtil.h
new file mode 100644
index 0000000..75eb612
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/Skyline/SkylineUtil.h
@@ -0,0 +1,89 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Guillaume Saupin <guillaume.saupin at cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEUTIL_H
+#define EIGEN_SKYLINEUTIL_H
+
+namespace Eigen { 
+
+#ifdef NDEBUG
+#define EIGEN_DBG_SKYLINE(X)
+#else
+#define EIGEN_DBG_SKYLINE(X) X
+#endif
+
+const unsigned int SkylineBit = 0x1200;
+template<typename Lhs, typename Rhs, int ProductMode> class SkylineProduct;
+enum AdditionalProductEvaluationMode {SkylineTimeDenseProduct, SkylineTimeSkylineProduct, DenseTimeSkylineProduct};
+enum {IsSkyline = SkylineBit};
+
+
+#define EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, Op) \
+template<typename OtherDerived> \
+EIGEN_STRONG_INLINE Derived& operator Op(const Eigen::SkylineMatrixBase<OtherDerived>& other) \
+{ \
+  return Base::operator Op(other.derived()); \
+} \
+EIGEN_STRONG_INLINE Derived& operator Op(const Derived& other) \
+{ \
+  return Base::operator Op(other); \
+}
+
+#define EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, Op) \
+template<typename Other> \
+EIGEN_STRONG_INLINE Derived& operator Op(const Other& scalar) \
+{ \
+  return Base::operator Op(scalar); \
+}
+
+#define EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
+  EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =) \
+  EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, +=) \
+  EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \
+  EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \
+  EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
+
+#define _EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived, BaseClass) \
+  typedef BaseClass Base; \
+  typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; \
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \
+  typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
+  typedef typename Eigen::internal::index<StorageKind>::type Index; \
+  enum {  Flags = Eigen::internal::traits<Derived>::Flags, };
+
+#define EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived) \
+  _EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived, Eigen::SkylineMatrixBase<Derived>)
+
+template<typename Derived> class SkylineMatrixBase;
+template<typename _Scalar, int _Flags = 0> class SkylineMatrix;
+template<typename _Scalar, int _Flags = 0> class DynamicSkylineMatrix;
+template<typename _Scalar, int _Flags = 0> class SkylineVector;
+template<typename _Scalar, int _Flags = 0> class MappedSkylineMatrix;
+
+namespace internal {
+
+template<typename Lhs, typename Rhs> struct skyline_product_mode;
+template<typename Lhs, typename Rhs, int ProductMode = skyline_product_mode<Lhs,Rhs>::value> struct SkylineProductReturnType;
+
+template<typename T> class eval<T,IsSkyline>
+{
+    typedef typename traits<T>::Scalar _Scalar;
+    enum {
+          _Flags = traits<T>::Flags
+    };
+
+  public:
+    typedef SkylineMatrix<_Scalar, _Flags> type;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINEUTIL_H
diff --git a/include/eigen3/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h b/include/eigen3/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h
new file mode 100644
index 0000000..e9ec746
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h
@@ -0,0 +1,122 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H
+#define EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H
+
+namespace Eigen { 
+
+#if 0
+
+// NOTE Have to be reimplemented as a specialization of BlockImpl< DynamicSparseMatrix<_Scalar, _Options, _Index>, ... >
+// See SparseBlock.h for an example
+
+
+/***************************************************************************
+* specialisation for DynamicSparseMatrix
+***************************************************************************/
+
+template<typename _Scalar, int _Options, typename _Index, int Size>
+class SparseInnerVectorSet<DynamicSparseMatrix<_Scalar, _Options, _Index>, Size>
+  : public SparseMatrixBase<SparseInnerVectorSet<DynamicSparseMatrix<_Scalar, _Options, _Index>, Size> >
+{
+    typedef DynamicSparseMatrix<_Scalar, _Options, _Index> MatrixType;
+  public:
+
+    enum { IsRowMajor = internal::traits<SparseInnerVectorSet>::IsRowMajor };
+
+    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseInnerVectorSet)
+    class InnerIterator: public MatrixType::InnerIterator
+    {
+      public:
+        inline InnerIterator(const SparseInnerVectorSet& xpr, Index outer)
+          : MatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
+        {}
+        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
+        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
+      protected:
+        Index m_outer;
+    };
+
+    inline SparseInnerVectorSet(const MatrixType& matrix, Index outerStart, Index outerSize)
+      : m_matrix(matrix), m_outerStart(outerStart), m_outerSize(outerSize)
+    {
+      eigen_assert( (outerStart>=0) && ((outerStart+outerSize)<=matrix.outerSize()) );
+    }
+
+    inline SparseInnerVectorSet(const MatrixType& matrix, Index outer)
+      : m_matrix(matrix), m_outerStart(outer), m_outerSize(Size)
+    {
+      eigen_assert(Size!=Dynamic);
+      eigen_assert( (outer>=0) && (outer<matrix.outerSize()) );
+    }
+
+    template<typename OtherDerived>
+    inline SparseInnerVectorSet& operator=(const SparseMatrixBase<OtherDerived>& other)
+    {
+      if (IsRowMajor != ((OtherDerived::Flags&RowMajorBit)==RowMajorBit))
+      {
+        // need to transpose => perform a block evaluation followed by a big swap
+        DynamicSparseMatrix<Scalar,IsRowMajor?RowMajorBit:0> aux(other);
+        *this = aux.markAsRValue();
+      }
+      else
+      {
+        // evaluate/copy vector per vector
+        for (Index j=0; j<m_outerSize.value(); ++j)
+        {
+          SparseVector<Scalar,IsRowMajor ? RowMajorBit : 0> aux(other.innerVector(j));
+          m_matrix.const_cast_derived()._data()[m_outerStart+j].swap(aux._data());
+        }
+      }
+      return *this;
+    }
+
+    inline SparseInnerVectorSet& operator=(const SparseInnerVectorSet& other)
+    {
+      return operator=<SparseInnerVectorSet>(other);
+    }
+
+    Index nonZeros() const
+    {
+      Index count = 0;
+      for (Index j=0; j<m_outerSize.value(); ++j)
+        count += m_matrix._data()[m_outerStart+j].size();
+      return count;
+    }
+
+    const Scalar& lastCoeff() const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(SparseInnerVectorSet);
+      eigen_assert(m_matrix.data()[m_outerStart].size()>0);
+      return m_matrix.data()[m_outerStart].vale(m_matrix.data()[m_outerStart].size()-1);
+    }
+
+//     template<typename Sparse>
+//     inline SparseInnerVectorSet& operator=(const SparseMatrixBase<OtherDerived>& other)
+//     {
+//       return *this;
+//     }
+
+    EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+
+  protected:
+
+    const typename MatrixType::Nested m_matrix;
+    Index m_outerStart;
+    const internal::variable_if_dynamic<Index, Size> m_outerSize;
+
+};
+
+#endif
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H
diff --git a/include/eigen3/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h b/include/eigen3/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
new file mode 100644
index 0000000..dec16df
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
@@ -0,0 +1,357 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DYNAMIC_SPARSEMATRIX_H
+#define EIGEN_DYNAMIC_SPARSEMATRIX_H
+
+namespace Eigen { 
+
+/** \deprecated use a SparseMatrix in an uncompressed mode
+  *
+  * \class DynamicSparseMatrix
+  *
+  * \brief A sparse matrix class designed for matrix assembly purpose
+  *
+  * \param _Scalar the scalar type, i.e. the type of the coefficients
+  *
+  * Unlike SparseMatrix, this class provides a much higher degree of flexibility. In particular, it allows
+  * random read/write accesses in log(rho*outer_size) where \c rho is the probability that a coefficient is
+  * nonzero and outer_size is the number of columns if the matrix is column-major and the number of rows
+  * otherwise.
+  *
+  * Internally, the data are stored as a std::vector of compressed vector. The performances of random writes might
+  * decrease as the number of nonzeros per inner-vector increase. In practice, we observed very good performance
+  * till about 100 nonzeros/vector, and the performance remains relatively good till 500 nonzeros/vectors.
+  *
+  * \see SparseMatrix
+  */
+
+namespace internal {
+template<typename _Scalar, int _Options, typename _Index>
+struct traits<DynamicSparseMatrix<_Scalar, _Options, _Index> >
+{
+  typedef _Scalar Scalar;
+  typedef _Index Index;
+  typedef Sparse StorageKind;
+  typedef MatrixXpr XprKind;
+  enum {
+    RowsAtCompileTime = Dynamic,
+    ColsAtCompileTime = Dynamic,
+    MaxRowsAtCompileTime = Dynamic,
+    MaxColsAtCompileTime = Dynamic,
+    Flags = _Options | NestByRefBit | LvalueBit,
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    SupportedAccessPatterns = OuterRandomAccessPattern
+  };
+};
+}
+
+template<typename _Scalar, int _Options, typename _Index>
+ class  DynamicSparseMatrix
+  : public SparseMatrixBase<DynamicSparseMatrix<_Scalar, _Options, _Index> >
+{
+  public:
+    EIGEN_SPARSE_PUBLIC_INTERFACE(DynamicSparseMatrix)
+    // FIXME: why are these operator already alvailable ???
+    // EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(DynamicSparseMatrix, +=)
+    // EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(DynamicSparseMatrix, -=)
+    typedef MappedSparseMatrix<Scalar,Flags> Map;
+    using Base::IsRowMajor;
+    using Base::operator=;
+    enum {
+      Options = _Options
+    };
+
+  protected:
+
+    typedef DynamicSparseMatrix<Scalar,(Flags&~RowMajorBit)|(IsRowMajor?RowMajorBit:0)> TransposedSparseMatrix;
+
+    Index m_innerSize;
+    std::vector<internal::CompressedStorage<Scalar,Index> > m_data;
+
+  public:
+
+    inline Index rows() const { return IsRowMajor ? outerSize() : m_innerSize; }
+    inline Index cols() const { return IsRowMajor ? m_innerSize : outerSize(); }
+    inline Index innerSize() const { return m_innerSize; }
+    inline Index outerSize() const { return static_cast<Index>(m_data.size()); }
+    inline Index innerNonZeros(Index j) const { return m_data[j].size(); }
+
+    std::vector<internal::CompressedStorage<Scalar,Index> >& _data() { return m_data; }
+    const std::vector<internal::CompressedStorage<Scalar,Index> >& _data() const { return m_data; }
+
+    /** \returns the coefficient value at given position \a row, \a col
+      * This operation involes a log(rho*outer_size) binary search.
+      */
+    inline Scalar coeff(Index row, Index col) const
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+      return m_data[outer].at(inner);
+    }
+
+    /** \returns a reference to the coefficient value at given position \a row, \a col
+      * This operation involes a log(rho*outer_size) binary search. If the coefficient does not
+      * exist yet, then a sorted insertion into a sequential buffer is performed.
+      */
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+      return m_data[outer].atWithInsertion(inner);
+    }
+
+    class InnerIterator;
+    class ReverseInnerIterator;
+
+    void setZero()
+    {
+      for (Index j=0; j<outerSize(); ++j)
+        m_data[j].clear();
+    }
+
+    /** \returns the number of non zero coefficients */
+    Index nonZeros() const
+    {
+      Index res = 0;
+      for (Index j=0; j<outerSize(); ++j)
+        res += static_cast<Index>(m_data[j].size());
+      return res;
+    }
+
+
+
+    void reserve(Index reserveSize = 1000)
+    {
+      if (outerSize()>0)
+      {
+        Index reserveSizePerVector = (std::max)(reserveSize/outerSize(),Index(4));
+        for (Index j=0; j<outerSize(); ++j)
+        {
+          m_data[j].reserve(reserveSizePerVector);
+        }
+      }
+    }
+
+    /** Does nothing: provided for compatibility with SparseMatrix */
+    inline void startVec(Index /*outer*/) {}
+
+    /** \returns a reference to the non zero coefficient at position \a row, \a col assuming that:
+      * - the nonzero does not already exist
+      * - the new coefficient is the last one of the given inner vector.
+      *
+      * \sa insert, insertBackByOuterInner */
+    inline Scalar& insertBack(Index row, Index col)
+    {
+      return insertBackByOuterInner(IsRowMajor?row:col, IsRowMajor?col:row);
+    }
+
+    /** \sa insertBack */
+    inline Scalar& insertBackByOuterInner(Index outer, Index inner)
+    {
+      eigen_assert(outer<Index(m_data.size()) && inner<m_innerSize && "out of range");
+      eigen_assert(((m_data[outer].size()==0) || (m_data[outer].index(m_data[outer].size()-1)<inner))
+                && "wrong sorted insertion");
+      m_data[outer].append(0, inner);
+      return m_data[outer].value(m_data[outer].size()-1);
+    }
+
+    inline Scalar& insert(Index row, Index col)
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+
+      Index startId = 0;
+      Index id = static_cast<Index>(m_data[outer].size()) - 1;
+      m_data[outer].resize(id+2,1);
+
+      while ( (id >= startId) && (m_data[outer].index(id) > inner) )
+      {
+        m_data[outer].index(id+1) = m_data[outer].index(id);
+        m_data[outer].value(id+1) = m_data[outer].value(id);
+        --id;
+      }
+      m_data[outer].index(id+1) = inner;
+      m_data[outer].value(id+1) = 0;
+      return m_data[outer].value(id+1);
+    }
+
+    /** Does nothing: provided for compatibility with SparseMatrix */
+    inline void finalize() {}
+
+    /** Suppress all nonzeros which are smaller than \a reference under the tolerence \a epsilon */
+    void prune(Scalar reference, RealScalar epsilon = NumTraits<RealScalar>::dummy_precision())
+    {
+      for (Index j=0; j<outerSize(); ++j)
+        m_data[j].prune(reference,epsilon);
+    }
+
+    /** Resize the matrix without preserving the data (the matrix is set to zero)
+      */
+    void resize(Index rows, Index cols)
+    {
+      const Index outerSize = IsRowMajor ? rows : cols;
+      m_innerSize = IsRowMajor ? cols : rows;
+      setZero();
+      if (Index(m_data.size()) != outerSize)
+      {
+        m_data.resize(outerSize);
+      }
+    }
+
+    void resizeAndKeepData(Index rows, Index cols)
+    {
+      const Index outerSize = IsRowMajor ? rows : cols;
+      const Index innerSize = IsRowMajor ? cols : rows;
+      if (m_innerSize>innerSize)
+      {
+        // remove all coefficients with innerCoord>=innerSize
+        // TODO
+        //std::cerr << "not implemented yet\n";
+        exit(2);
+      }
+      if (m_data.size() != outerSize)
+      {
+        m_data.resize(outerSize);
+      }
+    }
+
+    /** The class DynamicSparseMatrix is deprectaed */
+    EIGEN_DEPRECATED inline DynamicSparseMatrix()
+      : m_innerSize(0), m_data(0)
+    {
+      eigen_assert(innerSize()==0 && outerSize()==0);
+    }
+
+    /** The class DynamicSparseMatrix is deprectaed */
+    EIGEN_DEPRECATED inline DynamicSparseMatrix(Index rows, Index cols)
+      : m_innerSize(0)
+    {
+      resize(rows, cols);
+    }
+
+    /** The class DynamicSparseMatrix is deprectaed */
+    template<typename OtherDerived>
+    EIGEN_DEPRECATED explicit inline DynamicSparseMatrix(const SparseMatrixBase<OtherDerived>& other)
+      : m_innerSize(0)
+    {
+    Base::operator=(other.derived());
+    }
+
+    inline DynamicSparseMatrix(const DynamicSparseMatrix& other)
+      : Base(), m_innerSize(0)
+    {
+      *this = other.derived();
+    }
+
+    inline void swap(DynamicSparseMatrix& other)
+    {
+      //EIGEN_DBG_SPARSE(std::cout << "SparseMatrix:: swap\n");
+      std::swap(m_innerSize, other.m_innerSize);
+      //std::swap(m_outerSize, other.m_outerSize);
+      m_data.swap(other.m_data);
+    }
+
+    inline DynamicSparseMatrix& operator=(const DynamicSparseMatrix& other)
+    {
+      if (other.isRValue())
+      {
+        swap(other.const_cast_derived());
+      }
+      else
+      {
+        resize(other.rows(), other.cols());
+        m_data = other.m_data;
+      }
+      return *this;
+    }
+
+    /** Destructor */
+    inline ~DynamicSparseMatrix() {}
+
+  public:
+
+    /** \deprecated
+      * Set the matrix to zero and reserve the memory for \a reserveSize nonzero coefficients. */
+    EIGEN_DEPRECATED void startFill(Index reserveSize = 1000)
+    {
+      setZero();
+      reserve(reserveSize);
+    }
+
+    /** \deprecated use insert()
+      * inserts a nonzero coefficient at given coordinates \a row, \a col and returns its reference assuming that:
+      *  1 - the coefficient does not exist yet
+      *  2 - this the coefficient with greater inner coordinate for the given outer coordinate.
+      * In other words, assuming \c *this is column-major, then there must not exists any nonzero coefficient of coordinates
+      * \c i \c x \a col such that \c i >= \a row. Otherwise the matrix is invalid.
+      *
+      * \see fillrand(), coeffRef()
+      */
+    EIGEN_DEPRECATED Scalar& fill(Index row, Index col)
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+      return insertBack(outer,inner);
+    }
+
+    /** \deprecated use insert()
+      * Like fill() but with random inner coordinates.
+      * Compared to the generic coeffRef(), the unique limitation is that we assume
+      * the coefficient does not exist yet.
+      */
+    EIGEN_DEPRECATED Scalar& fillrand(Index row, Index col)
+    {
+      return insert(row,col);
+    }
+
+    /** \deprecated use finalize()
+      * Does nothing. Provided for compatibility with SparseMatrix. */
+    EIGEN_DEPRECATED void endFill() {}
+    
+#   ifdef EIGEN_DYNAMICSPARSEMATRIX_PLUGIN
+#     include EIGEN_DYNAMICSPARSEMATRIX_PLUGIN
+#   endif
+ };
+
+template<typename Scalar, int _Options, typename _Index>
+class DynamicSparseMatrix<Scalar,_Options,_Index>::InnerIterator : public SparseVector<Scalar,_Options,_Index>::InnerIterator
+{
+    typedef typename SparseVector<Scalar,_Options,_Index>::InnerIterator Base;
+  public:
+    InnerIterator(const DynamicSparseMatrix& mat, Index outer)
+      : Base(mat.m_data[outer]), m_outer(outer)
+    {}
+
+    inline Index row() const { return IsRowMajor ? m_outer : Base::index(); }
+    inline Index col() const { return IsRowMajor ? Base::index() : m_outer; }
+
+  protected:
+    const Index m_outer;
+};
+
+template<typename Scalar, int _Options, typename _Index>
+class DynamicSparseMatrix<Scalar,_Options,_Index>::ReverseInnerIterator : public SparseVector<Scalar,_Options,_Index>::ReverseInnerIterator
+{
+    typedef typename SparseVector<Scalar,_Options,_Index>::ReverseInnerIterator Base;
+  public:
+    ReverseInnerIterator(const DynamicSparseMatrix& mat, Index outer)
+      : Base(mat.m_data[outer]), m_outer(outer)
+    {}
+
+    inline Index row() const { return IsRowMajor ? m_outer : Base::index(); }
+    inline Index col() const { return IsRowMajor ? Base::index() : m_outer; }
+
+  protected:
+    const Index m_outer;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_DYNAMIC_SPARSEMATRIX_H
diff --git a/include/eigen3/unsupported/Eigen/src/SparseExtra/MarketIO.h b/include/eigen3/unsupported/Eigen/src/SparseExtra/MarketIO.h
new file mode 100644
index 0000000..7aafce9
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -0,0 +1,273 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud at inria.fr>
+// Copyright (C) 2012 Desire NUENTSA WAKAM <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_MARKET_IO_H
+#define EIGEN_SPARSE_MARKET_IO_H
+
+#include <iostream>
+
+namespace Eigen { 
+
+namespace internal 
+{
+  template <typename Scalar>
+  inline bool GetMarketLine (std::stringstream& line, int& M, int& N, int& i, int& j, Scalar& value)
+  {
+    line >> i >> j >> value;
+    i--;
+    j--;
+    if(i>=0 && j>=0 && i<M && j<N)
+    {
+      return true; 
+    }
+    else
+      return false;
+  }
+  template <typename Scalar>
+  inline bool GetMarketLine (std::stringstream& line, int& M, int& N, int& i, int& j, std::complex<Scalar>& value)
+  {
+    Scalar valR, valI;
+    line >> i >> j >> valR >> valI;
+    i--;
+    j--;
+    if(i>=0 && j>=0 && i<M && j<N)
+    {
+      value = std::complex<Scalar>(valR, valI);
+      return true; 
+    }
+    else
+      return false;
+  }
+
+  template <typename RealScalar>
+  inline void  GetVectorElt (const std::string& line, RealScalar& val)
+  {
+    std::istringstream newline(line);
+    newline >> val;  
+  }
+
+  template <typename RealScalar>
+  inline void GetVectorElt (const std::string& line, std::complex<RealScalar>& val)
+  {
+    RealScalar valR, valI; 
+    std::istringstream newline(line);
+    newline >> valR >> valI; 
+    val = std::complex<RealScalar>(valR, valI);
+  }
+  
+  template<typename Scalar>
+  inline void putMarketHeader(std::string& header,int sym)
+  {
+    header= "%%MatrixMarket matrix coordinate ";
+    if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
+    {
+      header += " complex"; 
+      if(sym == Symmetric) header += " symmetric";
+      else if (sym == SelfAdjoint) header += " Hermitian";
+      else header += " general";
+    }
+    else
+    {
+      header += " real"; 
+      if(sym == Symmetric) header += " symmetric";
+      else header += " general";
+    }
+  }
+
+  template<typename Scalar>
+  inline void PutMatrixElt(Scalar value, int row, int col, std::ofstream& out)
+  {
+    out << row << " "<< col << " " << value << "\n";
+  }
+  template<typename Scalar>
+  inline void PutMatrixElt(std::complex<Scalar> value, int row, int col, std::ofstream& out)
+  {
+    out << row << " " << col << " " << value.real() << " " << value.imag() << "\n";
+  }
+
+
+  template<typename Scalar>
+  inline void putVectorElt(Scalar value, std::ofstream& out)
+  {
+    out << value << "\n"; 
+  }
+  template<typename Scalar>
+  inline void putVectorElt(std::complex<Scalar> value, std::ofstream& out)
+  {
+    out << value.real << " " << value.imag()<< "\n"; 
+  }
+
+} // end namepsace internal
+
+inline bool getMarketHeader(const std::string& filename, int& sym, bool& iscomplex, bool& isvector)
+{
+  sym = 0; 
+  isvector = false;
+  std::ifstream in(filename.c_str(),std::ios::in);
+  if(!in)
+    return false;
+  
+  std::string line; 
+  // The matrix header is always the first line in the file 
+  std::getline(in, line); eigen_assert(in.good());
+  
+  std::stringstream fmtline(line); 
+  std::string substr[5];
+  fmtline>> substr[0] >> substr[1] >> substr[2] >> substr[3] >> substr[4];
+  if(substr[2].compare("array") == 0) isvector = true;
+  if(substr[3].compare("complex") == 0) iscomplex = true;
+  if(substr[4].compare("symmetric") == 0) sym = Symmetric;
+  else if (substr[4].compare("Hermitian") == 0) sym = SelfAdjoint;
+  
+  return true;
+}
+  
+template<typename SparseMatrixType>
+bool loadMarket(SparseMatrixType& mat, const std::string& filename)
+{
+  typedef typename SparseMatrixType::Scalar Scalar;
+  std::ifstream input(filename.c_str(),std::ios::in);
+  if(!input)
+    return false;
+  
+  const int maxBuffersize = 2048;
+  char buffer[maxBuffersize];
+  
+  bool readsizes = false;
+
+  typedef Triplet<Scalar,int> T;
+  std::vector<T> elements;
+  
+  int M(-1), N(-1), NNZ(-1);
+  int count = 0;
+  while(input.getline(buffer, maxBuffersize))
+  {
+    // skip comments   
+    //NOTE An appropriate test should be done on the header to get the  symmetry
+    if(buffer[0]=='%')
+      continue;
+    
+    std::stringstream line(buffer);
+    
+    if(!readsizes)
+    {
+      line >> M >> N >> NNZ;
+      if(M > 0 && N > 0 && NNZ > 0) 
+      {
+        readsizes = true;
+        std::cout << "sizes: " << M << "," << N << "," << NNZ << "\n";
+        mat.resize(M,N);
+        mat.reserve(NNZ);
+      }
+    }
+    else
+    { 
+      int i(-1), j(-1);
+      Scalar value; 
+      if( internal::GetMarketLine(line, M, N, i, j, value) ) 
+      {
+        ++ count;
+        elements.push_back(T(i,j,value));
+      }
+      else 
+        std::cerr << "Invalid read: " << i << "," << j << "\n";        
+    }
+  }
+  mat.setFromTriplets(elements.begin(), elements.end());
+  if(count!=NNZ)
+    std::cerr << count << "!=" << NNZ << "\n";
+  
+  input.close();
+  return true;
+}
+
+template<typename VectorType>
+bool loadMarketVector(VectorType& vec, const std::string& filename)
+{
+   typedef typename VectorType::Scalar Scalar;
+  std::ifstream in(filename.c_str(), std::ios::in);
+  if(!in)
+    return false;
+  
+  std::string line; 
+  int n(0), col(0); 
+  do 
+  { // Skip comments
+    std::getline(in, line); eigen_assert(in.good());
+  } while (line[0] == '%');
+  std::istringstream newline(line);
+  newline  >> n >> col; 
+  eigen_assert(n>0 && col>0);
+  vec.resize(n);
+  int i = 0; 
+  Scalar value; 
+  while ( std::getline(in, line) && (i < n) ){
+    internal::GetVectorElt(line, value); 
+    vec(i++) = value; 
+  }
+  in.close();
+  if (i!=n){
+    std::cerr<< "Unable to read all elements from file " << filename << "\n";
+    return false;
+  }
+  return true;
+}
+
+template<typename SparseMatrixType>
+bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sym = 0)
+{
+  typedef typename SparseMatrixType::Scalar Scalar;
+  std::ofstream out(filename.c_str(),std::ios::out);
+  if(!out)
+    return false;
+  
+  out.flags(std::ios_base::scientific);
+  out.precision(64);
+  std::string header; 
+  internal::putMarketHeader<Scalar>(header, sym); 
+  out << header << std::endl; 
+  out << mat.rows() << " " << mat.cols() << " " << mat.nonZeros() << "\n";
+  int count = 0;
+  for(int j=0; j<mat.outerSize(); ++j)
+    for(typename SparseMatrixType::InnerIterator it(mat,j); it; ++it)
+    {
+	++ count;
+	internal::PutMatrixElt(it.value(), it.row()+1, it.col()+1, out);
+	// out << it.row()+1 << " " << it.col()+1 << " " << it.value() << "\n";
+    }
+  out.close();
+  return true;
+}
+
+template<typename VectorType>
+bool saveMarketVector (const VectorType& vec, const std::string& filename)
+{
+ typedef typename VectorType::Scalar Scalar; 
+ std::ofstream out(filename.c_str(),std::ios::out);
+  if(!out)
+    return false;
+  
+  out.flags(std::ios_base::scientific);
+  out.precision(64);
+  if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
+      out << "%%MatrixMarket matrix array complex general\n"; 
+  else
+    out << "%%MatrixMarket matrix array real general\n"; 
+  out << vec.size() << " "<< 1 << "\n";
+  for (int i=0; i < vec.size(); i++){
+    internal::putVectorElt(vec(i), out); 
+  }
+  out.close();
+  return true; 
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_MARKET_IO_H
diff --git a/include/eigen3/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h b/include/eigen3/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
new file mode 100644
index 0000000..bf13cf2
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
@@ -0,0 +1,232 @@
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Desire NUENTSA WAKAM <desire.nuentsa_wakam at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BROWSE_MATRICES_H
+#define EIGEN_BROWSE_MATRICES_H
+
+namespace Eigen {
+
+enum {
+  SPD = 0x100,
+  NonSymmetric = 0x0
+}; 
+
+/** 
+ * @brief Iterator to browse matrices from a specified folder
+ * 
+ * This is used to load all the matrices from a folder. 
+ * The matrices should be in Matrix Market format
+ * It is assumed that the matrices are named as matname.mtx
+ * and matname_SPD.mtx if the matrix is Symmetric and positive definite (or Hermitian)
+ * The right hand side vectors are loaded as well, if they exist.
+ * They should be named as matname_b.mtx. 
+ * Note that the right hand side for a SPD matrix is named as matname_SPD_b.mtx
+ * 
+ * Sometimes a reference solution is available. In this case, it should be named as matname_x.mtx
+ * 
+ * Sample code
+ * \code
+ * 
+ * \endcode
+ * 
+ * \tparam Scalar The scalar type 
+ */
+template <typename Scalar>
+class MatrixMarketIterator 
+{
+  public:
+    typedef Matrix<Scalar,Dynamic,1> VectorType; 
+    typedef SparseMatrix<Scalar,ColMajor> MatrixType; 
+  
+  public:
+    MatrixMarketIterator(const std::string folder):m_sym(0),m_isvalid(false),m_matIsLoaded(false),m_hasRhs(false),m_hasrefX(false),m_folder(folder)
+    {
+      m_folder_id = opendir(folder.c_str());
+      if (!m_folder_id){
+        m_isvalid = false;
+        std::cerr << "The provided Matrix folder could not be opened \n\n";
+        abort();
+      }
+      Getnextvalidmatrix();
+    }
+    
+    ~MatrixMarketIterator()
+    {
+      if (m_folder_id) closedir(m_folder_id); 
+    }
+    
+    inline MatrixMarketIterator& operator++()
+    {
+      m_matIsLoaded = false;
+      m_hasrefX = false;
+      m_hasRhs = false;
+      Getnextvalidmatrix();
+      return *this;
+    }
+    inline operator bool() const { return m_isvalid;}
+    
+    /** Return the sparse matrix corresponding to the current file */
+    inline MatrixType& matrix() 
+    { 
+      // Read the matrix
+      if (m_matIsLoaded) return m_mat;
+      
+      std::string matrix_file = m_folder + "/" + m_matname + ".mtx";
+      if ( !loadMarket(m_mat, matrix_file)) 
+      {
+        m_matIsLoaded = false;
+        return m_mat;
+      }
+      m_matIsLoaded = true; 
+      
+      if (m_sym != NonSymmetric) 
+      { // Store the upper part of the matrix. It is needed by the solvers dealing with nonsymmetric matrices ??
+        MatrixType B; 
+        B = m_mat;
+        m_mat = B.template selfadjointView<Lower>();
+      }
+      return m_mat; 
+    }
+    
+    /** Return the right hand side corresponding to the current matrix. 
+     * If the rhs file is not provided, a random rhs is generated
+     */
+    inline VectorType& rhs() 
+    { 
+       // Get the right hand side
+      if (m_hasRhs) return m_rhs;
+      
+      std::string rhs_file;
+      rhs_file = m_folder + "/" + m_matname + "_b.mtx"; // The pattern is matname_b.mtx
+      m_hasRhs = Fileexists(rhs_file);
+      if (m_hasRhs)
+      {
+        m_rhs.resize(m_mat.cols());
+        m_hasRhs = loadMarketVector(m_rhs, rhs_file);
+      }
+      if (!m_hasRhs)
+      {
+        // Generate a random right hand side
+        if (!m_matIsLoaded) this->matrix(); 
+        m_refX.resize(m_mat.cols());
+        m_refX.setRandom();
+        m_rhs = m_mat * m_refX;
+        m_hasrefX = true;
+        m_hasRhs = true;
+      }
+      return m_rhs; 
+    }
+    
+    /** Return a reference solution
+     * If it is not provided and if the right hand side is not available
+     * then refX is randomly generated such that A*refX = b 
+     * where A and b are the matrix and the rhs. 
+     * Note that when a rhs is provided, refX is not available 
+     */
+    inline VectorType& refX() 
+    { 
+      // Check if a reference solution is provided
+      if (m_hasrefX) return m_refX;
+      
+      std::string lhs_file;
+      lhs_file = m_folder + "/" + m_matname + "_x.mtx"; 
+      m_hasrefX = Fileexists(lhs_file);
+      if (m_hasrefX)
+      {
+        m_refX.resize(m_mat.cols());
+        m_hasrefX = loadMarketVector(m_refX, lhs_file);
+      }
+      return m_refX; 
+    }
+    
+    inline std::string& matname() { return m_matname; }
+    
+    inline int sym() { return m_sym; }
+    
+    inline bool hasRhs() {return m_hasRhs; }
+    inline bool hasrefX() {return m_hasrefX; }
+    
+  protected:
+    
+    inline bool Fileexists(std::string file)
+    {
+      std::ifstream file_id(file.c_str());
+      if (!file_id.good() ) 
+      {
+        return false;
+      }
+      else 
+      {
+        file_id.close();
+        return true;
+      }
+    }
+    
+    void Getnextvalidmatrix( )
+    {
+      m_isvalid = false;
+      // Here, we return with the next valid matrix in the folder
+      while ( (m_curs_id = readdir(m_folder_id)) != NULL) {
+        m_isvalid = false;
+        std::string curfile;
+        curfile = m_folder + "/" + m_curs_id->d_name;
+        // Discard if it is a folder
+        if (m_curs_id->d_type == DT_DIR) continue; //FIXME This may not be available on non BSD systems
+//         struct stat st_buf; 
+//         stat (curfile.c_str(), &st_buf);
+//         if (S_ISDIR(st_buf.st_mode)) continue;
+        
+        // Determine from the header if it is a matrix or a right hand side 
+        bool isvector,iscomplex=false;
+        if(!getMarketHeader(curfile,m_sym,iscomplex,isvector)) continue;
+        if(isvector) continue;
+        if (!iscomplex)
+        {
+          if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
+            continue; 
+        }
+        if (iscomplex)
+        {
+          if(internal::is_same<Scalar, float>::value || internal::is_same<Scalar, double>::value)
+            continue; 
+        }
+        
+        
+        // Get the matrix name
+        std::string filename = m_curs_id->d_name;
+        m_matname = filename.substr(0, filename.length()-4); 
+        
+        // Find if the matrix is SPD 
+        size_t found = m_matname.find("SPD");
+        if( (found!=std::string::npos) && (m_sym != NonSymmetric) )
+          m_sym = SPD;
+       
+        m_isvalid = true;
+        break; 
+      }
+    }
+    int m_sym; // Symmetry of the matrix
+    MatrixType m_mat; // Current matrix  
+    VectorType m_rhs;  // Current vector
+    VectorType m_refX; // The reference solution, if exists
+    std::string m_matname; // Matrix Name
+    bool m_isvalid; 
+    bool m_matIsLoaded; // Determine if the matrix has already been loaded from the file
+    bool m_hasRhs; // The right hand side exists
+    bool m_hasrefX; // A reference solution is provided
+    std::string m_folder;
+    DIR * m_folder_id;
+    struct dirent *m_curs_id; 
+    
+};
+
+} // end namespace Eigen
+
+#endif
diff --git a/include/eigen3/unsupported/Eigen/src/SparseExtra/RandomSetter.h b/include/eigen3/unsupported/Eigen/src/SparseExtra/RandomSetter.h
new file mode 100644
index 0000000..dee1708
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/SparseExtra/RandomSetter.h
@@ -0,0 +1,327 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud at inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RANDOMSETTER_H
+#define EIGEN_RANDOMSETTER_H
+
+namespace Eigen { 
+
+/** Represents a std::map
+  *
+  * \see RandomSetter
+  */
+template<typename Scalar> struct StdMapTraits
+{
+  typedef int KeyType;
+  typedef std::map<KeyType,Scalar> Type;
+  enum {
+    IsSorted = 1
+  };
+
+  static void setInvalidKey(Type&, const KeyType&) {}
+};
+
+#ifdef EIGEN_UNORDERED_MAP_SUPPORT
+/** Represents a std::unordered_map
+  *
+  * To use it you need to both define EIGEN_UNORDERED_MAP_SUPPORT and include the unordered_map header file
+  * yourself making sure that unordered_map is defined in the std namespace.
+  *
+  * For instance, with current version of gcc you can either enable C++0x standard (-std=c++0x) or do:
+  * \code
+  * #include <tr1/unordered_map>
+  * #define EIGEN_UNORDERED_MAP_SUPPORT
+  * namespace std {
+  *   using std::tr1::unordered_map;
+  * }
+  * \endcode
+  *
+  * \see RandomSetter
+  */
+template<typename Scalar> struct StdUnorderedMapTraits
+{
+  typedef int KeyType;
+  typedef std::unordered_map<KeyType,Scalar> Type;
+  enum {
+    IsSorted = 0
+  };
+
+  static void setInvalidKey(Type&, const KeyType&) {}
+};
+#endif // EIGEN_UNORDERED_MAP_SUPPORT
+
+#ifdef _DENSE_HASH_MAP_H_
+/** Represents a google::dense_hash_map
+  *
+  * \see RandomSetter
+  */
+template<typename Scalar> struct GoogleDenseHashMapTraits
+{
+  typedef int KeyType;
+  typedef google::dense_hash_map<KeyType,Scalar> Type;
+  enum {
+    IsSorted = 0
+  };
+
+  static void setInvalidKey(Type& map, const KeyType& k)
+  { map.set_empty_key(k); }
+};
+#endif
+
+#ifdef _SPARSE_HASH_MAP_H_
+/** Represents a google::sparse_hash_map
+  *
+  * \see RandomSetter
+  */
+template<typename Scalar> struct GoogleSparseHashMapTraits
+{
+  typedef int KeyType;
+  typedef google::sparse_hash_map<KeyType,Scalar> Type;
+  enum {
+    IsSorted = 0
+  };
+
+  static void setInvalidKey(Type&, const KeyType&) {}
+};
+#endif
+
+/** \class RandomSetter
+  *
+  * \brief The RandomSetter is a wrapper object allowing to set/update a sparse matrix with random access
+  *
+  * \param SparseMatrixType the type of the sparse matrix we are updating
+  * \param MapTraits a traits class representing the map implementation used for the temporary sparse storage.
+  *                  Its default value depends on the system.
+  * \param OuterPacketBits defines the number of rows (or columns) manage by a single map object
+  *                        as a power of two exponent.
+  *
+  * This class temporarily represents a sparse matrix object using a generic map implementation allowing for
+  * efficient random access. The conversion from the compressed representation to a hash_map object is performed
+  * in the RandomSetter constructor, while the sparse matrix is updated back at destruction time. This strategy
+  * suggest the use of nested blocks as in this example:
+  *
+  * \code
+  * SparseMatrix<double> m(rows,cols);
+  * {
+  *   RandomSetter<SparseMatrix<double> > w(m);
+  *   // don't use m but w instead with read/write random access to the coefficients:
+  *   for(;;)
+  *     w(rand(),rand()) = rand;
+  * }
+  * // when w is deleted, the data are copied back to m
+  * // and m is ready to use.
+  * \endcode
+  *
+  * Since hash_map objects are not fully sorted, representing a full matrix as a single hash_map would
+  * involve a big and costly sort to update the compressed matrix back. To overcome this issue, a RandomSetter
+  * use multiple hash_map, each representing 2^OuterPacketBits columns or rows according to the storage order.
+  * To reach optimal performance, this value should be adjusted according to the average number of nonzeros
+  * per rows/columns.
+  *
+  * The possible values for the template parameter MapTraits are:
+  *  - \b StdMapTraits: corresponds to std::map. (does not perform very well)
+  *  - \b GnuHashMapTraits: corresponds to __gnu_cxx::hash_map (available only with GCC)
+  *  - \b GoogleDenseHashMapTraits: corresponds to google::dense_hash_map (best efficiency, reasonable memory consumption)
+  *  - \b GoogleSparseHashMapTraits: corresponds to google::sparse_hash_map (best memory consumption, relatively good performance)
+  *
+  * The default map implementation depends on the availability, and the preferred order is:
+  * GoogleSparseHashMapTraits, GnuHashMapTraits, and finally StdMapTraits.
+  *
+  * For performance and memory consumption reasons it is highly recommended to use one of
+  * the Google's hash_map implementation. To enable the support for them, you have two options:
+  *  - \#include <google/dense_hash_map> yourself \b before Eigen/Sparse header
+  *  - define EIGEN_GOOGLEHASH_SUPPORT
+  * In the later case the inclusion of <google/dense_hash_map> is made for you.
+  *
+  * \see http://code.google.com/p/google-sparsehash/
+  */
+template<typename SparseMatrixType,
+         template <typename T> class MapTraits =
+#if defined _DENSE_HASH_MAP_H_
+          GoogleDenseHashMapTraits
+#elif defined _HASH_MAP
+          GnuHashMapTraits
+#else
+          StdMapTraits
+#endif
+         ,int OuterPacketBits = 6>
+class RandomSetter
+{
+    typedef typename SparseMatrixType::Scalar Scalar;
+    typedef typename SparseMatrixType::Index Index;
+
+    struct ScalarWrapper
+    {
+      ScalarWrapper() : value(0) {}
+      Scalar value;
+    };
+    typedef typename MapTraits<ScalarWrapper>::KeyType KeyType;
+    typedef typename MapTraits<ScalarWrapper>::Type HashMapType;
+    static const int OuterPacketMask = (1 << OuterPacketBits) - 1;
+    enum {
+      SwapStorage = 1 - MapTraits<ScalarWrapper>::IsSorted,
+      TargetRowMajor = (SparseMatrixType::Flags & RowMajorBit) ? 1 : 0,
+      SetterRowMajor = SwapStorage ? 1-TargetRowMajor : TargetRowMajor
+    };
+
+  public:
+
+    /** Constructs a random setter object from the sparse matrix \a target
+      *
+      * Note that the initial value of \a target are imported. If you want to re-set
+      * a sparse matrix from scratch, then you must set it to zero first using the
+      * setZero() function.
+      */
+    inline RandomSetter(SparseMatrixType& target)
+      : mp_target(&target)
+    {
+      const Index outerSize = SwapStorage ? target.innerSize() : target.outerSize();
+      const Index innerSize = SwapStorage ? target.outerSize() : target.innerSize();
+      m_outerPackets = outerSize >> OuterPacketBits;
+      if (outerSize&OuterPacketMask)
+        m_outerPackets += 1;
+      m_hashmaps = new HashMapType[m_outerPackets];
+      // compute number of bits needed to store inner indices
+      Index aux = innerSize - 1;
+      m_keyBitsOffset = 0;
+      while (aux)
+      {
+        ++m_keyBitsOffset;
+        aux = aux >> 1;
+      }
+      KeyType ik = (1<<(OuterPacketBits+m_keyBitsOffset));
+      for (Index k=0; k<m_outerPackets; ++k)
+        MapTraits<ScalarWrapper>::setInvalidKey(m_hashmaps[k],ik);
+
+      // insert current coeffs
+      for (Index j=0; j<mp_target->outerSize(); ++j)
+        for (typename SparseMatrixType::InnerIterator it(*mp_target,j); it; ++it)
+          (*this)(TargetRowMajor?j:it.index(), TargetRowMajor?it.index():j) = it.value();
+    }
+
+    /** Destructor updating back the sparse matrix target */
+    ~RandomSetter()
+    {
+      KeyType keyBitsMask = (1<<m_keyBitsOffset)-1;
+      if (!SwapStorage) // also means the map is sorted
+      {
+        mp_target->setZero();
+        mp_target->makeCompressed();
+        mp_target->reserve(nonZeros());
+        Index prevOuter = -1;
+        for (Index k=0; k<m_outerPackets; ++k)
+        {
+          const Index outerOffset = (1<<OuterPacketBits) * k;
+          typename HashMapType::iterator end = m_hashmaps[k].end();
+          for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it!=end; ++it)
+          {
+            const Index outer = (it->first >> m_keyBitsOffset) + outerOffset;
+            const Index inner = it->first & keyBitsMask;
+            if (prevOuter!=outer)
+            {
+              for (Index j=prevOuter+1;j<=outer;++j)
+                mp_target->startVec(j);
+              prevOuter = outer;
+            }
+            mp_target->insertBackByOuterInner(outer, inner) = it->second.value;
+          }
+        }
+        mp_target->finalize();
+      }
+      else
+      {
+        VectorXi positions(mp_target->outerSize());
+        positions.setZero();
+        // pass 1
+        for (Index k=0; k<m_outerPackets; ++k)
+        {
+          typename HashMapType::iterator end = m_hashmaps[k].end();
+          for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it!=end; ++it)
+          {
+            const Index outer = it->first & keyBitsMask;
+            ++positions[outer];
+          }
+        }
+        // prefix sum
+        Index count = 0;
+        for (Index j=0; j<mp_target->outerSize(); ++j)
+        {
+          Index tmp = positions[j];
+          mp_target->outerIndexPtr()[j] = count;
+          positions[j] = count;
+          count += tmp;
+        }
+        mp_target->makeCompressed();
+        mp_target->outerIndexPtr()[mp_target->outerSize()] = count;
+        mp_target->resizeNonZeros(count);
+        // pass 2
+        for (Index k=0; k<m_outerPackets; ++k)
+        {
+          const Index outerOffset = (1<<OuterPacketBits) * k;
+          typename HashMapType::iterator end = m_hashmaps[k].end();
+          for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it!=end; ++it)
+          {
+            const Index inner = (it->first >> m_keyBitsOffset) + outerOffset;
+            const Index outer = it->first & keyBitsMask;
+            // sorted insertion
+            // Note that we have to deal with at most 2^OuterPacketBits unsorted coefficients,
+            // moreover those 2^OuterPacketBits coeffs are likely to be sparse, an so only a
+            // small fraction of them have to be sorted, whence the following simple procedure:
+            Index posStart = mp_target->outerIndexPtr()[outer];
+            Index i = (positions[outer]++) - 1;
+            while ( (i >= posStart) && (mp_target->innerIndexPtr()[i] > inner) )
+            {
+              mp_target->valuePtr()[i+1] = mp_target->valuePtr()[i];
+              mp_target->innerIndexPtr()[i+1] = mp_target->innerIndexPtr()[i];
+              --i;
+            }
+            mp_target->innerIndexPtr()[i+1] = inner;
+            mp_target->valuePtr()[i+1] = it->second.value;
+          }
+        }
+      }
+      delete[] m_hashmaps;
+    }
+
+    /** \returns a reference to the coefficient at given coordinates \a row, \a col */
+    Scalar& operator() (Index row, Index col)
+    {
+      const Index outer = SetterRowMajor ? row : col;
+      const Index inner = SetterRowMajor ? col : row;
+      const Index outerMajor = outer >> OuterPacketBits; // index of the packet/map
+      const Index outerMinor = outer & OuterPacketMask;  // index of the inner vector in the packet
+      const KeyType key = (KeyType(outerMinor)<<m_keyBitsOffset) | inner;
+      return m_hashmaps[outerMajor][key].value;
+    }
+
+    /** \returns the number of non zero coefficients
+      *
+      * \note According to the underlying map/hash_map implementation,
+      * this function might be quite expensive.
+      */
+    Index nonZeros() const
+    {
+      Index nz = 0;
+      for (Index k=0; k<m_outerPackets; ++k)
+        nz += static_cast<Index>(m_hashmaps[k].size());
+      return nz;
+    }
+
+
+  protected:
+
+    HashMapType* m_hashmaps;
+    SparseMatrixType* mp_target;
+    Index m_outerPackets;
+    unsigned char m_keyBitsOffset;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_RANDOMSETTER_H
diff --git a/include/eigen3/unsupported/Eigen/src/Splines/Spline.h b/include/eigen3/unsupported/Eigen/src/Splines/Spline.h
new file mode 100644
index 0000000..771f104
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/Splines/Spline.h
@@ -0,0 +1,474 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 20010-2011 Hauke Heibel <hauke.heibel at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPLINE_H
+#define EIGEN_SPLINE_H
+
+#include "SplineFwd.h"
+
+namespace Eigen
+{
+    /**
+     * \ingroup Splines_Module
+     * \class Spline
+     * \brief A class representing multi-dimensional spline curves.
+     *
+     * The class represents B-splines with non-uniform knot vectors. Each control
+     * point of the B-spline is associated with a basis function
+     * \f{align*}
+     *   C(u) & = \sum_{i=0}^{n}N_{i,p}(u)P_i
+     * \f}
+     *
+     * \tparam _Scalar The underlying data type (typically float or double)
+     * \tparam _Dim The curve dimension (e.g. 2 or 3)
+     * \tparam _Degree Per default set to Dynamic; could be set to the actual desired
+     *                degree for optimization purposes (would result in stack allocation
+     *                of several temporary variables).
+     **/
+  template <typename _Scalar, int _Dim, int _Degree>
+  class Spline
+  {
+  public:
+    typedef _Scalar Scalar; /*!< The spline curve's scalar type. */
+    enum { Dimension = _Dim /*!< The spline curve's dimension. */ };
+    enum { Degree = _Degree /*!< The spline curve's degree. */ };
+
+    /** \brief The point type the spline is representing. */
+    typedef typename SplineTraits<Spline>::PointType PointType;
+    
+    /** \brief The data type used to store knot vectors. */
+    typedef typename SplineTraits<Spline>::KnotVectorType KnotVectorType;
+    
+    /** \brief The data type used to store non-zero basis functions. */
+    typedef typename SplineTraits<Spline>::BasisVectorType BasisVectorType;
+    
+    /** \brief The data type representing the spline's control points. */
+    typedef typename SplineTraits<Spline>::ControlPointVectorType ControlPointVectorType;
+    
+    /**
+    * \brief Creates a (constant) zero spline.
+    * For Splines with dynamic degree, the resulting degree will be 0.
+    **/
+    Spline() 
+    : m_knots(1, (Degree==Dynamic ? 2 : 2*Degree+2))
+    , m_ctrls(ControlPointVectorType::Zero(2,(Degree==Dynamic ? 1 : Degree+1))) 
+    {
+      // in theory this code can go to the initializer list but it will get pretty
+      // much unreadable ...
+      enum { MinDegree = (Degree==Dynamic ? 0 : Degree) };
+      m_knots.template segment<MinDegree+1>(0) = Array<Scalar,1,MinDegree+1>::Zero();
+      m_knots.template segment<MinDegree+1>(MinDegree+1) = Array<Scalar,1,MinDegree+1>::Ones();
+    }
+
+    /**
+    * \brief Creates a spline from a knot vector and control points.
+    * \param knots The spline's knot vector.
+    * \param ctrls The spline's control point vector.
+    **/
+    template <typename OtherVectorType, typename OtherArrayType>
+    Spline(const OtherVectorType& knots, const OtherArrayType& ctrls) : m_knots(knots), m_ctrls(ctrls) {}
+
+    /**
+    * \brief Copy constructor for splines.
+    * \param spline The input spline.
+    **/
+    template <int OtherDegree>
+    Spline(const Spline<Scalar, Dimension, OtherDegree>& spline) : 
+    m_knots(spline.knots()), m_ctrls(spline.ctrls()) {}
+
+    /**
+     * \brief Returns the knots of the underlying spline.
+     **/
+    const KnotVectorType& knots() const { return m_knots; }
+    
+    /**
+     * \brief Returns the knots of the underlying spline.
+     **/    
+    const ControlPointVectorType& ctrls() const { return m_ctrls; }
+
+    /**
+     * \brief Returns the spline value at a given site \f$u\f$.
+     *
+     * The function returns
+     * \f{align*}
+     *   C(u) & = \sum_{i=0}^{n}N_{i,p}P_i
+     * \f}
+     *
+     * \param u Parameter \f$u \in [0;1]\f$ at which the spline is evaluated.
+     * \return The spline value at the given location \f$u\f$.
+     **/
+    PointType operator()(Scalar u) const;
+
+    /**
+     * \brief Evaluation of spline derivatives of up-to given order.
+     *
+     * The function returns
+     * \f{align*}
+     *   \frac{d^i}{du^i}C(u) & = \sum_{i=0}^{n} \frac{d^i}{du^i} N_{i,p}(u)P_i
+     * \f}
+     * for i ranging between 0 and order.
+     *
+     * \param u Parameter \f$u \in [0;1]\f$ at which the spline derivative is evaluated.
+     * \param order The order up to which the derivatives are computed.
+     **/
+    typename SplineTraits<Spline>::DerivativeType
+      derivatives(Scalar u, DenseIndex order) const;
+
+    /**
+     * \copydoc Spline::derivatives
+     * Using the template version of this function is more efficieent since
+     * temporary objects are allocated on the stack whenever this is possible.
+     **/    
+    template <int DerivativeOrder>
+    typename SplineTraits<Spline,DerivativeOrder>::DerivativeType
+      derivatives(Scalar u, DenseIndex order = DerivativeOrder) const;
+
+    /**
+     * \brief Computes the non-zero basis functions at the given site.
+     *
+     * Splines have local support and a point from their image is defined
+     * by exactly \f$p+1\f$ control points \f$P_i\f$ where \f$p\f$ is the
+     * spline degree.
+     *
+     * This function computes the \f$p+1\f$ non-zero basis function values
+     * for a given parameter value \f$u\f$. It returns
+     * \f{align*}{
+     *   N_{i,p}(u), \hdots, N_{i+p+1,p}(u)
+     * \f}
+     *
+     * \param u Parameter \f$u \in [0;1]\f$ at which the non-zero basis functions 
+     *          are computed.
+     **/
+    typename SplineTraits<Spline>::BasisVectorType
+      basisFunctions(Scalar u) const;
+
+    /**
+     * \brief Computes the non-zero spline basis function derivatives up to given order.
+     *
+     * The function computes
+     * \f{align*}{
+     *   \frac{d^i}{du^i} N_{i,p}(u), \hdots, \frac{d^i}{du^i} N_{i+p+1,p}(u)
+     * \f}
+     * with i ranging from 0 up to the specified order.
+     *
+     * \param u Parameter \f$u \in [0;1]\f$ at which the non-zero basis function
+     *          derivatives are computed.
+     * \param order The order up to which the basis function derivatives are computes.
+     **/
+    typename SplineTraits<Spline>::BasisDerivativeType
+      basisFunctionDerivatives(Scalar u, DenseIndex order) const;
+
+    /**
+     * \copydoc Spline::basisFunctionDerivatives
+     * Using the template version of this function is more efficieent since
+     * temporary objects are allocated on the stack whenever this is possible.
+     **/    
+    template <int DerivativeOrder>
+    typename SplineTraits<Spline,DerivativeOrder>::BasisDerivativeType
+      basisFunctionDerivatives(Scalar u, DenseIndex order = DerivativeOrder) const;
+
+    /**
+     * \brief Returns the spline degree.
+     **/ 
+    DenseIndex degree() const;
+
+    /** 
+     * \brief Returns the span within the knot vector in which u is falling.
+     * \param u The site for which the span is determined.
+     **/
+    DenseIndex span(Scalar u) const;
+
+    /**
+     * \brief Computes the spang within the provided knot vector in which u is falling.
+     **/
+    static DenseIndex Span(typename SplineTraits<Spline>::Scalar u, DenseIndex degree, const typename SplineTraits<Spline>::KnotVectorType& knots);
+    
+    /**
+     * \brief Returns the spline's non-zero basis functions.
+     *
+     * The function computes and returns
+     * \f{align*}{
+     *   N_{i,p}(u), \hdots, N_{i+p+1,p}(u)
+     * \f}
+     *
+     * \param u The site at which the basis functions are computed.
+     * \param degree The degree of the underlying spline.
+     * \param knots The underlying spline's knot vector.
+     **/
+    static BasisVectorType BasisFunctions(Scalar u, DenseIndex degree, const KnotVectorType& knots);
+
+
+  private:
+    KnotVectorType m_knots; /*!< Knot vector. */
+    ControlPointVectorType  m_ctrls; /*!< Control points. */
+  };
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  DenseIndex Spline<_Scalar, _Dim, _Degree>::Span(
+    typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::Scalar u,
+    DenseIndex degree,
+    const typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::KnotVectorType& knots)
+  {
+    // Piegl & Tiller, "The NURBS Book", A2.1 (p. 68)
+    if (u <= knots(0)) return degree;
+    const Scalar* pos = std::upper_bound(knots.data()+degree-1, knots.data()+knots.size()-degree-1, u);
+    return static_cast<DenseIndex>( std::distance(knots.data(), pos) - 1 );
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  typename Spline<_Scalar, _Dim, _Degree>::BasisVectorType
+    Spline<_Scalar, _Dim, _Degree>::BasisFunctions(
+    typename Spline<_Scalar, _Dim, _Degree>::Scalar u,
+    DenseIndex degree,
+    const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& knots)
+  {
+    typedef typename Spline<_Scalar, _Dim, _Degree>::BasisVectorType BasisVectorType;
+
+    const DenseIndex p = degree;
+    const DenseIndex i = Spline::Span(u, degree, knots);
+
+    const KnotVectorType& U = knots;
+
+    BasisVectorType left(p+1); left(0) = Scalar(0);
+    BasisVectorType right(p+1); right(0) = Scalar(0);        
+
+    VectorBlock<BasisVectorType,Degree>(left,1,p) = u - VectorBlock<const KnotVectorType,Degree>(U,i+1-p,p).reverse();
+    VectorBlock<BasisVectorType,Degree>(right,1,p) = VectorBlock<const KnotVectorType,Degree>(U,i+1,p) - u;
+
+    BasisVectorType N(1,p+1);
+    N(0) = Scalar(1);
+    for (DenseIndex j=1; j<=p; ++j)
+    {
+      Scalar saved = Scalar(0);
+      for (DenseIndex r=0; r<j; r++)
+      {
+        const Scalar tmp = N(r)/(right(r+1)+left(j-r));
+        N[r] = saved + right(r+1)*tmp;
+        saved = left(j-r)*tmp;
+      }
+      N(j) = saved;
+    }
+    return N;
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  DenseIndex Spline<_Scalar, _Dim, _Degree>::degree() const
+  {
+    if (_Degree == Dynamic)
+      return m_knots.size() - m_ctrls.cols() - 1;
+    else
+      return _Degree;
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  DenseIndex Spline<_Scalar, _Dim, _Degree>::span(Scalar u) const
+  {
+    return Spline::Span(u, degree(), knots());
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  typename Spline<_Scalar, _Dim, _Degree>::PointType Spline<_Scalar, _Dim, _Degree>::operator()(Scalar u) const
+  {
+    enum { Order = SplineTraits<Spline>::OrderAtCompileTime };
+
+    const DenseIndex span = this->span(u);
+    const DenseIndex p = degree();
+    const BasisVectorType basis_funcs = basisFunctions(u);
+
+    const Replicate<BasisVectorType,Dimension,1> ctrl_weights(basis_funcs);
+    const Block<const ControlPointVectorType,Dimension,Order> ctrl_pts(ctrls(),0,span-p,Dimension,p+1);
+    return (ctrl_weights * ctrl_pts).rowwise().sum();
+  }
+
+  /* --------------------------------------------------------------------------------------------- */
+
+  template <typename SplineType, typename DerivativeType>
+  void derivativesImpl(const SplineType& spline, typename SplineType::Scalar u, DenseIndex order, DerivativeType& der)
+  {    
+    enum { Dimension = SplineTraits<SplineType>::Dimension };
+    enum { Order = SplineTraits<SplineType>::OrderAtCompileTime };
+    enum { DerivativeOrder = DerivativeType::ColsAtCompileTime };
+
+    typedef typename SplineTraits<SplineType>::ControlPointVectorType ControlPointVectorType;
+    typedef typename SplineTraits<SplineType,DerivativeOrder>::BasisDerivativeType BasisDerivativeType;
+    typedef typename BasisDerivativeType::ConstRowXpr BasisDerivativeRowXpr;    
+
+    const DenseIndex p = spline.degree();
+    const DenseIndex span = spline.span(u);
+
+    const DenseIndex n = (std::min)(p, order);
+
+    der.resize(Dimension,n+1);
+
+    // Retrieve the basis function derivatives up to the desired order...    
+    const BasisDerivativeType basis_func_ders = spline.template basisFunctionDerivatives<DerivativeOrder>(u, n+1);
+
+    // ... and perform the linear combinations of the control points.
+    for (DenseIndex der_order=0; der_order<n+1; ++der_order)
+    {
+      const Replicate<BasisDerivativeRowXpr,Dimension,1> ctrl_weights( basis_func_ders.row(der_order) );
+      const Block<const ControlPointVectorType,Dimension,Order> ctrl_pts(spline.ctrls(),0,span-p,Dimension,p+1);
+      der.col(der_order) = (ctrl_weights * ctrl_pts).rowwise().sum();
+    }
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::DerivativeType
+    Spline<_Scalar, _Dim, _Degree>::derivatives(Scalar u, DenseIndex order) const
+  {
+    typename SplineTraits< Spline >::DerivativeType res;
+    derivativesImpl(*this, u, order, res);
+    return res;
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  template <int DerivativeOrder>
+  typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::DerivativeType
+    Spline<_Scalar, _Dim, _Degree>::derivatives(Scalar u, DenseIndex order) const
+  {
+    typename SplineTraits< Spline, DerivativeOrder >::DerivativeType res;
+    derivativesImpl(*this, u, order, res);
+    return res;
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::BasisVectorType
+    Spline<_Scalar, _Dim, _Degree>::basisFunctions(Scalar u) const
+  {
+    return Spline::BasisFunctions(u, degree(), knots());
+  }
+
+  /* --------------------------------------------------------------------------------------------- */
+
+  template <typename SplineType, typename DerivativeType>
+  void basisFunctionDerivativesImpl(const SplineType& spline, typename SplineType::Scalar u, DenseIndex order, DerivativeType& N_)
+  {
+    enum { Order = SplineTraits<SplineType>::OrderAtCompileTime };
+
+    typedef typename SplineTraits<SplineType>::Scalar Scalar;
+    typedef typename SplineTraits<SplineType>::BasisVectorType BasisVectorType;
+    typedef typename SplineTraits<SplineType>::KnotVectorType KnotVectorType;
+
+    const KnotVectorType& U = spline.knots();
+
+    const DenseIndex p = spline.degree();
+    const DenseIndex span = spline.span(u);
+
+    const DenseIndex n = (std::min)(p, order);
+
+    N_.resize(n+1, p+1);
+
+    BasisVectorType left = BasisVectorType::Zero(p+1);
+    BasisVectorType right = BasisVectorType::Zero(p+1);
+
+    Matrix<Scalar,Order,Order> ndu(p+1,p+1);
+
+    double saved, temp;
+
+    ndu(0,0) = 1.0;
+
+    DenseIndex j;
+    for (j=1; j<=p; ++j)
+    {
+      left[j] = u-U[span+1-j];
+      right[j] = U[span+j]-u;
+      saved = 0.0;
+
+      for (DenseIndex r=0; r<j; ++r)
+      {
+        /* Lower triangle */
+        ndu(j,r) = right[r+1]+left[j-r];
+        temp = ndu(r,j-1)/ndu(j,r);
+        /* Upper triangle */
+        ndu(r,j) = static_cast<Scalar>(saved+right[r+1] * temp);
+        saved = left[j-r] * temp;
+      }
+
+      ndu(j,j) = static_cast<Scalar>(saved);
+    }
+
+    for (j = p; j>=0; --j) 
+      N_(0,j) = ndu(j,p);
+
+    // Compute the derivatives
+    DerivativeType a(n+1,p+1);
+    DenseIndex r=0;
+    for (; r<=p; ++r)
+    {
+      DenseIndex s1,s2;
+      s1 = 0; s2 = 1; // alternate rows in array a
+      a(0,0) = 1.0;
+
+      // Compute the k-th derivative
+      for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k)
+      {
+        double d = 0.0;
+        DenseIndex rk,pk,j1,j2;
+        rk = r-k; pk = p-k;
+
+        if (r>=k)
+        {
+          a(s2,0) = a(s1,0)/ndu(pk+1,rk);
+          d = a(s2,0)*ndu(rk,pk);
+        }
+
+        if (rk>=-1) j1 = 1;
+        else        j1 = -rk;
+
+        if (r-1 <= pk) j2 = k-1;
+        else           j2 = p-r;
+
+        for (j=j1; j<=j2; ++j)
+        {
+          a(s2,j) = (a(s1,j)-a(s1,j-1))/ndu(pk+1,rk+j);
+          d += a(s2,j)*ndu(rk+j,pk);
+        }
+
+        if (r<=pk)
+        {
+          a(s2,k) = -a(s1,k-1)/ndu(pk+1,r);
+          d += a(s2,k)*ndu(r,pk);
+        }
+
+        N_(k,r) = static_cast<Scalar>(d);
+        j = s1; s1 = s2; s2 = j; // Switch rows
+      }
+    }
+
+    /* Multiply through by the correct factors */
+    /* (Eq. [2.9])                             */
+    r = p;
+    for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k)
+    {
+      for (DenseIndex j=p; j>=0; --j) N_(k,j) *= r;
+      r *= p-k;
+    }
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::BasisDerivativeType
+    Spline<_Scalar, _Dim, _Degree>::basisFunctionDerivatives(Scalar u, DenseIndex order) const
+  {
+    typename SplineTraits< Spline >::BasisDerivativeType der;
+    basisFunctionDerivativesImpl(*this, u, order, der);
+    return der;
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  template <int DerivativeOrder>
+  typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::BasisDerivativeType
+    Spline<_Scalar, _Dim, _Degree>::basisFunctionDerivatives(Scalar u, DenseIndex order) const
+  {
+    typename SplineTraits< Spline, DerivativeOrder >::BasisDerivativeType der;
+    basisFunctionDerivativesImpl(*this, u, order, der);
+    return der;
+  }
+}
+
+#endif // EIGEN_SPLINE_H
diff --git a/include/eigen3/unsupported/Eigen/src/Splines/SplineFitting.h b/include/eigen3/unsupported/Eigen/src/Splines/SplineFitting.h
new file mode 100644
index 0000000..0265d53
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/Splines/SplineFitting.h
@@ -0,0 +1,156 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 20010-2011 Hauke Heibel <hauke.heibel at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPLINE_FITTING_H
+#define EIGEN_SPLINE_FITTING_H
+
+#include <numeric>
+
+#include "SplineFwd.h"
+
+#include <Eigen/QR>
+
+namespace Eigen
+{
+  /**
+   * \brief Computes knot averages.
+   * \ingroup Splines_Module
+   *
+   * The knots are computed as
+   * \f{align*}
+   *  u_0 & = \hdots = u_p = 0 \\
+   *  u_{m-p} & = \hdots = u_{m} = 1 \\
+   *  u_{j+p} & = \frac{1}{p}\sum_{i=j}^{j+p-1}\bar{u}_i \quad\quad j=1,\hdots,n-p
+   * \f}
+   * where \f$p\f$ is the degree and \f$m+1\f$ the number knots
+   * of the desired interpolating spline.
+   *
+   * \param[in] parameters The input parameters. During interpolation one for each data point.
+   * \param[in] degree The spline degree which is used during the interpolation.
+   * \param[out] knots The output knot vector.
+   *
+   * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
+   **/
+  template <typename KnotVectorType>
+  void KnotAveraging(const KnotVectorType& parameters, DenseIndex degree, KnotVectorType& knots)
+  {
+    knots.resize(parameters.size()+degree+1);      
+
+    for (DenseIndex j=1; j<parameters.size()-degree; ++j)
+      knots(j+degree) = parameters.segment(j,degree).mean();
+
+    knots.segment(0,degree+1) = KnotVectorType::Zero(degree+1);
+    knots.segment(knots.size()-degree-1,degree+1) = KnotVectorType::Ones(degree+1);
+  }
+
+  /**
+   * \brief Computes chord length parameters which are required for spline interpolation.
+   * \ingroup Splines_Module
+   *
+   * \param[in] pts The data points to which a spline should be fit.
+   * \param[out] chord_lengths The resulting chord lenggth vector.
+   *
+   * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
+   **/   
+  template <typename PointArrayType, typename KnotVectorType>
+  void ChordLengths(const PointArrayType& pts, KnotVectorType& chord_lengths)
+  {
+    typedef typename KnotVectorType::Scalar Scalar;
+
+    const DenseIndex n = pts.cols();
+
+    // 1. compute the column-wise norms
+    chord_lengths.resize(pts.cols());
+    chord_lengths[0] = 0;
+    chord_lengths.rightCols(n-1) = (pts.array().leftCols(n-1) - pts.array().rightCols(n-1)).matrix().colwise().norm();
+
+    // 2. compute the partial sums
+    std::partial_sum(chord_lengths.data(), chord_lengths.data()+n, chord_lengths.data());
+
+    // 3. normalize the data
+    chord_lengths /= chord_lengths(n-1);
+    chord_lengths(n-1) = Scalar(1);
+  }
+
+  /**
+   * \brief Spline fitting methods.
+   * \ingroup Splines_Module
+   **/     
+  template <typename SplineType>
+  struct SplineFitting
+  {
+    typedef typename SplineType::KnotVectorType KnotVectorType;
+
+    /**
+     * \brief Fits an interpolating Spline to the given data points.
+     *
+     * \param pts The points for which an interpolating spline will be computed.
+     * \param degree The degree of the interpolating spline.
+     *
+     * \returns A spline interpolating the initially provided points.
+     **/
+    template <typename PointArrayType>
+    static SplineType Interpolate(const PointArrayType& pts, DenseIndex degree);
+
+    /**
+     * \brief Fits an interpolating Spline to the given data points.
+     *
+     * \param pts The points for which an interpolating spline will be computed.
+     * \param degree The degree of the interpolating spline.
+     * \param knot_parameters The knot parameters for the interpolation.
+     *
+     * \returns A spline interpolating the initially provided points.
+     **/
+    template <typename PointArrayType>
+    static SplineType Interpolate(const PointArrayType& pts, DenseIndex degree, const KnotVectorType& knot_parameters);
+  };
+
+  template <typename SplineType>
+  template <typename PointArrayType>
+  SplineType SplineFitting<SplineType>::Interpolate(const PointArrayType& pts, DenseIndex degree, const KnotVectorType& knot_parameters)
+  {
+    typedef typename SplineType::KnotVectorType::Scalar Scalar;      
+    typedef typename SplineType::ControlPointVectorType ControlPointVectorType;      
+
+    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+
+    KnotVectorType knots;
+    KnotAveraging(knot_parameters, degree, knots);
+
+    DenseIndex n = pts.cols();
+    MatrixType A = MatrixType::Zero(n,n);
+    for (DenseIndex i=1; i<n-1; ++i)
+    {
+      const DenseIndex span = SplineType::Span(knot_parameters[i], degree, knots);
+
+      // The segment call should somehow be told the spline order at compile time.
+      A.row(i).segment(span-degree, degree+1) = SplineType::BasisFunctions(knot_parameters[i], degree, knots);
+    }
+    A(0,0) = 1.0;
+    A(n-1,n-1) = 1.0;
+
+    HouseholderQR<MatrixType> qr(A);
+
+    // Here, we are creating a temporary due to an Eigen issue.
+    ControlPointVectorType ctrls = qr.solve(MatrixType(pts.transpose())).transpose();
+
+    return SplineType(knots, ctrls);
+  }
+
+  template <typename SplineType>
+  template <typename PointArrayType>
+  SplineType SplineFitting<SplineType>::Interpolate(const PointArrayType& pts, DenseIndex degree)
+  {
+    KnotVectorType chord_lengths; // knot parameters
+    ChordLengths(pts, chord_lengths);
+    return Interpolate(pts, degree, chord_lengths);
+  }
+}
+
+#endif // EIGEN_SPLINE_FITTING_H
diff --git a/include/eigen3/unsupported/Eigen/src/Splines/SplineFwd.h b/include/eigen3/unsupported/Eigen/src/Splines/SplineFwd.h
new file mode 100644
index 0000000..49db8d3
--- /dev/null
+++ b/include/eigen3/unsupported/Eigen/src/Splines/SplineFwd.h
@@ -0,0 +1,86 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 20010-2011 Hauke Heibel <hauke.heibel at gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPLINES_FWD_H
+#define EIGEN_SPLINES_FWD_H
+
+#include <Eigen/Core>
+
+namespace Eigen
+{
+    template <typename Scalar, int Dim, int Degree = Dynamic> class Spline;
+
+    template < typename SplineType, int DerivativeOrder = Dynamic > struct SplineTraits {};
+
+    /**
+     * \ingroup Splines_Module
+     * \brief Compile-time attributes of the Spline class for Dynamic degree.
+     **/
+    template <typename _Scalar, int _Dim, int _Degree>
+    struct SplineTraits< Spline<_Scalar, _Dim, _Degree>, Dynamic >
+    {
+      typedef _Scalar Scalar; /*!< The spline curve's scalar type. */
+      enum { Dimension = _Dim /*!< The spline curve's dimension. */ };
+      enum { Degree = _Degree /*!< The spline curve's degree. */ };
+
+      enum { OrderAtCompileTime = _Degree==Dynamic ? Dynamic : _Degree+1 /*!< The spline curve's order at compile-time. */ };
+      enum { NumOfDerivativesAtCompileTime = OrderAtCompileTime /*!< The number of derivatives defined for the current spline. */ };
+
+      /** \brief The data type used to store non-zero basis functions. */
+      typedef Array<Scalar,1,OrderAtCompileTime> BasisVectorType;
+
+      /** \brief The data type used to store the values of the basis function derivatives. */
+      typedef Array<Scalar,Dynamic,Dynamic,RowMajor,NumOfDerivativesAtCompileTime,OrderAtCompileTime> BasisDerivativeType;
+      
+      /** \brief The data type used to store the spline's derivative values. */
+      typedef Array<Scalar,Dimension,Dynamic,ColMajor,Dimension,NumOfDerivativesAtCompileTime> DerivativeType;
+
+      /** \brief The point type the spline is representing. */
+      typedef Array<Scalar,Dimension,1> PointType;
+      
+      /** \brief The data type used to store knot vectors. */
+      typedef Array<Scalar,1,Dynamic> KnotVectorType;
+      
+      /** \brief The data type representing the spline's control points. */
+      typedef Array<Scalar,Dimension,Dynamic> ControlPointVectorType;
+    };
+
+    /**
+     * \ingroup Splines_Module
+     * \brief Compile-time attributes of the Spline class for fixed degree.
+     *
+     * The traits class inherits all attributes from the SplineTraits of Dynamic degree.
+     **/
+    template < typename _Scalar, int _Dim, int _Degree, int _DerivativeOrder >
+    struct SplineTraits< Spline<_Scalar, _Dim, _Degree>, _DerivativeOrder > : public SplineTraits< Spline<_Scalar, _Dim, _Degree> >
+    {
+      enum { OrderAtCompileTime = _Degree==Dynamic ? Dynamic : _Degree+1 /*!< The spline curve's order at compile-time. */ };
+      enum { NumOfDerivativesAtCompileTime = _DerivativeOrder==Dynamic ? Dynamic : _DerivativeOrder+1 /*!< The number of derivatives defined for the current spline. */ };
+
+      /** \brief The data type used to store the values of the basis function derivatives. */
+      typedef Array<_Scalar,Dynamic,Dynamic,RowMajor,NumOfDerivativesAtCompileTime,OrderAtCompileTime> BasisDerivativeType;
+      
+      /** \brief The data type used to store the spline's derivative values. */      
+      typedef Array<_Scalar,_Dim,Dynamic,ColMajor,_Dim,NumOfDerivativesAtCompileTime> DerivativeType;
+    };
+
+    /** \brief 2D float B-spline with dynamic degree. */
+    typedef Spline<float,2> Spline2f;
+    
+    /** \brief 3D float B-spline with dynamic degree. */
+    typedef Spline<float,3> Spline3f;
+
+    /** \brief 2D double B-spline with dynamic degree. */
+    typedef Spline<double,2> Spline2d;
+    
+    /** \brief 3D double B-spline with dynamic degree. */
+    typedef Spline<double,3> Spline3d;
+}
+
+#endif // EIGEN_SPLINES_FWD_H
diff --git a/include/ezETAProgressBar.hpp b/include/ezETAProgressBar.hpp
new file mode 100755
index 0000000..d62c9fe
--- /dev/null
+++ b/include/ezETAProgressBar.hpp
@@ -0,0 +1,204 @@
+/*
+Copyright (C) 2011,2012 Remik Ziemlinski. See MIT-LICENSE.
+
+CHANGELOG
+
+v0.0.0 20110502 rsz Created.
+V1.0.0 20110522 rsz Extended to show eta with growing bar.
+v2.0.0 20110525 rsz Added time elapsed.
+v2.0.1 20111006 rsz Added default constructor value.
+v2.0.2 20130123 rob Switched over to C++11 timer facilities
+*/
+
+#ifndef EZ_ETAPROGRESSBAR_H
+#define EZ_ETAPROGRESSBAR_H
+
+#include <iostream>
+#include <string>
+#include <sstream>
+#include <chrono>
+#include <cstdio>
+#include <cstring>
+#include <cassert>
+
+#ifdef WIN32
+#include <windows.h>
+#else
+#include <sys/time.h>
+#include <sys/ioctl.h>
+#endif
+
+namespace ez {
+// One-line refreshing progress bar inspired by wget that shows ETA (time remaining).
+// 90% [========================================>     ] ETA 12d 23h 56s
+class ezETAProgressBar {
+private:
+	using system_clock = std::chrono::system_clock;
+	using duration = std::chrono::duration<size_t, system_clock::period>;
+	using partialDuration = std::chrono::duration<double, system_clock::period>;
+public:
+	ezETAProgressBar(unsigned int _n=0) : n(_n), cur(0), pct(0), width(80) {}
+	void reset( uint64_t _n ) { n = _n; pct = 0; cur = 0; }
+	void start() {
+		startTime = system_clock::now();
+		lastCheck = startTime;
+		setPct(0);
+	}
+
+	void operator++() {
+		if (cur >= n) return;
+		++cur;
+		endTime = system_clock::now();
+		if ( (endTime - lastCheck) >= std::chrono::seconds(1) or (cur == n) ) {
+			setPct( static_cast<double>(cur)/n );
+			lastCheck = endTime;
+		}
+	};
+
+	void operator+=( const unsigned int d ) {
+		if (cur >= n) return;
+		cur += d;
+		endTime = system_clock::now();
+		if ( (endTime - lastCheck) >= std::chrono::seconds(1) or (cur == n) ) {
+			setPct(static_cast<double>(cur)/n);
+			lastCheck = endTime;
+		}
+	};
+
+	bool isDone() { return cur == n; }
+	void done() {
+		cur = n;
+		setPct(1.0);
+	}
+
+	std::string durationString( duration t ) {
+		using std::chrono::duration_cast;
+		using days = std::chrono::duration<size_t, std::ratio<86400>>;
+		using std::chrono::hours;
+		using std::chrono::minutes;
+		using std::chrono::seconds;
+
+		std::stringstream out(std::stringstream::out);
+		//std::string out;
+
+		if ( t >= days(1) ) {
+			auto numDays = duration_cast<days>(t);
+			out << numDays.count() << "d ";
+			t -= numDays;
+		}
+
+		if ( t >= hours(1) ) {
+			auto numHours = duration_cast<hours>(t);
+			out << numHours.count() << "h ";
+			t -= numHours;
+		}
+
+		if ( t >= minutes(1) ) {
+			auto numMins = duration_cast<minutes>(t);
+			out << numMins.count() << "m ";
+			t -= numMins;
+		}
+
+		if ( t >= seconds(1) ) {
+			auto numSecs = duration_cast<seconds>(t);
+			out << numSecs.count() << "s";
+		}
+
+		std::string tstring = out.str();
+		if (tstring.empty()) { tstring = "0s"; }
+		return tstring;
+	}
+
+	// Set 0.0-1.0, where 1.0 equals 100%.
+	void setPct(double Pct) {
+		using std::chrono::duration_cast;
+		using std::chrono::seconds;
+		using weeks = std::chrono::duration<size_t, std::ratio<604800>>;
+
+		endTime = system_clock::now();
+		char pctstr[5];
+		sprintf(pctstr, "%3d%%", (int)(100*Pct));
+
+		// Compute how many tics we can display.
+		int nticsMax = (width-27);
+		int ntics = std::max(1, static_cast<int>(nticsMax*Pct));
+		std::string out(pctstr);
+		out.append(" [");
+
+		#ifdef HAVE_ANSI_TERM
+		// Green!
+		out.append("\e[0;32m");
+		#endif //HAVE_ANSI_TERM
+
+		out.append(std::max(0,ntics-1),'=');
+		out.append(Pct == 1.0 ? "=" : ">");
+		out.append(nticsMax-ntics,' ');
+
+		#ifdef HAVE_ANSI_TERM
+		// Not-Green :(
+		out.append("\e[0m");
+		#endif //HAVE_ANSI_TERM
+
+		out.append("] ");
+		out.append((Pct<1.0) ? "ETA " : "in ");
+
+		// Time since we started the progress bar (or reset)
+		auto dt = endTime-startTime;
+
+		std::string tstr;
+		if (Pct >= 1.0) {
+			// Print overall time and newline.
+			tstr = durationString(dt);
+			out.append(tstr);
+			size_t effLen = out.length();
+    	    #ifdef HAVE_ANSI_TERM
+	   		 effLen -= 11;
+     		#endif //HAVE_ANSI_TERM
+			if (effLen < width) {
+				out.append(width-effLen,' ');
+			}
+
+			out.append("\r\n");
+			std::cerr << out;
+			return;
+		} else {
+			duration eta = std::chrono::seconds::max();
+
+			if (Pct > 0.0) {
+				duration esecs = duration_cast<seconds>(dt);
+				eta = duration_cast<duration>( ((esecs * n) / cur) - esecs );
+			}
+
+			if ( eta > weeks(1) ) {
+				out.append("> 1 week");
+			} else {
+				tstr = durationString(eta);
+				out.append(tstr);
+			}
+		}
+
+		size_t effLen = out.length();
+	    #ifdef HAVE_ANSI_TERM
+		 effLen -= 11;
+		#endif //HAVE_ANSI_TERM
+
+		// Pad end with spaces to overwrite previous string that may have been longer.
+		if (effLen < width) {
+			out.append(width-effLen,' ');
+		}
+
+
+		out.append("\r");
+		std::cerr << out;
+		std::cerr.flush();
+	}
+
+	private:
+		uint64_t n;
+		uint64_t cur;
+	    unsigned short pct; // Stored as 0-1000, so 2.5% is encoded as 25.
+	    unsigned char width; // How many chars the entire line can be.
+	    std::chrono::system_clock::time_point startTime, endTime, lastCheck;
+};
+}
+#endif // EZ_ETAPROGRESSBAR_H
diff --git a/include/fastapprox.h b/include/fastapprox.h
new file mode 100644
index 0000000..ad26a1f
--- /dev/null
+++ b/include/fastapprox.h
@@ -0,0 +1,1608 @@
+/*=====================================================================*
+ *                   Copyright (C) 2012 Paul Mineiro                   *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ *     * Redistributions of source code must retain the                *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer.                                       *
+ *                                                                     *
+ *     * Redistributions in binary form must reproduce the             *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer in the documentation and/or            *
+ *     other materials provided with the distribution.                 *
+ *                                                                     *
+ *     * Neither the name of Paul Mineiro nor the names                *
+ *     of other contributors may be used to endorse or promote         *
+ *     products derived from this software without specific            *
+ *     prior written permission.                                       *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul at mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __CAST_H_
+
+#ifdef __cplusplus
+#define cast_uint32_t static_cast<uint32_t>
+#else
+#define cast_uint32_t (uint32_t)
+#endif
+
+#endif // __CAST_H_
+/*=====================================================================*
+ *                   Copyright (C) 2011 Paul Mineiro                   *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ *     * Redistributions of source code must retain the                *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer.                                       *
+ *                                                                     *
+ *     * Redistributions in binary form must reproduce the             *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer in the documentation and/or            *
+ *     other materials provided with the distribution.                 *
+ *                                                                     *
+ *     * Neither the name of Paul Mineiro nor the names                *
+ *     of other contributors may be used to endorse or promote         *
+ *     products derived from this software without specific            *
+ *     prior written permission.                                       *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul at mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __SSE_H_
+#define __SSE_H_
+
+#ifdef __SSE2__
+
+#include <emmintrin.h>
+
+#ifdef __cplusplus
+namespace {
+#endif // __cplusplus
+
+typedef __m128 v4sf;
+typedef __m128i v4si;
+
+#define v4si_to_v4sf _mm_cvtepi32_ps
+#define v4sf_to_v4si _mm_cvttps_epi32
+
+#define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) })
+#define v2dil(x) ((const v4si) { (x), (x) })
+#define v4sil(x) v2dil((((unsigned long long) (x)) << 32) | (x))
+
+typedef union { v4sf f; float array[4]; } v4sfindexer;
+#define v4sf_index(_findx, _findi)      \
+  ({                                    \
+     v4sfindexer _findvx = { _findx } ; \
+     _findvx.array[_findi];             \
+  })
+typedef union { v4si i; int array[4]; } v4siindexer;
+#define v4si_index(_iindx, _iindi)      \
+  ({                                    \
+     v4siindexer _iindvx = { _iindx } ; \
+     _iindvx.array[_iindi];             \
+  })
+
+typedef union { v4sf f; v4si i; } v4sfv4sipun;
+#define v4sf_fabs(x)                    \
+  ({                                    \
+     v4sfv4sipun vx;                    \
+     vx.f = x;                          \
+     vx.i &= v4sil (0x7FFFFFFF);        \
+     vx.f;                              \
+  })
+
+#ifdef __cplusplus
+} // end namespace
+#endif // __cplusplus
+
+#endif // __SSE2__
+
+#endif // __SSE_H_
+/*=====================================================================*
+ *                   Copyright (C) 2011 Paul Mineiro                   *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ *     * Redistributions of source code must retain the                *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer.                                       *
+ *                                                                     *
+ *     * Redistributions in binary form must reproduce the             *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer in the documentation and/or            *
+ *     other materials provided with the distribution.                 *
+ *                                                                     *
+ *     * Neither the name of Paul Mineiro nor the names                *
+ *     of other contributors may be used to endorse or promote         *
+ *     products derived from this software without specific            *
+ *     prior written permission.                                       *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul at mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_EXP_H_
+#define __FAST_EXP_H_
+
+#include <stdint.h>
+
+// Underflow of exponential is common practice in numerical routines,
+// so handle it here.
+
+static inline float
+fastpow2 (float p)
+{
+  float offset = (p < 0) ? 1.0f : 0.0f;
+  float clipp = (p < -126) ? -126.0f : p;
+  int w = clipp;
+  float z = clipp - w + offset;
+  union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z) ) };
+
+  return v.f;
+}
+
+static inline float
+fastexp (float p)
+{
+  return fastpow2 (1.442695040f * p);
+}
+
+static inline float
+fasterpow2 (float p)
+{
+  float clipp = (p < -126) ? -126.0f : p;
+  union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 126.94269504f) ) };
+  return v.f;
+}
+
+static inline float
+fasterexp (float p)
+{
+  return fasterpow2 (1.442695040f * p);
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastpow2 (const v4sf p)
+{
+  v4sf ltzero = _mm_cmplt_ps (p, v4sfl (0.0f));
+  v4sf offset = _mm_and_ps (ltzero, v4sfl (1.0f));
+  v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f));
+  v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f)));
+  v4si w = v4sf_to_v4si (clipp);
+  v4sf z = clipp - v4si_to_v4sf (w) + offset;
+
+  const v4sf c_121_2740838 = v4sfl (121.2740575f);
+  const v4sf c_27_7280233 = v4sfl (27.7280233f);
+  const v4sf c_4_84252568 = v4sfl (4.84252568f);
+  const v4sf c_1_49012907 = v4sfl (1.49012907f);
+  union { v4si i; v4sf f; } v = {
+    v4sf_to_v4si (
+      v4sfl (1 << 23) * 
+      (clipp + c_121_2740838 + c_27_7280233 / (c_4_84252568 - z) - c_1_49012907 * z)
+    )
+  };
+
+  return v.f;
+}
+
+static inline v4sf
+vfastexp (const v4sf p)
+{
+  const v4sf c_invlog_2 = v4sfl (1.442695040f);
+
+  return vfastpow2 (c_invlog_2 * p);
+}
+
+static inline v4sf
+vfasterpow2 (const v4sf p)
+{
+  const v4sf c_126_94269504 = v4sfl (126.94269504f);
+  v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f));
+  v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f)));
+  union { v4si i; v4sf f; } v = { v4sf_to_v4si (v4sfl (1 << 23) * (clipp + c_126_94269504)) };
+  return v.f;
+}
+
+static inline v4sf
+vfasterexp (const v4sf p)
+{
+  const v4sf c_invlog_2 = v4sfl (1.442695040f);
+
+  return vfasterpow2 (c_invlog_2 * p);
+}
+
+#endif //__SSE2__
+
+#endif // __FAST_EXP_H_
+/*=====================================================================*
+ *                   Copyright (C) 2011 Paul Mineiro                   *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ *     * Redistributions of source code must retain the                *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer.                                       *
+ *                                                                     *
+ *     * Redistributions in binary form must reproduce the             *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer in the documentation and/or            *
+ *     other materials provided with the distribution.                 *
+ *                                                                     *
+ *     * Neither the name of Paul Mineiro nor the names                *
+ *     of other contributors may be used to endorse or promote         *
+ *     products derived from this software without specific            *
+ *     prior written permission.                                       *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul at mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_LOG_H_
+#define __FAST_LOG_H_
+
+#include <stdint.h>
+
+static inline float 
+fastlog2 (float x)
+{
+  union { float f; uint32_t i; } vx = { x };
+  union { uint32_t i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
+  float y = vx.i;
+  y *= 1.1920928955078125e-7f;
+
+  return y - 124.22551499f
+           - 1.498030302f * mx.f 
+           - 1.72587999f / (0.3520887068f + mx.f);
+}
+
+static inline float
+fastlog (float x)
+{
+  return 0.69314718f * fastlog2 (x);
+}
+
+static inline float 
+fasterlog2 (float x)
+{
+  union { float f; uint32_t i; } vx = { x };
+  float y = vx.i;
+  y *= 1.1920928955078125e-7f;
+  return y - 126.94269504f;
+}
+
+static inline float
+fasterlog (float x)
+{
+//  return 0.69314718f * fasterlog2 (x);
+
+  union { float f; uint32_t i; } vx = { x };
+  float y = vx.i;
+  y *= 8.2629582881927490e-8f;
+  return y - 87.989971088f;
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastlog2 (v4sf x)
+{
+  union { v4sf f; v4si i; } vx = { x };
+  union { v4si i; v4sf f; } mx; mx.i = (vx.i & v4sil (0x007FFFFF)) | v4sil (0x3f000000);
+  v4sf y = v4si_to_v4sf (vx.i);
+  y *= v4sfl (1.1920928955078125e-7f);
+
+  const v4sf c_124_22551499 = v4sfl (124.22551499f);
+  const v4sf c_1_498030302 = v4sfl (1.498030302f);
+  const v4sf c_1_725877999 = v4sfl (1.72587999f);
+  const v4sf c_0_3520087068 = v4sfl (0.3520887068f);
+
+  return y - c_124_22551499
+           - c_1_498030302 * mx.f 
+           - c_1_725877999 / (c_0_3520087068 + mx.f);
+}
+
+static inline v4sf
+vfastlog (v4sf x)
+{
+  const v4sf c_0_69314718 = v4sfl (0.69314718f);
+
+  return c_0_69314718 * vfastlog2 (x);
+}
+
+static inline v4sf 
+vfasterlog2 (v4sf x)
+{
+  union { v4sf f; v4si i; } vx = { x };
+  v4sf y = v4si_to_v4sf (vx.i);
+  y *= v4sfl (1.1920928955078125e-7f);
+
+  const v4sf c_126_94269504 = v4sfl (126.94269504f);
+
+  return y - c_126_94269504;
+}
+
+static inline v4sf
+vfasterlog (v4sf x)
+{
+//  const v4sf c_0_69314718 = v4sfl (0.69314718f);
+//
+//  return c_0_69314718 * vfasterlog2 (x);
+
+  union { v4sf f; v4si i; } vx = { x };
+  v4sf y = v4si_to_v4sf (vx.i);
+  y *= v4sfl (8.2629582881927490e-8f);
+
+  const v4sf c_87_989971088 = v4sfl (87.989971088f);
+
+  return y - c_87_989971088;
+}
+
+#endif // __SSE2__
+
+#endif // __FAST_LOG_H_
+/*=====================================================================*
+ *                   Copyright (C) 2011 Paul Mineiro                   *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ *     * Redistributions of source code must retain the                *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer.                                       *
+ *                                                                     *
+ *     * Redistributions in binary form must reproduce the             *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer in the documentation and/or            *
+ *     other materials provided with the distribution.                 *
+ *                                                                     *
+ *     * Neither the name of Paul Mineiro nor the names                *
+ *     of other contributors may be used to endorse or promote         *
+ *     products derived from this software without specific            *
+ *     prior written permission.                                       *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul at mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_ERF_H_
+#define __FAST_ERF_H_
+
+#include <math.h>
+#include <stdint.h>
+
+// fasterfc: not actually faster than erfcf(3) on newer machines!
+// ... although vectorized version is interesting
+//     and fastererfc is very fast
+
+static inline float
+fasterfc (float x)
+{
+  static const float k = 3.3509633149424609f;
+  static const float a = 0.07219054755431126f;
+  static const float b = 15.418191568719577f;
+  static const float c = 5.609846028328545f;
+
+  union { float f; uint32_t i; } vc = { c * x };
+  float xsq = x * x;
+  float xquad = xsq * xsq;
+
+  vc.i |= 0x80000000;
+
+  return 2.0f / (1.0f + fastpow2 (k * x)) - a * x * (b * xquad - 1.0f) * fasterpow2 (vc.f);
+}
+
+static inline float
+fastererfc (float x)
+{
+  static const float k = 3.3509633149424609f;
+
+  return 2.0f / (1.0f + fasterpow2 (k * x));
+}
+
+// fasterf: not actually faster than erff(3) on newer machines! 
+// ... although vectorized version is interesting
+//     and fastererf is very fast
+
+static inline float
+fasterf (float x)
+{
+  return 1.0f - fasterfc (x);
+}
+
+static inline float
+fastererf (float x)
+{
+  return 1.0f - fastererfc (x);
+}
+
+static inline float
+fastinverseerf (float x)
+{
+  static const float invk = 0.30004578719350504f;
+  static const float a = 0.020287853348211326f;
+  static const float b = 0.07236892874789555f;
+  static const float c = 0.9913030456864257f;
+  static const float d = 0.8059775923760193f;
+
+  float xsq = x * x;
+
+  return invk * fastlog2 ((1.0f + x) / (1.0f - x)) 
+       + x * (a - b * xsq) / (c - d * xsq);
+}
+
+static inline float
+fasterinverseerf (float x)
+{
+  static const float invk = 0.30004578719350504f;
+
+  return invk * fasterlog2 ((1.0f + x) / (1.0f - x));
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfasterfc (v4sf x)
+{
+  const v4sf k = v4sfl (3.3509633149424609f);
+  const v4sf a = v4sfl (0.07219054755431126f);
+  const v4sf b = v4sfl (15.418191568719577f);
+  const v4sf c = v4sfl (5.609846028328545f);
+
+  union { v4sf f; v4si i; } vc; vc.f = c * x;
+  vc.i |= v4sil (0x80000000);
+
+  v4sf xsq = x * x;
+  v4sf xquad = xsq * xsq;
+
+  return v4sfl (2.0f) / (v4sfl (1.0f) + vfastpow2 (k * x)) - a * x * (b * xquad - v4sfl (1.0f)) * vfasterpow2 (vc.f);
+}
+
+static inline v4sf
+vfastererfc (const v4sf x)
+{
+  const v4sf k = v4sfl (3.3509633149424609f);
+
+  return v4sfl (2.0f) / (v4sfl (1.0f) + vfasterpow2 (k * x));
+}
+
+static inline v4sf
+vfasterf (v4sf x)
+{
+  return v4sfl (1.0f) - vfasterfc (x);
+}
+
+static inline v4sf
+vfastererf (const v4sf x)
+{
+  return v4sfl (1.0f) - vfastererfc (x);
+}
+
+static inline v4sf
+vfastinverseerf (v4sf x)
+{
+  const v4sf invk = v4sfl (0.30004578719350504f);
+  const v4sf a = v4sfl (0.020287853348211326f);
+  const v4sf b = v4sfl (0.07236892874789555f);
+  const v4sf c = v4sfl (0.9913030456864257f);
+  const v4sf d = v4sfl (0.8059775923760193f);
+
+  v4sf xsq = x * x;
+
+  return invk * vfastlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)) 
+       + x * (a - b * xsq) / (c - d * xsq);
+}
+
+static inline v4sf
+vfasterinverseerf (v4sf x)
+{
+  const v4sf invk = v4sfl (0.30004578719350504f);
+
+  return invk * vfasterlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x));
+}
+
+#endif //__SSE2__
+
+#endif // __FAST_ERF_H_
+/*=====================================================================*
+ *                   Copyright (C) 2011 Paul Mineiro                   *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ *     * Redistributions of source code must retain the                *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer.                                       *
+ *                                                                     *
+ *     * Redistributions in binary form must reproduce the             *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer in the documentation and/or            *
+ *     other materials provided with the distribution.                 *
+ *                                                                     *
+ *     * Neither the name of Paul Mineiro nor the names                *
+ *     of other contributors may be used to endorse or promote         *
+ *     products derived from this software without specific            *
+ *     prior written permission.                                       *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul at mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_GAMMA_H_
+#define __FAST_GAMMA_H_
+
+#include <stdint.h>
+
+/* gamma/digamma functions only work for positive inputs */
+
+static inline float
+fastlgamma (float x)
+{
+  float logterm = fastlog (x * (1.0f + x) * (2.0f + x));
+  float xp3 = 3.0f + x;
+
+  return - 2.081061466f 
+         - x 
+         + 0.0833333f / xp3 
+         - logterm 
+         + (2.5f + x) * fastlog (xp3);
+}
+
+static inline float
+fasterlgamma (float x)
+{
+  return - 0.0810614667f 
+         - x
+         - fasterlog (x)
+         + (0.5f + x) * fasterlog (1.0f + x);
+}
+
+static inline float
+fastdigamma (float x)
+{
+  float twopx = 2.0f + x;
+  float logterm = fastlog (twopx);
+
+  return (-48.0f + x * (-157.0f + x * (-127.0f - 30.0f * x))) /
+         (12.0f * x * (1.0f + x) * twopx * twopx)
+         + logterm;
+}
+
+static inline float
+fasterdigamma (float x)
+{
+  float onepx = 1.0f + x;
+
+  return -1.0f / x - 1.0f / (2 * onepx) + fasterlog (onepx);
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastlgamma (v4sf x)
+{
+  const v4sf c_1_0 = v4sfl (1.0f);
+  const v4sf c_2_0 = v4sfl (2.0f);
+  const v4sf c_3_0 = v4sfl (3.0f);
+  const v4sf c_2_081061466 = v4sfl (2.081061466f);
+  const v4sf c_0_0833333 = v4sfl (0.0833333f);
+  const v4sf c_2_5 = v4sfl (2.5f);
+
+  v4sf logterm = vfastlog (x * (c_1_0 + x) * (c_2_0 + x));
+  v4sf xp3 = c_3_0 + x;
+
+  return - c_2_081061466
+         - x 
+         + c_0_0833333 / xp3 
+         - logterm 
+         + (c_2_5 + x) * vfastlog (xp3);
+}
+
+static inline v4sf
+vfasterlgamma (v4sf x)
+{
+  const v4sf c_0_0810614667 = v4sfl (0.0810614667f);
+  const v4sf c_0_5 = v4sfl (0.5f);
+  const v4sf c_1 = v4sfl (1.0f);
+
+  return - c_0_0810614667
+         - x
+         - vfasterlog (x)
+         + (c_0_5 + x) * vfasterlog (c_1 + x);
+}
+
+static inline v4sf
+vfastdigamma (v4sf x)
+{
+  v4sf twopx = v4sfl (2.0f) + x;
+  v4sf logterm = vfastlog (twopx);
+
+  return (v4sfl (-48.0f) + x * (v4sfl (-157.0f) + x * (v4sfl (-127.0f) - v4sfl (30.0f) * x))) /
+         (v4sfl (12.0f) * x * (v4sfl (1.0f) + x) * twopx * twopx)
+         + logterm;
+}
+
+static inline v4sf
+vfasterdigamma (v4sf x)
+{
+  const v4sf c_1_0 = v4sfl (1.0f);
+  const v4sf c_2_0 = v4sfl (2.0f);
+  v4sf onepx = c_1_0 + x;
+
+  return -c_1_0 / x - c_1_0 / (c_2_0 * onepx) + vfasterlog (onepx);
+}
+
+#endif //__SSE2__
+
+#endif // __FAST_GAMMA_H_
+/*=====================================================================*
+ *                   Copyright (C) 2011 Paul Mineiro                   *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ *     * Redistributions of source code must retain the                *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer.                                       *
+ *                                                                     *
+ *     * Redistributions in binary form must reproduce the             *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer in the documentation and/or            *
+ *     other materials provided with the distribution.                 *
+ *                                                                     *
+ *     * Neither the name of Paul Mineiro nor the names                *
+ *     of other contributors may be used to endorse or promote         *
+ *     products derived from this software without specific            *
+ *     prior written permission.                                       *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul at mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_HYPERBOLIC_H_
+#define __FAST_HYPERBOLIC_H_
+
+#include <stdint.h>
+
+static inline float
+fastsinh (float p)
+{
+  return 0.5f * (fastexp (p) - fastexp (-p));
+}
+
+static inline float
+fastersinh (float p)
+{
+  return 0.5f * (fasterexp (p) - fasterexp (-p));
+}
+
+static inline float
+fastcosh (float p)
+{
+  return 0.5f * (fastexp (p) + fastexp (-p));
+}
+
+static inline float
+fastercosh (float p)
+{
+  return 0.5f * (fasterexp (p) + fasterexp (-p));
+}
+
+static inline float
+fasttanh (float p)
+{
+  return -1.0f + 2.0f / (1.0f + fastexp (-2.0f * p));
+}
+
+static inline float
+fastertanh (float p)
+{
+  return -1.0f + 2.0f / (1.0f + fasterexp (-2.0f * p));
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastsinh (const v4sf p)
+{
+  const v4sf c_0_5 = v4sfl (0.5f);
+
+  return c_0_5 * (vfastexp (p) - vfastexp (-p));
+}
+
+static inline v4sf
+vfastersinh (const v4sf p)
+{
+  const v4sf c_0_5 = v4sfl (0.5f);
+
+  return c_0_5 * (vfasterexp (p) - vfasterexp (-p));
+}
+
+static inline v4sf
+vfastcosh (const v4sf p)
+{
+  const v4sf c_0_5 = v4sfl (0.5f);
+
+  return c_0_5 * (vfastexp (p) + vfastexp (-p));
+}
+
+static inline v4sf
+vfastercosh (const v4sf p)
+{
+  const v4sf c_0_5 = v4sfl (0.5f);
+
+  return c_0_5 * (vfasterexp (p) + vfasterexp (-p));
+}
+
+static inline v4sf
+vfasttanh (const v4sf p)
+{
+  const v4sf c_1 = v4sfl (1.0f);
+  const v4sf c_2 = v4sfl (2.0f);
+
+  return -c_1 + c_2 / (c_1 + vfastexp (-c_2 * p));
+}
+
+static inline v4sf
+vfastertanh (const v4sf p)
+{
+  const v4sf c_1 = v4sfl (1.0f);
+  const v4sf c_2 = v4sfl (2.0f);
+
+  return -c_1 + c_2 / (c_1 + vfasterexp (-c_2 * p));
+}
+
+#endif //__SSE2__
+
+#endif // __FAST_HYPERBOLIC_H_
+/*=====================================================================*
+ *                   Copyright (C) 2011 Paul Mineiro                   *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ *     * Redistributions of source code must retain the                *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer.                                       *
+ *                                                                     *
+ *     * Redistributions in binary form must reproduce the             *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer in the documentation and/or            *
+ *     other materials provided with the distribution.                 *
+ *                                                                     *
+ *     * Neither the name of Paul Mineiro nor the names                *
+ *     of other contributors may be used to endorse or promote         *
+ *     products derived from this software without specific            *
+ *     prior written permission.                                       *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul at mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_LAMBERT_W_H_
+#define __FAST_LAMBERT_W_H_
+
+#include <stdint.h>
+
+// these functions compute the upper branch aka W_0
+
+static inline float
+fastlambertw (float x)
+{
+  static const float threshold = 2.26445f;
+
+  float c = (x < threshold) ? 1.546865557f : 1.0f;
+  float d = (x < threshold) ? 2.250366841f : 0.0f;
+  float a = (x < threshold) ? -0.737769969f : 0.0f;
+
+  float logterm = fastlog (c * x + d);
+  float loglogterm = fastlog (logterm);
+
+  float minusw = -a - logterm + loglogterm - loglogterm / logterm;
+  float expminusw = fastexp (minusw);
+  float xexpminusw = x * expminusw;
+  float pexpminusw = xexpminusw - minusw;
+
+  return (2.0f * xexpminusw - minusw * (4.0f * xexpminusw - minusw * pexpminusw)) /
+         (2.0f + pexpminusw * (2.0f - minusw));
+}
+
+static inline float
+fasterlambertw (float x)
+{
+  static const float threshold = 2.26445f;
+
+  float c = (x < threshold) ? 1.546865557f : 1.0f;
+  float d = (x < threshold) ? 2.250366841f : 0.0f;
+  float a = (x < threshold) ? -0.737769969f : 0.0f;
+
+  float logterm = fasterlog (c * x + d);
+  float loglogterm = fasterlog (logterm);
+
+  float w = a + logterm - loglogterm + loglogterm / logterm;
+  float expw = fasterexp (-w);
+
+  return (w * w + expw * x) / (1.0f + w);
+}
+
+static inline float
+fastlambertwexpx (float x)
+{
+  static const float k = 1.1765631309f;
+  static const float a = 0.94537622168f;
+
+  float logarg = fmaxf (x, k);
+  float powarg = (x < k) ? a * (x - k) : 0;
+
+  float logterm = fastlog (logarg);
+  float powterm = fasterpow2 (powarg);  // don't need accuracy here
+
+  float w = powterm * (logarg - logterm + logterm / logarg);
+  float logw = fastlog (w);
+  float p = x - logw;
+
+  return w * (2.0f + p + w * (3.0f + 2.0f * p)) /
+         (2.0f - p + w * (5.0f + 2.0f * w));
+}
+
+static inline float
+fasterlambertwexpx (float x)
+{
+  static const float k = 1.1765631309f;
+  static const float a = 0.94537622168f;
+
+  float logarg = fmaxf (x, k);
+  float powarg = (x < k) ? a * (x - k) : 0;
+
+  float logterm = fasterlog (logarg);
+  float powterm = fasterpow2 (powarg);
+
+  float w = powterm * (logarg - logterm + logterm / logarg);
+  float logw = fasterlog (w);
+
+  return w * (1.0f + x - logw) / (1.0f + w);
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastlambertw (v4sf x)
+{
+  const v4sf threshold = v4sfl (2.26445f);
+
+  v4sf under = _mm_cmplt_ps (x, threshold);
+  v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)),
+                      _mm_andnot_ps (under, v4sfl (1.0f)));
+  v4sf d = _mm_and_ps (under, v4sfl (2.250366841f));
+  v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f));
+
+  v4sf logterm = vfastlog (c * x + d);
+  v4sf loglogterm = vfastlog (logterm);
+
+  v4sf minusw = -a - logterm + loglogterm - loglogterm / logterm;
+  v4sf expminusw = vfastexp (minusw);
+  v4sf xexpminusw = x * expminusw;
+  v4sf pexpminusw = xexpminusw - minusw;
+
+  return (v4sfl (2.0f) * xexpminusw - minusw * (v4sfl (4.0f) * xexpminusw - minusw * pexpminusw)) / 
+         (v4sfl (2.0f) + pexpminusw * (v4sfl (2.0f) - minusw));
+}
+
+static inline v4sf
+vfasterlambertw (v4sf x)
+{
+  const v4sf threshold = v4sfl (2.26445f);
+
+  v4sf under = _mm_cmplt_ps (x, threshold);
+  v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)),
+                      _mm_andnot_ps (under, v4sfl (1.0f)));
+  v4sf d = _mm_and_ps (under, v4sfl (2.250366841f));
+  v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f));
+
+  v4sf logterm = vfasterlog (c * x + d);
+  v4sf loglogterm = vfasterlog (logterm);
+
+  v4sf w = a + logterm - loglogterm + loglogterm / logterm;
+  v4sf expw = vfasterexp (-w);
+
+  return (w * w + expw * x) / (v4sfl (1.0f) + w);
+}
+
+static inline v4sf
+vfastlambertwexpx (v4sf x)
+{
+  const v4sf k = v4sfl (1.1765631309f);
+  const v4sf a = v4sfl (0.94537622168f);
+  const v4sf two = v4sfl (2.0f);
+  const v4sf three = v4sfl (3.0f);
+  const v4sf five = v4sfl (5.0f);
+
+  v4sf logarg = _mm_max_ps (x, k);
+  v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k));
+
+  v4sf logterm = vfastlog (logarg);
+  v4sf powterm = vfasterpow2 (powarg);  // don't need accuracy here
+
+  v4sf w = powterm * (logarg - logterm + logterm / logarg);
+  v4sf logw = vfastlog (w);
+  v4sf p = x - logw;
+
+  return w * (two + p + w * (three + two * p)) /
+         (two - p + w * (five + two * w));
+}
+
+static inline v4sf
+vfasterlambertwexpx (v4sf x)
+{
+  const v4sf k = v4sfl (1.1765631309f);
+  const v4sf a = v4sfl (0.94537622168f);
+
+  v4sf logarg = _mm_max_ps (x, k);
+  v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k));
+
+  v4sf logterm = vfasterlog (logarg);
+  v4sf powterm = vfasterpow2 (powarg);
+
+  v4sf w = powterm * (logarg - logterm + logterm / logarg);
+  v4sf logw = vfasterlog (w);
+
+  return w * (v4sfl (1.0f) + x - logw) / (v4sfl (1.0f) + w);
+}
+
+#endif // __SSE2__
+
+#endif // __FAST_LAMBERT_W_H_
+
+/*=====================================================================*
+ *                   Copyright (C) 2011 Paul Mineiro                   *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ *     * Redistributions of source code must retain the                *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer.                                       *
+ *                                                                     *
+ *     * Redistributions in binary form must reproduce the             *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer in the documentation and/or            *
+ *     other materials provided with the distribution.                 *
+ *                                                                     *
+ *     * Neither the name of Paul Mineiro nor the names                *
+ *     of other contributors may be used to endorse or promote         *
+ *     products derived from this software without specific            *
+ *     prior written permission.                                       *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul at mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_POW_H_
+#define __FAST_POW_H_
+
+#include <stdint.h>
+
+static inline float
+fastpow (float x,
+         float p)
+{
+  return fastpow2 (p * fastlog2 (x));
+}
+
+static inline float
+fasterpow (float x,
+           float p)
+{
+  return fasterpow2 (p * fasterlog2 (x));
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastpow (const v4sf x,
+          const v4sf p)
+{
+  return vfastpow2 (p * vfastlog2 (x));
+}
+
+static inline v4sf
+vfasterpow (const v4sf x,
+            const v4sf p)
+{
+  return vfasterpow2 (p * vfasterlog2 (x));
+}
+
+#endif //__SSE2__
+
+#endif // __FAST_POW_H_
+/*=====================================================================*
+ *                   Copyright (C) 2011 Paul Mineiro                   *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ *     * Redistributions of source code must retain the                *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer.                                       *
+ *                                                                     *
+ *     * Redistributions in binary form must reproduce the             *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer in the documentation and/or            *
+ *     other materials provided with the distribution.                 *
+ *                                                                     *
+ *     * Neither the name of Paul Mineiro nor the names                *
+ *     of other contributors may be used to endorse or promote         *
+ *     products derived from this software without specific            *
+ *     prior written permission.                                       *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul at mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_SIGMOID_H_
+#define __FAST_SIGMOID_H_
+
+#include <stdint.h>
+
+static inline float
+fastsigmoid (float x)
+{
+  return 1.0f / (1.0f + fastexp (-x));
+}
+
+static inline float
+fastersigmoid (float x)
+{
+  return 1.0f / (1.0f + fasterexp (-x));
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastsigmoid (const v4sf x)
+{
+  const v4sf c_1 = v4sfl (1.0f);
+
+  return c_1 / (c_1 + vfastexp (-x));
+}
+
+static inline v4sf
+vfastersigmoid (const v4sf x)
+{
+  const v4sf c_1 = v4sfl (1.0f);
+
+  return c_1 / (c_1 + vfasterexp (-x));
+}
+
+#endif //__SSE2__
+
+#endif // __FAST_SIGMOID_H_
+/*=====================================================================*
+ *                   Copyright (C) 2011 Paul Mineiro                   *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ *     * Redistributions of source code must retain the                *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer.                                       *
+ *                                                                     *
+ *     * Redistributions in binary form must reproduce the             *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer in the documentation and/or            *
+ *     other materials provided with the distribution.                 *
+ *                                                                     *
+ *     * Neither the name of Paul Mineiro nor the names                *
+ *     of other contributors may be used to endorse or promote         *
+ *     products derived from this software without specific            *
+ *     prior written permission.                                       *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul at mineiro.com>                            *
+ *=====================================================================*/
+
+#ifndef __FAST_TRIG_H_
+#define __FAST_TRIG_H_
+
+#include <stdint.h>
+
+// http://www.devmaster.net/forums/showthread.php?t=5784
+// fast sine variants are for x \in [ -\pi, pi ]
+// fast cosine variants are for x \in [ -\pi, pi ]
+// fast tangent variants are for x \in [ -\pi / 2, pi / 2 ]
+// "full" versions of functions handle the entire range of inputs
+// although the range reduction technique used here will be hopelessly
+// inaccurate for |x| >> 1000
+//
+// WARNING: fastsinfull, fastcosfull, and fasttanfull can be slower than
+// libc calls on older machines (!) and on newer machines are only 
+// slighly faster.  however:
+//   * vectorized versions are competitive
+//   * faster full versions are competitive
+
+static inline float
+fastsin (float x)
+{
+  static const float fouroverpi = 1.2732395447351627f;
+  static const float fouroverpisq = 0.40528473456935109f;
+  static const float q = 0.78444488374548933f;
+  union { float f; uint32_t i; } p = { 0.20363937680730309f };
+  union { float f; uint32_t i; } r = { 0.015124940802184233f };
+  union { float f; uint32_t i; } s = { -0.0032225901625579573f };
+
+  union { float f; uint32_t i; } vx = { x };
+  uint32_t sign = vx.i & 0x80000000;
+  vx.i = vx.i & 0x7FFFFFFF;
+
+  float qpprox = fouroverpi * x - fouroverpisq * x * vx.f;
+  float qpproxsq = qpprox * qpprox;
+
+  p.i |= sign;
+  r.i |= sign;
+  s.i ^= sign;
+
+  return q * qpprox + qpproxsq * (p.f + qpproxsq * (r.f + qpproxsq * s.f));
+}
+
+static inline float
+fastersin (float x)
+{
+  static const float fouroverpi = 1.2732395447351627f;
+  static const float fouroverpisq = 0.40528473456935109f;
+  static const float q = 0.77633023248007499f;
+  union { float f; uint32_t i; } p = { 0.22308510060189463f };
+
+  union { float f; uint32_t i; } vx = { x };
+  uint32_t sign = vx.i & 0x80000000;
+  vx.i &= 0x7FFFFFFF;
+
+  float qpprox = fouroverpi * x - fouroverpisq * x * vx.f;
+
+  p.i |= sign;
+
+  return qpprox * (q + p.f * qpprox);
+}
+
+static inline float
+fastsinfull (float x)
+{
+  static const float twopi = 6.2831853071795865f;
+  static const float invtwopi = 0.15915494309189534f;
+
+  int k = x * invtwopi;
+  float half = (x < 0) ? -0.5f : 0.5f;
+  return fastsin ((half + k) * twopi - x);
+}
+
+static inline float
+fastersinfull (float x)
+{
+  static const float twopi = 6.2831853071795865f;
+  static const float invtwopi = 0.15915494309189534f;
+
+  int k = x * invtwopi;
+  float half = (x < 0) ? -0.5f : 0.5f;
+  return fastersin ((half + k) * twopi - x);
+}
+
+static inline float
+fastcos (float x)
+{
+  static const float halfpi = 1.5707963267948966f;
+  static const float halfpiminustwopi = -4.7123889803846899f;
+  float offset = (x > halfpi) ? halfpiminustwopi : halfpi;
+  return fastsin (x + offset);
+}
+
+static inline float
+fastercos (float x)
+{
+  static const float twooverpi = 0.63661977236758134f;
+  static const float p = 0.54641335845679634f;
+
+  union { float f; uint32_t i; } vx = { x };
+  vx.i &= 0x7FFFFFFF;
+
+  float qpprox = 1.0f - twooverpi * vx.f;
+
+  return qpprox + p * qpprox * (1.0f - qpprox * qpprox);
+}
+
+static inline float
+fastcosfull (float x)
+{
+  static const float halfpi = 1.5707963267948966f;
+  return fastsinfull (x + halfpi);
+}
+
+static inline float
+fastercosfull (float x)
+{
+  static const float halfpi = 1.5707963267948966f;
+  return fastersinfull (x + halfpi);
+}
+
+static inline float
+fasttan (float x)
+{
+  static const float halfpi = 1.5707963267948966f;
+  return fastsin (x) / fastsin (x + halfpi);
+}
+
+static inline float
+fastertan (float x)
+{
+  return fastersin (x) / fastercos (x);
+}
+
+static inline float
+fasttanfull (float x)
+{
+  static const float twopi = 6.2831853071795865f;
+  static const float invtwopi = 0.15915494309189534f;
+
+  int k = x * invtwopi;
+  float half = (x < 0) ? -0.5f : 0.5f;
+  float xnew = x - (half + k) * twopi;
+
+  return fastsin (xnew) / fastcos (xnew);
+}
+
+static inline float
+fastertanfull (float x)
+{
+  static const float twopi = 6.2831853071795865f;
+  static const float invtwopi = 0.15915494309189534f;
+
+  int k = x * invtwopi;
+  float half = (x < 0) ? -0.5f : 0.5f;
+  float xnew = x - (half + k) * twopi;
+
+  return fastersin (xnew) / fastercos (xnew);
+}
+
+#ifdef __SSE2__
+
+static inline v4sf
+vfastsin (const v4sf x)
+{
+  const v4sf fouroverpi = v4sfl (1.2732395447351627f);
+  const v4sf fouroverpisq = v4sfl (0.40528473456935109f);
+  const v4sf q = v4sfl (0.78444488374548933f);
+  const v4sf p = v4sfl (0.20363937680730309f);
+  const v4sf r = v4sfl (0.015124940802184233f);
+  const v4sf s = v4sfl (-0.0032225901625579573f);
+
+  union { v4sf f; v4si i; } vx = { x };
+  v4si sign = vx.i & v4sil (0x80000000);
+  vx.i &= v4sil (0x7FFFFFFF);
+
+  v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f;
+  v4sf qpproxsq = qpprox * qpprox;
+  union { v4sf f; v4si i; } vy; vy.f = qpproxsq * (p + qpproxsq * (r + qpproxsq * s));
+  vy.i ^= sign;
+
+  return q * qpprox + vy.f;
+}
+
+static inline v4sf
+vfastersin (const v4sf x)
+{
+  const v4sf fouroverpi = v4sfl (1.2732395447351627f);
+  const v4sf fouroverpisq = v4sfl (0.40528473456935109f);
+  const v4sf q = v4sfl (0.77633023248007499f);
+  const v4sf plit = v4sfl (0.22308510060189463f);
+  union { v4sf f; v4si i; } p = { plit };
+
+  union { v4sf f; v4si i; } vx = { x };
+  v4si sign = vx.i & v4sil (0x80000000);
+  vx.i &= v4sil (0x7FFFFFFF);
+
+  v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f;
+
+  p.i |= sign;
+
+  return qpprox * (q + p.f * qpprox);
+}
+
+static inline v4sf
+vfastsinfull (const v4sf x)
+{
+  const v4sf twopi = v4sfl (6.2831853071795865f);
+  const v4sf invtwopi = v4sfl (0.15915494309189534f);
+
+  v4si k = v4sf_to_v4si (x * invtwopi);
+
+  v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
+  v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
+                         _mm_andnot_ps (ltzero, v4sfl (0.5f)));
+
+  return vfastsin ((half + v4si_to_v4sf (k)) * twopi - x);
+}
+
+static inline v4sf
+vfastersinfull (const v4sf x)
+{
+  const v4sf twopi = v4sfl (6.2831853071795865f);
+  const v4sf invtwopi = v4sfl (0.15915494309189534f);
+
+  v4si k = v4sf_to_v4si (x * invtwopi);
+
+  v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
+  v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
+                         _mm_andnot_ps (ltzero, v4sfl (0.5f)));
+
+  return vfastersin ((half + v4si_to_v4sf (k)) * twopi - x);
+}
+
+static inline v4sf
+vfastcos (const v4sf x)
+{
+  const v4sf halfpi = v4sfl (1.5707963267948966f);
+  const v4sf halfpiminustwopi = v4sfl (-4.7123889803846899f);
+  v4sf lthalfpi = _mm_cmpnlt_ps (x, halfpi);
+  v4sf offset = _mm_or_ps (_mm_and_ps (lthalfpi, halfpiminustwopi),
+                           _mm_andnot_ps (lthalfpi, halfpi));
+  return vfastsin (x + offset);
+}
+
+static inline v4sf
+vfastercos (v4sf x)
+{
+  const v4sf twooverpi = v4sfl (0.63661977236758134f);
+  const v4sf p = v4sfl (0.54641335845679634);
+
+  v4sf vx = v4sf_fabs (x);
+  v4sf qpprox = v4sfl (1.0f) - twooverpi * vx;
+
+  return qpprox + p * qpprox * (v4sfl (1.0f) - qpprox * qpprox);
+}
+
+static inline v4sf
+vfastcosfull (const v4sf x)
+{
+  const v4sf halfpi = v4sfl (1.5707963267948966f);
+  return vfastsinfull (x + halfpi);
+}
+
+static inline v4sf
+vfastercosfull (const v4sf x)
+{
+  const v4sf halfpi = v4sfl (1.5707963267948966f);
+  return vfastersinfull (x + halfpi);
+}
+
+static inline v4sf
+vfasttan (const v4sf x)
+{
+  const v4sf halfpi = v4sfl (1.5707963267948966f);
+  return vfastsin (x) / vfastsin (x + halfpi);
+}
+
+static inline v4sf
+vfastertan (const v4sf x)
+{
+  return vfastersin (x) / vfastercos (x);
+}
+
+static inline v4sf
+vfasttanfull (const v4sf x)
+{
+  const v4sf twopi = v4sfl (6.2831853071795865f);
+  const v4sf invtwopi = v4sfl (0.15915494309189534f);
+
+  v4si k = v4sf_to_v4si (x * invtwopi);
+
+  v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
+  v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
+                         _mm_andnot_ps (ltzero, v4sfl (0.5f)));
+  v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi;
+
+  return vfastsin (xnew) / vfastcos (xnew);
+}
+
+static inline v4sf
+vfastertanfull (const v4sf x)
+{
+  const v4sf twopi = v4sfl (6.2831853071795865f);
+  const v4sf invtwopi = v4sfl (0.15915494309189534f);
+
+  v4si k = v4sf_to_v4si (x * invtwopi);
+
+  v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
+  v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
+                         _mm_andnot_ps (ltzero, v4sfl (0.5f)));
+  v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi;
+
+  return vfastersin (xnew) / vfastercos (xnew);
+}
+
+#endif //__SSE2__
+
+#endif // __FAST_TRIG_H_
diff --git a/include/format.h b/include/format.h
new file mode 100644
index 0000000..195f9c3
--- /dev/null
+++ b/include/format.h
@@ -0,0 +1,2762 @@
+/*
+ Formatting library for C++
+
+ Copyright (c) 2012 - 2015, Victor Zverovich
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+    list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FMT_FORMAT_H_
+#define FMT_FORMAT_H_
+
+#include <stdint.h>
+
+#include <cassert>
+#include <cmath>
+#include <cstddef>  // for std::ptrdiff_t
+#include <cstdio>
+#include <algorithm>
+#include <limits>
+#include <stdexcept>
+#include <string>
+#include <sstream>
+
+#if _SECURE_SCL
+# include <iterator>
+#endif
+
+#ifdef _MSC_VER
+# include <intrin.h>  // _BitScanReverse, _BitScanReverse64
+
+namespace fmt {
+namespace internal {
+# pragma intrinsic(_BitScanReverse)
+inline uint32_t clz(uint32_t x) {
+  unsigned long r = 0;
+  _BitScanReverse(&r, x);
+  return 31 - r;
+}
+# define FMT_BUILTIN_CLZ(n) fmt::internal::clz(n)
+
+# ifdef _WIN64
+#  pragma intrinsic(_BitScanReverse64)
+# endif
+
+inline uint32_t clzll(uint64_t x) {
+  unsigned long r = 0;
+# ifdef _WIN64
+  _BitScanReverse64(&r, x);
+# else
+  // Scan the high 32 bits.
+  if (_BitScanReverse(&r, static_cast<uint32_t>(x >> 32)))
+    return 63 - (r + 32);
+
+  // Scan the low 32 bits.
+  _BitScanReverse(&r, static_cast<uint32_t>(x));
+# endif
+  return 63 - r;
+}
+# define FMT_BUILTIN_CLZLL(n) fmt::internal::clzll(n)
+}
+}
+#endif
+
+#ifdef __GNUC__
+# define FMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+# define FMT_GCC_EXTENSION __extension__
+# if FMT_GCC_VERSION >= 406
+#  pragma GCC diagnostic push
+// Disable the warning about "long long" which is sometimes reported even
+// when using __extension__.
+#  pragma GCC diagnostic ignored "-Wlong-long"
+// Disable the warning about declaration shadowing because it affects too
+// many valid cases.
+#  pragma GCC diagnostic ignored "-Wshadow"
+# endif
+# if __cplusplus >= 201103L || defined __GXX_EXPERIMENTAL_CXX0X__
+#  define FMT_HAS_GXX_CXX11 1
+# endif
+#else
+# define FMT_GCC_EXTENSION
+#endif
+
+#ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wdocumentation"
+#endif
+
+#ifdef __GNUC_LIBSTD__
+# define FMT_GNUC_LIBSTD_VERSION (__GNUC_LIBSTD__ * 100 + __GNUC_LIBSTD_MINOR__)
+#endif
+
+#ifdef __has_feature
+# define FMT_HAS_FEATURE(x) __has_feature(x)
+#else
+# define FMT_HAS_FEATURE(x) 0
+#endif
+
+#ifdef __has_builtin
+# define FMT_HAS_BUILTIN(x) __has_builtin(x)
+#else
+# define FMT_HAS_BUILTIN(x) 0
+#endif
+
+#ifdef __has_cpp_attribute
+# define FMT_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+# define FMT_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+
+#ifndef FMT_USE_VARIADIC_TEMPLATES
+// Variadic templates are available in GCC since version 4.4
+// (http://gcc.gnu.org/projects/cxx0x.html) and in Visual C++
+// since version 2013.
+# define FMT_USE_VARIADIC_TEMPLATES \
+   (FMT_HAS_FEATURE(cxx_variadic_templates) || \
+       (FMT_GCC_VERSION >= 404 && FMT_HAS_GXX_CXX11) || _MSC_VER >= 1800)
+#endif
+
+#ifndef FMT_USE_RVALUE_REFERENCES
+// Don't use rvalue references when compiling with clang and an old libstdc++
+// as the latter doesn't provide std::move.
+# if defined(FMT_GNUC_LIBSTD_VERSION) && FMT_GNUC_LIBSTD_VERSION <= 402
+#  define FMT_USE_RVALUE_REFERENCES 0
+# else
+#  define FMT_USE_RVALUE_REFERENCES \
+    (FMT_HAS_FEATURE(cxx_rvalue_references) || \
+        (FMT_GCC_VERSION >= 403 && FMT_HAS_GXX_CXX11) || _MSC_VER >= 1600)
+# endif
+#endif
+
+#if FMT_USE_RVALUE_REFERENCES
+# include <utility>  // for std::move
+#endif
+
+// Define FMT_USE_NOEXCEPT to make C++ Format use noexcept (C++11 feature).
+#if FMT_USE_NOEXCEPT || FMT_HAS_FEATURE(cxx_noexcept) || \
+  (FMT_GCC_VERSION >= 408 && FMT_HAS_GXX_CXX11)
+# define FMT_NOEXCEPT noexcept
+#else
+# define FMT_NOEXCEPT throw()
+#endif
+
+// A macro to disallow the copy constructor and operator= functions
+// This should be used in the private: declarations for a class
+#if FMT_USE_DELETED_FUNCTIONS || FMT_HAS_FEATURE(cxx_deleted_functions) || \
+  (FMT_GCC_VERSION >= 404 && FMT_HAS_GXX_CXX11) || _MSC_VER >= 1800
+# define FMT_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+    TypeName(const TypeName&) = delete; \
+    TypeName& operator=(const TypeName&) = delete
+#else
+# define FMT_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+    TypeName(const TypeName&); \
+    TypeName& operator=(const TypeName&)
+#endif
+
+namespace fmt {
+
+// Fix the warning about long long on older versions of GCC
+// that don't support the diagnostic pragma.
+FMT_GCC_EXTENSION typedef long long LongLong;
+FMT_GCC_EXTENSION typedef unsigned long long ULongLong;
+
+#if FMT_USE_RVALUE_REFERENCES
+using std::move;
+#endif
+
+template <typename Char>
+class BasicWriter;
+
+typedef BasicWriter<char> Writer;
+typedef BasicWriter<wchar_t> WWriter;
+
+template <typename Char>
+class BasicFormatter;
+
+template <typename Char, typename T>
+void format(BasicFormatter<Char> &f, const Char *&format_str, const T &value);
+
+/**
+  \rst
+  A string reference. It can be constructed from a C string or
+  ``std::string``.
+  
+  You can use one of the following typedefs for common character types:
+
+  +------------+-------------------------+
+  | Type       | Definition              |
+  +============+=========================+
+  | StringRef  | BasicStringRef<char>    |
+  +------------+-------------------------+
+  | WStringRef | BasicStringRef<wchar_t> |
+  +------------+-------------------------+
+
+  This class is most useful as a parameter type to allow passing
+  different types of strings to a function, for example::
+
+    template <typename... Args>
+    std::string format(StringRef format_str, const Args & ... args);
+
+    format("{}", 42);
+    format(std::string("{}"), 42);
+  \endrst
+ */
+template <typename Char>
+class BasicStringRef {
+ private:
+  const Char *data_;
+  std::size_t size_;
+
+ public:
+  /**
+    Constructs a string reference object from a C string and a size.
+   */
+  BasicStringRef(const Char *s, std::size_t size) : data_(s), size_(size) {}
+
+  /**
+    Constructs a string reference object from a C string computing
+    the size with ``std::char_traits<Char>::length``.
+   */
+  BasicStringRef(const Char *s)
+    : data_(s), size_(std::char_traits<Char>::length(s)) {}
+
+  /**
+    Constructs a string reference from an `std::string` object.
+   */
+  BasicStringRef(const std::basic_string<Char> &s)
+  : data_(s.c_str()), size_(s.size()) {}
+
+  /**
+    Converts a string reference to an `std::string` object.
+   */
+  operator std::basic_string<Char>() const {
+    return std::basic_string<Char>(data_, size());
+  }
+
+  /**
+    Returns the pointer to a C string.
+   */
+  const Char *c_str() const { return data_; }
+
+  /**
+    Returns the string size.
+   */
+  std::size_t size() const { return size_; }
+
+  friend bool operator==(BasicStringRef lhs, BasicStringRef rhs) {
+    return lhs.data_ == rhs.data_;
+  }
+  friend bool operator!=(BasicStringRef lhs, BasicStringRef rhs) {
+    return lhs.data_ != rhs.data_;
+  }
+};
+
+typedef BasicStringRef<char> StringRef;
+typedef BasicStringRef<wchar_t> WStringRef;
+
+/**
+  A formatting error such as invalid format string.
+*/
+class FormatError : public std::runtime_error {
+ public:
+  explicit FormatError(StringRef message)
+  : std::runtime_error(message.c_str()) {}
+};
+
+namespace internal {
+// The number of characters to store in the MemoryBuffer object itself
+// to avoid dynamic memory allocation.
+enum { INLINE_BUFFER_SIZE = 500 };
+
+#if _SECURE_SCL
+// Use checked iterator to avoid warnings on MSVC.
+template <typename T>
+inline stdext::checked_array_iterator<T*> make_ptr(T *ptr, std::size_t size) {
+  return stdext::checked_array_iterator<T*>(ptr, size);
+}
+#else
+template <typename T>
+inline T *make_ptr(T *ptr, std::size_t) { return ptr; }
+#endif
+}  // namespace internal
+
+/** A buffer supporting a subset of ``std::vector``'s operations. */
+template <typename T>
+class Buffer {
+ private:
+  FMT_DISALLOW_COPY_AND_ASSIGN(Buffer);
+
+ protected:
+  T *ptr_;
+  std::size_t size_;
+  std::size_t capacity_;
+
+  Buffer(T *ptr = 0, std::size_t capacity = 0)
+    : ptr_(ptr), size_(0), capacity_(capacity) {}
+
+  /**
+    Increases the buffer capacity to hold at least *size* elements updating
+    ``ptr_`` and ``capacity_``.
+   */
+  virtual void grow(std::size_t size) = 0;
+
+ public:
+  virtual ~Buffer() {}
+
+  /** Returns the size of this buffer. */
+  std::size_t size() const { return size_; }
+
+  /** Returns the capacity of this buffer. */
+  std::size_t capacity() const { return capacity_; }
+
+  /**
+    Resizes the buffer. If T is a POD type new elements may not be initialized.
+   */
+  void resize(std::size_t new_size) {
+    if (new_size > capacity_)
+      grow(new_size);
+    size_ = new_size;
+  }
+
+  /** Reserves space to store at least *capacity* elements. */
+  void reserve(std::size_t capacity) {
+    if (capacity > capacity_)
+      grow(capacity);
+  }
+
+  void clear() FMT_NOEXCEPT { size_ = 0; }
+
+  void push_back(const T &value) {
+    if (size_ == capacity_)
+      grow(size_ + 1);
+    ptr_[size_++] = value;
+  }
+
+  /** Appends data to the end of the buffer. */
+  void append(const T *begin, const T *end);
+
+  T &operator[](std::size_t index) { return ptr_[index]; }
+  const T &operator[](std::size_t index) const { return ptr_[index]; }
+};
+
+template <typename T>
+void Buffer<T>::append(const T *begin, const T *end) {
+  std::ptrdiff_t num_elements = end - begin;
+  if (size_ + num_elements > capacity_)
+    grow(size_ + num_elements);
+  std::copy(begin, end, internal::make_ptr(ptr_, capacity_) + size_);
+  size_ += num_elements;
+}
+
+namespace internal {
+
+// A memory buffer for POD types with the first SIZE elements stored in
+// the object itself.
+template <typename T, std::size_t SIZE, typename Allocator = std::allocator<T> >
+class MemoryBuffer : private Allocator, public Buffer<T> {
+ private:
+  T data_[SIZE];
+
+  // Free memory allocated by the buffer.
+  void free() {
+    if (this->ptr_ != data_) this->deallocate(this->ptr_, this->capacity_);
+  }
+
+ protected:
+  void grow(std::size_t size);
+
+ public:
+  explicit MemoryBuffer(const Allocator &alloc = Allocator())
+      : Allocator(alloc), Buffer<T>(data_, SIZE) {}
+  ~MemoryBuffer() { free(); }
+
+#if FMT_USE_RVALUE_REFERENCES
+ private:
+  // Move data from other to this buffer.
+  void move(MemoryBuffer &other) {
+    Allocator &this_alloc = *this, &other_alloc = other;
+    this_alloc = std::move(other_alloc);
+    this->size_ = other.size_;
+    this->capacity_ = other.capacity_;
+    if (other.ptr_ == other.data_) {
+      this->ptr_ = data_;
+      std::copy(other.data_,
+                other.data_ + this->size_, make_ptr(data_, this->capacity_));
+    } else {
+      this->ptr_ = other.ptr_;
+      // Set pointer to the inline array so that delete is not called
+      // when freeing.
+      other.ptr_ = other.data_;
+    }
+  }
+
+ public:
+  MemoryBuffer(MemoryBuffer &&other) {
+    move(other);
+  }
+
+  MemoryBuffer &operator=(MemoryBuffer &&other) {
+    assert(this != &other);
+    free();
+    move(other);
+    return *this;
+  }
+#endif
+
+  // Returns a copy of the allocator associated with this buffer.
+  Allocator get_allocator() const { return *this; }
+};
+
+template <typename T, std::size_t SIZE, typename Allocator>
+void MemoryBuffer<T, SIZE, Allocator>::grow(std::size_t size) {
+  std::size_t new_capacity =
+      (std::max)(size, this->capacity_ + this->capacity_ / 2);
+  T *new_ptr = this->allocate(new_capacity);
+  // The following code doesn't throw, so the raw pointer above doesn't leak.
+  std::copy(this->ptr_,
+            this->ptr_ + this->size_, make_ptr(new_ptr, new_capacity));
+  std::size_t old_capacity = this->capacity_;
+  T *old_ptr = this->ptr_;
+  this->capacity_ = new_capacity;
+  this->ptr_ = new_ptr;
+  // deallocate may throw (at least in principle), but it doesn't matter since
+  // the buffer already uses the new storage and will deallocate it in case
+  // of exception.
+  if (old_ptr != data_)
+    this->deallocate(old_ptr, old_capacity);
+}
+
+// A fixed-size buffer.
+template <typename Char>
+class FixedBuffer : public fmt::Buffer<Char> {
+ public:
+  FixedBuffer(Char *array, std::size_t size) : fmt::Buffer<Char>(array, size) {}
+
+ protected:
+  void grow(std::size_t size);
+};
+
+#ifndef _MSC_VER
+// Portable version of signbit.
+inline int getsign(double x) {
+  // When compiled in C++11 mode signbit is no longer a macro but a function
+  // defined in namespace std and the macro is undefined.
+# ifdef signbit
+  return signbit(x);
+# else
+  return std::signbit(x);
+# endif
+}
+
+// Portable version of isinf.
+# ifdef isinf
+inline int isinfinity(double x) { return isinf(x); }
+inline int isinfinity(long double x) { return isinf(x); }
+# else
+inline int isinfinity(double x) { return std::isinf(x); }
+inline int isinfinity(long double x) { return std::isinf(x); }
+# endif
+#else
+inline int getsign(double value) {
+  if (value < 0) return 1;
+  if (value == value) return 0;
+  int dec = 0, sign = 0;
+  char buffer[2];  // The buffer size must be >= 2 or _ecvt_s will fail.
+  _ecvt_s(buffer, sizeof(buffer), value, 0, &dec, &sign);
+  return sign;
+}
+inline int isinfinity(double x) { return !_finite(x); }
+inline int isinfinity(long double x) {
+  return !_finite(static_cast<double>(x));
+}
+#endif
+
+template <typename Char>
+class BasicCharTraits {
+ public:
+#if _SECURE_SCL
+  typedef stdext::checked_array_iterator<Char*> CharPtr;
+#else
+  typedef Char *CharPtr;
+#endif
+};
+
+template <typename Char>
+class CharTraits;
+
+template <>
+class CharTraits<char> : public BasicCharTraits<char> {
+ private:
+  // Conversion from wchar_t to char is not allowed.
+  static char convert(wchar_t);
+
+public:
+  static char convert(char value) { return value; }
+
+  // Formats a floating-point number.
+  template <typename T>
+  static int format_float(char *buffer, std::size_t size,
+      const char *format, unsigned width, int precision, T value);
+};
+
+template <>
+class CharTraits<wchar_t> : public BasicCharTraits<wchar_t> {
+ public:
+  static wchar_t convert(char value) { return value; }
+  static wchar_t convert(wchar_t value) { return value; }
+
+  template <typename T>
+  static int format_float(wchar_t *buffer, std::size_t size,
+      const wchar_t *format, unsigned width, int precision, T value);
+};
+
+// Checks if a number is negative - used to avoid warnings.
+template <bool IsSigned>
+struct SignChecker {
+  template <typename T>
+  static bool is_negative(T value) { return value < 0; }
+};
+
+template <>
+struct SignChecker<false> {
+  template <typename T>
+  static bool is_negative(T) { return false; }
+};
+
+// Returns true if value is negative, false otherwise.
+// Same as (value < 0) but doesn't produce warnings if T is an unsigned type.
+template <typename T>
+inline bool is_negative(T value) {
+  return SignChecker<std::numeric_limits<T>::is_signed>::is_negative(value);
+}
+
+// Selects uint32_t if FitsIn32Bits is true, uint64_t otherwise.
+template <bool FitsIn32Bits>
+struct TypeSelector { typedef uint32_t Type; };
+
+template <>
+struct TypeSelector<false> { typedef uint64_t Type; };
+
+template <typename T>
+struct IntTraits {
+  // Smallest of uint32_t and uint64_t that is large enough to represent
+  // all values of T.
+  typedef typename
+    TypeSelector<std::numeric_limits<T>::digits <= 32>::Type MainType;
+};
+
+// MakeUnsigned<T>::Type gives an unsigned type corresponding to integer type T.
+template <typename T>
+struct MakeUnsigned { typedef T Type; };
+
+#define FMT_SPECIALIZE_MAKE_UNSIGNED(T, U) \
+  template <> \
+  struct MakeUnsigned<T> { typedef U Type; }
+
+FMT_SPECIALIZE_MAKE_UNSIGNED(char, unsigned char);
+FMT_SPECIALIZE_MAKE_UNSIGNED(signed char, unsigned char);
+FMT_SPECIALIZE_MAKE_UNSIGNED(short, unsigned short);
+FMT_SPECIALIZE_MAKE_UNSIGNED(int, unsigned);
+FMT_SPECIALIZE_MAKE_UNSIGNED(long, unsigned long);
+FMT_SPECIALIZE_MAKE_UNSIGNED(LongLong, ULongLong);
+
+void report_unknown_type(char code, const char *type);
+
+// Static data is placed in this class template to allow header-only
+// configuration.
+template <typename T = void>
+struct BasicData {
+  static const uint32_t POWERS_OF_10_32[];
+  static const uint64_t POWERS_OF_10_64[];
+  static const char DIGITS[];
+};
+
+typedef BasicData<> Data;
+
+#if FMT_GCC_VERSION >= 400 || FMT_HAS_BUILTIN(__builtin_clz)
+# define FMT_BUILTIN_CLZ(n) __builtin_clz(n)
+#endif
+
+#if FMT_GCC_VERSION >= 400 || FMT_HAS_BUILTIN(__builtin_clzll)
+# define FMT_BUILTIN_CLZLL(n) __builtin_clzll(n)
+#endif
+
+#ifdef FMT_BUILTIN_CLZLL
+// Returns the number of decimal digits in n. Leading zeros are not counted
+// except for n == 0 in which case count_digits returns 1.
+inline unsigned count_digits(uint64_t n) {
+  // Based on http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog10
+  // and the benchmark https://github.com/localvoid/cxx-benchmark-count-digits.
+  unsigned t = (64 - FMT_BUILTIN_CLZLL(n | 1)) * 1233 >> 12;
+  return t - (n < Data::POWERS_OF_10_64[t]) + 1;
+}
+#else
+// Fallback version of count_digits used when __builtin_clz is not available.
+inline unsigned count_digits(uint64_t n) {
+  unsigned count = 1;
+  for (;;) {
+    // Integer division is slow so do it for a group of four digits instead
+    // of for every digit. The idea comes from the talk by Alexandrescu
+    // "Three Optimization Tips for C++". See speed-test for a comparison.
+    if (n < 10) return count;
+    if (n < 100) return count + 1;
+    if (n < 1000) return count + 2;
+    if (n < 10000) return count + 3;
+    n /= 10000u;
+    count += 4;
+  }
+}
+#endif
+
+#ifdef FMT_BUILTIN_CLZ
+// Optional version of count_digits for better performance on 32-bit platforms.
+inline unsigned count_digits(uint32_t n) {
+  uint32_t t = (32 - FMT_BUILTIN_CLZ(n | 1)) * 1233 >> 12;
+  return t - (n < Data::POWERS_OF_10_32[t]) + 1;
+}
+#endif
+
+// Formats a decimal unsigned integer value writing into buffer.
+template <typename UInt, typename Char>
+inline void format_decimal(Char *buffer, UInt value, unsigned num_digits) {
+  --num_digits;
+  while (value >= 100) {
+    // Integer division is slow so do it for a group of two digits instead
+    // of for every digit. The idea comes from the talk by Alexandrescu
+    // "Three Optimization Tips for C++". See speed-test for a comparison.
+    unsigned index = (value % 100) * 2;
+    value /= 100;
+    buffer[num_digits] = Data::DIGITS[index + 1];
+    buffer[num_digits - 1] = Data::DIGITS[index];
+    num_digits -= 2;
+  }
+  if (value < 10) {
+    *buffer = static_cast<char>('0' + value);
+    return;
+  }
+  unsigned index = static_cast<unsigned>(value * 2);
+  buffer[1] = Data::DIGITS[index + 1];
+  buffer[0] = Data::DIGITS[index];
+}
+
+#ifdef _WIN32
+// A converter from UTF-8 to UTF-16.
+// It is only provided for Windows since other systems support UTF-8 natively.
+class UTF8ToUTF16 {
+ private:
+  MemoryBuffer<wchar_t, INLINE_BUFFER_SIZE> buffer_;
+
+ public:
+  explicit UTF8ToUTF16(StringRef s);
+  operator WStringRef() const { return WStringRef(&buffer_[0], size()); }
+  size_t size() const { return buffer_.size() - 1; }
+  const wchar_t *c_str() const { return &buffer_[0]; }
+  std::wstring str() const { return std::wstring(&buffer_[0], size()); }
+};
+
+// A converter from UTF-16 to UTF-8.
+// It is only provided for Windows since other systems support UTF-8 natively.
+class UTF16ToUTF8 {
+ private:
+  MemoryBuffer<char, INLINE_BUFFER_SIZE> buffer_;
+
+ public:
+  UTF16ToUTF8() {}
+  explicit UTF16ToUTF8(WStringRef s);
+  operator StringRef() const { return StringRef(&buffer_[0], size()); }
+  size_t size() const { return buffer_.size() - 1; }
+  const char *c_str() const { return &buffer_[0]; }
+  std::string str() const { return std::string(&buffer_[0], size()); }
+
+  // Performs conversion returning a system error code instead of
+  // throwing exception on conversion error. This method may still throw
+  // in case of memory allocation error.
+  int convert(WStringRef s);
+};
+#endif
+
+void format_system_error(fmt::Writer &out, int error_code,
+                         fmt::StringRef message) FMT_NOEXCEPT;
+
+#ifdef _WIN32
+void format_windows_error(fmt::Writer &out, int error_code,
+                          fmt::StringRef message) FMT_NOEXCEPT;
+#endif
+
+// A formatting argument value.
+struct Value {
+  template <typename Char>
+  struct StringValue {
+    const Char *value;
+    std::size_t size;
+  };
+
+  typedef void (*FormatFunc)(
+      void *formatter, const void *arg, void *format_str_ptr);
+
+  struct CustomValue {
+    const void *value;
+    FormatFunc format;
+  };
+
+  union {
+    int int_value;
+    unsigned uint_value;
+    LongLong long_long_value;
+    ULongLong ulong_long_value;
+    double double_value;
+    long double long_double_value;
+    const void *pointer;
+    StringValue<char> string;
+    StringValue<signed char> sstring;
+    StringValue<unsigned char> ustring;
+    StringValue<wchar_t> wstring;
+    CustomValue custom;
+  };
+
+  enum Type {
+    NONE,
+    // Integer types should go first,
+    INT, UINT, LONG_LONG, ULONG_LONG, CHAR, LAST_INTEGER_TYPE = CHAR,
+    // followed by floating-point types.
+    DOUBLE, LONG_DOUBLE, LAST_NUMERIC_TYPE = LONG_DOUBLE,
+    CSTRING, STRING, WSTRING, POINTER, CUSTOM
+  };
+};
+
+// A formatting argument. It is a POD type to allow storage in
+// internal::MemoryBuffer.
+struct Arg : Value {
+  Type type;
+};
+
+template <typename T = void>
+struct None {};
+
+// A helper class template to enable or disable overloads taking wide
+// characters and strings in MakeValue.
+template <typename T, typename Char>
+struct WCharHelper {
+  typedef None<T> Supported;
+  typedef T Unsupported;
+};
+
+template <typename T>
+struct WCharHelper<T, wchar_t> {
+  typedef T Supported;
+  typedef None<T> Unsupported;
+};
+
+template <typename T>
+class IsConvertibleToInt {
+ private:
+  typedef char yes[1];
+  typedef char no[2];
+
+  static const T &get();
+
+  static yes &check(fmt::ULongLong);
+  static no &check(...);
+  
+ public:
+  enum { value = (sizeof(check(get())) == sizeof(yes)) };
+};
+
+#define FMT_CONVERTIBLE_TO_INT(Type) \
+  template <> \
+  class IsConvertibleToInt<Type> { \
+   public: \
+    enum { value = 1 }; \
+  }
+
+// Silence warnings about convering float to int.
+FMT_CONVERTIBLE_TO_INT(float);
+FMT_CONVERTIBLE_TO_INT(double);
+FMT_CONVERTIBLE_TO_INT(long double);
+
+template<bool B, class T = void>
+struct EnableIf {};
+
+template<class T>
+struct EnableIf<true, T> { typedef T type; };
+
+template<bool B, class T, class F>
+struct Conditional { typedef T type; };
+
+template<class T, class F>
+struct Conditional<false, T, F> { typedef F type; };
+
+// A helper function to suppress bogus "conditional expression is constant"
+// warnings.
+inline bool check(bool value) { return value; }
+
+// Makes an Arg object from any type.
+template <typename Char>
+class MakeValue : public Arg {
+ private:
+  // The following two methods are private to disallow formatting of
+  // arbitrary pointers. If you want to output a pointer cast it to
+  // "void *" or "const void *". In particular, this forbids formatting
+  // of "[const] volatile char *" which is printed as bool by iostreams.
+  // Do not implement!
+  template <typename T>
+  MakeValue(const T *value);
+  template <typename T>
+  MakeValue(T *value);
+
+  // The following methods are private to disallow formatting of wide
+  // characters and strings into narrow strings as in
+  //   fmt::format("{}", L"test");
+  // To fix this, use a wide format string: fmt::format(L"{}", L"test").
+  MakeValue(typename WCharHelper<wchar_t, Char>::Unsupported);
+  MakeValue(typename WCharHelper<wchar_t *, Char>::Unsupported);
+  MakeValue(typename WCharHelper<const wchar_t *, Char>::Unsupported);
+  MakeValue(typename WCharHelper<const std::wstring &, Char>::Unsupported);
+  MakeValue(typename WCharHelper<WStringRef, Char>::Unsupported);
+
+  void set_string(StringRef str) {
+    string.value = str.c_str();
+    string.size = str.size();
+  }
+
+  void set_string(WStringRef str) {
+    wstring.value = str.c_str();
+    wstring.size = str.size();
+  }
+
+  // Formats an argument of a custom type, such as a user-defined class.
+  template <typename T>
+  static void format_custom_arg(
+      void *formatter, const void *arg, void *format_str_ptr) {
+    format(*static_cast<BasicFormatter<Char>*>(formatter),
+           *static_cast<const Char**>(format_str_ptr),
+           *static_cast<const T*>(arg));
+  }
+
+ public:
+  MakeValue() {}
+
+#define FMT_MAKE_VALUE(Type, field, TYPE) \
+  MakeValue(Type value) { field = value; } \
+  static uint64_t type(Type) { return Arg::TYPE; }
+
+  FMT_MAKE_VALUE(bool, int_value, INT)
+  FMT_MAKE_VALUE(short, int_value, INT)
+  FMT_MAKE_VALUE(unsigned short, uint_value, UINT)
+  FMT_MAKE_VALUE(int, int_value, INT)
+  FMT_MAKE_VALUE(unsigned, uint_value, UINT)
+
+  MakeValue(long value) {
+    // To minimize the number of types we need to deal with, long is
+    // translated either to int or to long long depending on its size.
+    if (check(sizeof(long) == sizeof(int)))
+      int_value = static_cast<int>(value);
+    else
+      long_long_value = value;
+  }
+  static uint64_t type(long) {
+    return sizeof(long) == sizeof(int) ? Arg::INT : Arg::LONG_LONG;
+  }
+
+  MakeValue(unsigned long value) {
+    if (check(sizeof(unsigned long) == sizeof(unsigned)))
+      uint_value = static_cast<unsigned>(value);
+    else
+      ulong_long_value = value;
+  }
+  static uint64_t type(unsigned long) {
+    return sizeof(unsigned long) == sizeof(unsigned) ?
+          Arg::UINT : Arg::ULONG_LONG;
+  }
+
+  FMT_MAKE_VALUE(LongLong, long_long_value, LONG_LONG)
+  FMT_MAKE_VALUE(ULongLong, ulong_long_value, ULONG_LONG)
+  FMT_MAKE_VALUE(float, double_value, DOUBLE)
+  FMT_MAKE_VALUE(double, double_value, DOUBLE)
+  FMT_MAKE_VALUE(long double, long_double_value, LONG_DOUBLE)
+  FMT_MAKE_VALUE(signed char, int_value, CHAR)
+  FMT_MAKE_VALUE(unsigned char, int_value, CHAR)
+  FMT_MAKE_VALUE(char, int_value, CHAR)
+
+  MakeValue(typename WCharHelper<wchar_t, Char>::Supported value) {
+    int_value = value;
+  }
+  static uint64_t type(wchar_t) { return Arg::CHAR; }
+
+#define FMT_MAKE_STR_VALUE(Type, TYPE) \
+  MakeValue(Type value) { set_string(value); } \
+  static uint64_t type(Type) { return Arg::TYPE; }
+
+  FMT_MAKE_VALUE(char *, string.value, CSTRING)
+  FMT_MAKE_VALUE(const char *, string.value, CSTRING)
+  FMT_MAKE_VALUE(const signed char *, sstring.value, CSTRING)
+  FMT_MAKE_VALUE(const unsigned char *, ustring.value, CSTRING)
+  FMT_MAKE_STR_VALUE(const std::string &, STRING)
+  FMT_MAKE_STR_VALUE(StringRef, STRING)
+
+#define FMT_MAKE_WSTR_VALUE(Type, TYPE) \
+  MakeValue(typename WCharHelper<Type, Char>::Supported value) { \
+    set_string(value); \
+  } \
+  static uint64_t type(Type) { return Arg::TYPE; }
+
+  FMT_MAKE_WSTR_VALUE(wchar_t *, WSTRING)
+  FMT_MAKE_WSTR_VALUE(const wchar_t *, WSTRING)
+  FMT_MAKE_WSTR_VALUE(const std::wstring &, WSTRING)
+  FMT_MAKE_WSTR_VALUE(WStringRef, WSTRING)
+
+  FMT_MAKE_VALUE(void *, pointer, POINTER)
+  FMT_MAKE_VALUE(const void *, pointer, POINTER)
+
+  template <typename T>
+  MakeValue(const T &value,
+            typename EnableIf<!IsConvertibleToInt<T>::value, int>::type = 0) {
+    custom.value = &value;
+    custom.format = &format_custom_arg<T>;
+  }
+
+  template <typename T>
+  MakeValue(const T &value,
+            typename EnableIf<IsConvertibleToInt<T>::value, int>::type = 0) {
+    int_value = value;
+  }
+
+  template <typename T>
+  static uint64_t type(const T &) {
+    return IsConvertibleToInt<T>::value ? Arg::INT : Arg::CUSTOM;
+  }
+};
+
+#define FMT_DISPATCH(call) static_cast<Impl*>(this)->call
+
+// An argument visitor.
+// To use ArgVisitor define a subclass that implements some or all of the
+// visit methods with the same signatures as the methods in ArgVisitor,
+// for example, visit_int(int).
+// Specify the subclass name as the Impl template parameter. Then calling
+// ArgVisitor::visit for some argument will dispatch to a visit method
+// specific to the argument type. For example, if the argument type is
+// double then visit_double(double) method of a subclass will be called.
+// If the subclass doesn't contain a method with this signature, then
+// a corresponding method of ArgVisitor will be called.
+//
+// Example:
+//  class MyArgVisitor : public ArgVisitor<MyArgVisitor, void> {
+//   public:
+//    void visit_int(int value) { print("{}", value); }
+//    void visit_double(double value) { print("{}", value ); }
+//  };
+//
+// ArgVisitor uses the curiously recurring template pattern:
+// http://en.wikipedia.org/wiki/Curiously_recurring_template_pattern
+template <typename Impl, typename Result>
+class ArgVisitor {
+ public:
+  void report_unhandled_arg() {}
+
+  Result visit_unhandled_arg() {
+    FMT_DISPATCH(report_unhandled_arg());
+    return Result();
+  }
+
+  Result visit_int(int value) {
+    return FMT_DISPATCH(visit_any_int(value));
+  }
+  Result visit_long_long(LongLong value) {
+    return FMT_DISPATCH(visit_any_int(value));
+  }
+  Result visit_uint(unsigned value) {
+    return FMT_DISPATCH(visit_any_int(value));
+  }
+  Result visit_ulong_long(ULongLong value) {
+    return FMT_DISPATCH(visit_any_int(value));
+  }
+  Result visit_char(int value) {
+    return FMT_DISPATCH(visit_any_int(value));
+  }
+  template <typename T>
+  Result visit_any_int(T) {
+    return FMT_DISPATCH(visit_unhandled_arg());
+  }
+
+  Result visit_double(double value) {
+    return FMT_DISPATCH(visit_any_double(value));
+  }
+  Result visit_long_double(long double value) {
+    return FMT_DISPATCH(visit_any_double(value));
+  }
+  template <typename T>
+  Result visit_any_double(T) {
+    return FMT_DISPATCH(visit_unhandled_arg());
+  }
+
+  Result visit_string(Arg::StringValue<char>) {
+    return FMT_DISPATCH(visit_unhandled_arg());
+  }
+  Result visit_wstring(Arg::StringValue<wchar_t>) {
+    return FMT_DISPATCH(visit_unhandled_arg());
+  }
+  Result visit_pointer(const void *) {
+    return FMT_DISPATCH(visit_unhandled_arg());
+  }
+  Result visit_custom(Arg::CustomValue) {
+    return FMT_DISPATCH(visit_unhandled_arg());
+  }
+
+  Result visit(const Arg &arg) {
+    switch (arg.type) {
+    default:
+      assert(false);
+      return Result();
+    case Arg::INT:
+      return FMT_DISPATCH(visit_int(arg.int_value));
+    case Arg::UINT:
+      return FMT_DISPATCH(visit_uint(arg.uint_value));
+    case Arg::LONG_LONG:
+      return FMT_DISPATCH(visit_long_long(arg.long_long_value));
+    case Arg::ULONG_LONG:
+      return FMT_DISPATCH(visit_ulong_long(arg.ulong_long_value));
+    case Arg::DOUBLE:
+      return FMT_DISPATCH(visit_double(arg.double_value));
+    case Arg::LONG_DOUBLE:
+      return FMT_DISPATCH(visit_long_double(arg.long_double_value));
+    case Arg::CHAR:
+      return FMT_DISPATCH(visit_char(arg.int_value));
+    case Arg::CSTRING: {
+      Arg::StringValue<char> str = arg.string;
+      str.size = 0;
+      return FMT_DISPATCH(visit_string(str));
+    }
+    case Arg::STRING:
+      return FMT_DISPATCH(visit_string(arg.string));
+    case Arg::WSTRING:
+      return FMT_DISPATCH(visit_wstring(arg.wstring));
+    case Arg::POINTER:
+      return FMT_DISPATCH(visit_pointer(arg.pointer));
+    case Arg::CUSTOM:
+      return FMT_DISPATCH(visit_custom(arg.custom));
+    }
+  }
+};
+
+class RuntimeError : public std::runtime_error {
+ protected:
+  RuntimeError() : std::runtime_error("") {}
+};
+
+template <typename Char>
+class ArgFormatter;
+}  // namespace internal
+
+/** An argument list. */
+class ArgList {
+ private:
+  // To reduce compiled code size per formatting function call, types of first
+  // MAX_PACKED_ARGS arguments are passed in the types_ field.
+  uint64_t types_;
+  union {
+    // If the number of arguments is less than MAX_PACKED_ARGS, the argument
+    // values are stored in values_, otherwise they are stored in args_.
+    // This is done to reduce compiled code size as storing larger objects
+    // may require more code (at least on x86-64) even if the same amount of
+    // data is actually copied to stack. It saves ~10% on the bloat test.
+    const internal::Value *values_;
+    const internal::Arg *args_;
+  };
+
+  internal::Arg::Type type(unsigned index) const {
+    unsigned shift = index * 4;
+    uint64_t mask = 0xf;
+    return static_cast<internal::Arg::Type>(
+          (types_ & (mask << shift)) >> shift);
+  }
+
+ public:
+  // Maximum number of arguments with packed types.
+  enum { MAX_PACKED_ARGS = 16 };
+
+  ArgList() : types_(0) {}
+
+  // TODO: MakeArgList(const Args &...)
+  ArgList(ULongLong types, const internal::Value *values)
+  : types_(types), values_(values) {}
+  ArgList(ULongLong types, const internal::Arg *args)
+  : types_(types), args_(args) {}
+
+  /** Returns the argument at specified index. */
+  internal::Arg operator[](unsigned index) const {
+    using internal::Arg;
+    Arg arg;
+    bool use_values = type(MAX_PACKED_ARGS - 1) == Arg::NONE;
+    if (index < MAX_PACKED_ARGS) {
+      Arg::Type arg_type = type(index);
+      internal::Value &val = arg;
+      if (arg_type != Arg::NONE)
+        val = use_values ? values_[index] : args_[index];
+      arg.type = arg_type;
+      return arg;
+    }
+    if (use_values) {
+      // The index is greater than the number of arguments that can be stored
+      // in values, so return a "none" argument.
+      arg.type = Arg::NONE;
+      return arg;
+    }
+    for (unsigned i = MAX_PACKED_ARGS; i <= index; ++i) {
+      if (args_[i].type == Arg::NONE)
+        return args_[i];
+    }
+    return args_[index];
+  }
+};
+
+struct FormatSpec;
+
+namespace internal {
+
+class FormatterBase {
+ private:
+  ArgList args_;
+  int next_arg_index_;
+
+  // Returns the argument with specified index.
+  Arg do_get_arg(unsigned arg_index, const char *&error);
+
+ protected:
+  void set_args(const ArgList &args) {
+    args_ = args;
+    next_arg_index_ = 0;
+  }
+
+  // Returns the next argument.
+  Arg next_arg(const char *&error);
+
+  // Checks if manual indexing is used and returns the argument with
+  // specified index.
+  Arg get_arg(unsigned arg_index, const char *&error);
+
+  template <typename Char>
+  void write(BasicWriter<Char> &w, const Char *start, const Char *end) {
+    if (start != end)
+      w << BasicStringRef<Char>(start, end - start);
+  }
+};
+
+// A printf formatter.
+template <typename Char>
+class PrintfFormatter : private FormatterBase {
+ private:
+  void parse_flags(FormatSpec &spec, const Char *&s);
+
+  // Returns the argument with specified index or, if arg_index is equal
+  // to the maximum unsigned value, the next argument.
+  Arg get_arg(const Char *s,
+      unsigned arg_index = (std::numeric_limits<unsigned>::max)());
+
+  // Parses argument index, flags and width and returns the argument index.
+  unsigned parse_header(const Char *&s, FormatSpec &spec);
+
+ public:
+  void format(BasicWriter<Char> &writer,
+    BasicStringRef<Char> format_str, const ArgList &args);
+};
+}  // namespace internal
+
+// A formatter.
+template <typename Char>
+class BasicFormatter : private internal::FormatterBase {
+ private:
+  BasicWriter<Char> &writer_;
+  const Char *start_;
+  
+  FMT_DISALLOW_COPY_AND_ASSIGN(BasicFormatter);
+
+  // Parses argument index and returns corresponding argument.
+  internal::Arg parse_arg_index(const Char *&s);
+
+ public:
+  explicit BasicFormatter(BasicWriter<Char> &w) : writer_(w) {}
+
+  BasicWriter<Char> &writer() { return writer_; }
+
+  void format(BasicStringRef<Char> format_str, const ArgList &args);
+
+  const Char *format(const Char *&format_str, const internal::Arg &arg);
+};
+
+enum Alignment {
+  ALIGN_DEFAULT, ALIGN_LEFT, ALIGN_RIGHT, ALIGN_CENTER, ALIGN_NUMERIC
+};
+
+// Flags.
+enum {
+  SIGN_FLAG = 1, PLUS_FLAG = 2, MINUS_FLAG = 4, HASH_FLAG = 8,
+  CHAR_FLAG = 0x10  // Argument has char type - used in error reporting.
+};
+
+// An empty format specifier.
+struct EmptySpec {};
+
+// A type specifier.
+template <char TYPE>
+struct TypeSpec : EmptySpec {
+  Alignment align() const { return ALIGN_DEFAULT; }
+  unsigned width() const { return 0; }
+  int precision() const { return -1; }
+  bool flag(unsigned) const { return false; }
+  char type() const { return TYPE; }
+  char fill() const { return ' '; }
+};
+
+// A width specifier.
+struct WidthSpec {
+  unsigned width_;
+  // Fill is always wchar_t and cast to char if necessary to avoid having
+  // two specialization of WidthSpec and its subclasses.
+  wchar_t fill_;
+
+  WidthSpec(unsigned width, wchar_t fill) : width_(width), fill_(fill) {}
+
+  unsigned width() const { return width_; }
+  wchar_t fill() const { return fill_; }
+};
+
+// An alignment specifier.
+struct AlignSpec : WidthSpec {
+  Alignment align_;
+
+  AlignSpec(unsigned width, wchar_t fill, Alignment align = ALIGN_DEFAULT)
+  : WidthSpec(width, fill), align_(align) {}
+
+  Alignment align() const { return align_; }
+
+  int precision() const { return -1; }
+};
+
+// An alignment and type specifier.
+template <char TYPE>
+struct AlignTypeSpec : AlignSpec {
+  AlignTypeSpec(unsigned width, wchar_t fill) : AlignSpec(width, fill) {}
+
+  bool flag(unsigned) const { return false; }
+  char type() const { return TYPE; }
+};
+
+// A full format specifier.
+struct FormatSpec : AlignSpec {
+  unsigned flags_;
+  int precision_;
+  char type_;
+
+  FormatSpec(
+    unsigned width = 0, char type = 0, wchar_t fill = ' ')
+  : AlignSpec(width, fill), flags_(0), precision_(-1), type_(type) {}
+
+  bool flag(unsigned f) const { return (flags_ & f) != 0; }
+  int precision() const { return precision_; }
+  char type() const { return type_; }
+};
+
+// An integer format specifier.
+template <typename T, typename SpecT = TypeSpec<0>, typename Char = char>
+class IntFormatSpec : public SpecT {
+ private:
+  T value_;
+
+ public:
+  IntFormatSpec(T val, const SpecT &spec = SpecT())
+  : SpecT(spec), value_(val) {}
+
+  T value() const { return value_; }
+};
+
+// A string format specifier.
+template <typename T>
+class StrFormatSpec : public AlignSpec {
+ private:
+  const T *str_;
+
+ public:
+  StrFormatSpec(const T *str, unsigned width, wchar_t fill)
+  : AlignSpec(width, fill), str_(str) {}
+
+  const T *str() const { return str_; }
+};
+
+/**
+  Returns an integer format specifier to format the value in base 2.
+ */
+IntFormatSpec<int, TypeSpec<'b'> > bin(int value);
+
+/**
+  Returns an integer format specifier to format the value in base 8.
+ */
+IntFormatSpec<int, TypeSpec<'o'> > oct(int value);
+
+/**
+  Returns an integer format specifier to format the value in base 16 using
+  lower-case letters for the digits above 9.
+ */
+IntFormatSpec<int, TypeSpec<'x'> > hex(int value);
+
+/**
+  Returns an integer formatter format specifier to format in base 16 using
+  upper-case letters for the digits above 9.
+ */
+IntFormatSpec<int, TypeSpec<'X'> > hexu(int value);
+
+/**
+  \rst
+  Returns an integer format specifier to pad the formatted argument with the
+  fill character to the specified width using the default (right) numeric
+  alignment.
+
+  **Example**::
+
+    MemoryWriter out;
+    out << pad(hex(0xcafe), 8, '0');
+    // out.str() == "0000cafe"
+
+  \endrst
+ */
+template <char TYPE_CODE, typename Char>
+IntFormatSpec<int, AlignTypeSpec<TYPE_CODE>, Char> pad(
+    int value, unsigned width, Char fill = ' ');
+
+#define FMT_DEFINE_INT_FORMATTERS(TYPE) \
+inline IntFormatSpec<TYPE, TypeSpec<'b'> > bin(TYPE value) { \
+  return IntFormatSpec<TYPE, TypeSpec<'b'> >(value, TypeSpec<'b'>()); \
+} \
+ \
+inline IntFormatSpec<TYPE, TypeSpec<'o'> > oct(TYPE value) { \
+  return IntFormatSpec<TYPE, TypeSpec<'o'> >(value, TypeSpec<'o'>()); \
+} \
+ \
+inline IntFormatSpec<TYPE, TypeSpec<'x'> > hex(TYPE value) { \
+  return IntFormatSpec<TYPE, TypeSpec<'x'> >(value, TypeSpec<'x'>()); \
+} \
+ \
+inline IntFormatSpec<TYPE, TypeSpec<'X'> > hexu(TYPE value) { \
+  return IntFormatSpec<TYPE, TypeSpec<'X'> >(value, TypeSpec<'X'>()); \
+} \
+ \
+template <char TYPE_CODE> \
+inline IntFormatSpec<TYPE, AlignTypeSpec<TYPE_CODE> > pad( \
+    IntFormatSpec<TYPE, TypeSpec<TYPE_CODE> > f, unsigned width) { \
+  return IntFormatSpec<TYPE, AlignTypeSpec<TYPE_CODE> >( \
+      f.value(), AlignTypeSpec<TYPE_CODE>(width, ' ')); \
+} \
+ \
+/* For compatibility with older compilers we provide two overloads for pad, */ \
+/* one that takes a fill character and one that doesn't. In the future this */ \
+/* can be replaced with one overload making the template argument Char      */ \
+/* default to char (C++11). */ \
+template <char TYPE_CODE, typename Char> \
+inline IntFormatSpec<TYPE, AlignTypeSpec<TYPE_CODE>, Char> pad( \
+    IntFormatSpec<TYPE, TypeSpec<TYPE_CODE>, Char> f, \
+    unsigned width, Char fill) { \
+  return IntFormatSpec<TYPE, AlignTypeSpec<TYPE_CODE>, Char>( \
+      f.value(), AlignTypeSpec<TYPE_CODE>(width, fill)); \
+} \
+ \
+inline IntFormatSpec<TYPE, AlignTypeSpec<0> > pad( \
+    TYPE value, unsigned width) { \
+  return IntFormatSpec<TYPE, AlignTypeSpec<0> >( \
+      value, AlignTypeSpec<0>(width, ' ')); \
+} \
+ \
+template <typename Char> \
+inline IntFormatSpec<TYPE, AlignTypeSpec<0>, Char> pad( \
+   TYPE value, unsigned width, Char fill) { \
+ return IntFormatSpec<TYPE, AlignTypeSpec<0>, Char>( \
+     value, AlignTypeSpec<0>(width, fill)); \
+}
+
+FMT_DEFINE_INT_FORMATTERS(int)
+FMT_DEFINE_INT_FORMATTERS(long)
+FMT_DEFINE_INT_FORMATTERS(unsigned)
+FMT_DEFINE_INT_FORMATTERS(unsigned long)
+FMT_DEFINE_INT_FORMATTERS(LongLong)
+FMT_DEFINE_INT_FORMATTERS(ULongLong)
+
+/**
+  \rst
+  Returns a string formatter that pads the formatted argument with the fill
+  character to the specified width using the default (left) string alignment.
+
+  **Example**::
+
+    std::string s = str(MemoryWriter() << pad("abc", 8));
+    // s == "abc     "
+
+  \endrst
+ */
+template <typename Char>
+inline StrFormatSpec<Char> pad(
+    const Char *str, unsigned width, Char fill = ' ') {
+  return StrFormatSpec<Char>(str, width, fill);
+}
+
+inline StrFormatSpec<wchar_t> pad(
+    const wchar_t *str, unsigned width, char fill = ' ') {
+  return StrFormatSpec<wchar_t>(str, width, fill);
+}
+
+// Generates a comma-separated list with results of applying f to
+// numbers 0..n-1.
+# define FMT_GEN(n, f) FMT_GEN##n(f)
+# define FMT_GEN1(f)  f(0)
+# define FMT_GEN2(f)  FMT_GEN1(f),  f(1)
+# define FMT_GEN3(f)  FMT_GEN2(f),  f(2)
+# define FMT_GEN4(f)  FMT_GEN3(f),  f(3)
+# define FMT_GEN5(f)  FMT_GEN4(f),  f(4)
+# define FMT_GEN6(f)  FMT_GEN5(f),  f(5)
+# define FMT_GEN7(f)  FMT_GEN6(f),  f(6)
+# define FMT_GEN8(f)  FMT_GEN7(f),  f(7)
+# define FMT_GEN9(f)  FMT_GEN8(f),  f(8)
+# define FMT_GEN10(f) FMT_GEN9(f),  f(9)
+# define FMT_GEN11(f) FMT_GEN10(f), f(10)
+# define FMT_GEN12(f) FMT_GEN11(f), f(11)
+# define FMT_GEN13(f) FMT_GEN12(f), f(12)
+# define FMT_GEN14(f) FMT_GEN13(f), f(13)
+# define FMT_GEN15(f) FMT_GEN14(f), f(14)
+
+namespace internal {
+inline uint64_t make_type() { return 0; }
+
+template <typename T>
+inline uint64_t make_type(const T &arg) { return MakeValue<char>::type(arg); }
+
+#if FMT_USE_VARIADIC_TEMPLATES
+template <typename Arg, typename... Args>
+inline uint64_t make_type(const Arg &first, const Args & ... tail) {
+  return make_type(first) | (make_type(tail...) << 4);
+}
+#else
+
+struct ArgType {
+  uint64_t type;
+
+  ArgType() : type(0) {}
+
+  template <typename T>
+  ArgType(const T &arg) : type(make_type(arg)) {}
+};
+
+# define FMT_ARG_TYPE_DEFAULT(n) ArgType t##n = ArgType()
+
+inline uint64_t make_type(FMT_GEN15(FMT_ARG_TYPE_DEFAULT)) {
+  return t0.type | (t1.type << 4) | (t2.type << 8) | (t3.type << 12) |
+      (t4.type << 16) | (t5.type << 20) | (t6.type << 24) | (t7.type << 28) |
+      (t8.type << 32) | (t9.type << 36) | (t10.type << 40) | (t11.type << 44) |
+      (t12.type << 48) | (t13.type << 52) | (t14.type << 56);
+}
+#endif
+
+template <unsigned N>
+struct ArgArray {
+  // Computes the argument array size by adding 1 to N, which is the number of
+  // arguments, if N is zero, because array of zero size is invalid, or if N
+  // is greater than ArgList::MAX_PACKED_ARGS to accommodate for an extra
+  // argument that marks the end of the list.
+  enum { SIZE = N + (N == 0 || N >= ArgList::MAX_PACKED_ARGS ? 1 : 0) };
+
+  typedef typename Conditional<
+    (N < ArgList::MAX_PACKED_ARGS), Value, Arg>::type Type[SIZE];
+};
+}  // namespace internal
+
+# define FMT_MAKE_TEMPLATE_ARG(n) typename T##n
+# define FMT_MAKE_ARG_TYPE(n) T##n
+# define FMT_MAKE_ARG(n) const T##n &v##n
+# define FMT_MAKE_REF_char(n) fmt::internal::MakeValue<char>(v##n)
+# define FMT_MAKE_REF_wchar_t(n) fmt::internal::MakeValue<wchar_t>(v##n)
+
+#if FMT_USE_VARIADIC_TEMPLATES
+// Defines a variadic function returning void.
+# define FMT_VARIADIC_VOID(func, arg_type) \
+  template <typename... Args> \
+  void func(arg_type arg0, const Args & ... args) { \
+    typename fmt::internal::ArgArray<sizeof...(Args)>::Type array = { \
+      fmt::internal::MakeValue<Char>(args)... \
+    }; \
+    func(arg0, ArgList(fmt::internal::make_type(args...), array)); \
+  }
+
+// Defines a variadic constructor.
+# define FMT_VARIADIC_CTOR(ctor, func, arg0_type, arg1_type) \
+  template <typename... Args> \
+  ctor(arg0_type arg0, arg1_type arg1, const Args & ... args) { \
+    typename fmt::internal::ArgArray<sizeof...(Args)>::Type array = { \
+      fmt::internal::MakeValue<Char>(args)... \
+    }; \
+    func(arg0, arg1, ArgList(fmt::internal::make_type(args...), array)); \
+  }
+
+#else
+
+# define FMT_MAKE_REF(n) fmt::internal::MakeValue<Char>(v##n)
+# define FMT_MAKE_REF2(n) v##n
+
+// Defines a wrapper for a function taking one argument of type arg_type
+// and n additional arguments of arbitrary types.
+# define FMT_WRAP1(func, arg_type, n) \
+  template <FMT_GEN(n, FMT_MAKE_TEMPLATE_ARG)> \
+  inline void func(arg_type arg1, FMT_GEN(n, FMT_MAKE_ARG)) { \
+    const fmt::internal::Value values[] = {FMT_GEN(n, FMT_MAKE_REF)}; \
+    func(arg1, fmt::ArgList( \
+      fmt::internal::make_type(FMT_GEN(n, FMT_MAKE_REF2)), values)); \
+  }
+
+// Emulates a variadic function returning void on a pre-C++11 compiler.
+# define FMT_VARIADIC_VOID(func, arg_type) \
+  inline void func(arg_type arg) { func(arg, fmt::ArgList()); } \
+  FMT_WRAP1(func, arg_type, 1) FMT_WRAP1(func, arg_type, 2) \
+  FMT_WRAP1(func, arg_type, 3) FMT_WRAP1(func, arg_type, 4) \
+  FMT_WRAP1(func, arg_type, 5) FMT_WRAP1(func, arg_type, 6) \
+  FMT_WRAP1(func, arg_type, 7) FMT_WRAP1(func, arg_type, 8) \
+  FMT_WRAP1(func, arg_type, 9) FMT_WRAP1(func, arg_type, 10)
+
+# define FMT_CTOR(ctor, func, arg0_type, arg1_type, n) \
+  template <FMT_GEN(n, FMT_MAKE_TEMPLATE_ARG)> \
+  ctor(arg0_type arg0, arg1_type arg1, FMT_GEN(n, FMT_MAKE_ARG)) { \
+    const fmt::internal::Value values[] = {FMT_GEN(n, FMT_MAKE_REF)}; \
+    func(arg0, arg1, fmt::ArgList( \
+      fmt::internal::make_type(FMT_GEN(n, FMT_MAKE_REF2)), values)); \
+  }
+
+// Emulates a variadic constructor on a pre-C++11 compiler.
+# define FMT_VARIADIC_CTOR(ctor, func, arg0_type, arg1_type) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 1) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 2) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 3) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 4) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 5) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 6) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 7) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 8) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 9) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 10)
+#endif
+
+// Generates a comma-separated list with results of applying f to pairs
+// (argument, index).
+#define FMT_FOR_EACH1(f, x0) f(x0, 0)
+#define FMT_FOR_EACH2(f, x0, x1) \
+  FMT_FOR_EACH1(f, x0), f(x1, 1)
+#define FMT_FOR_EACH3(f, x0, x1, x2) \
+  FMT_FOR_EACH2(f, x0 ,x1), f(x2, 2)
+#define FMT_FOR_EACH4(f, x0, x1, x2, x3) \
+  FMT_FOR_EACH3(f, x0, x1, x2), f(x3, 3)
+#define FMT_FOR_EACH5(f, x0, x1, x2, x3, x4) \
+  FMT_FOR_EACH4(f, x0, x1, x2, x3), f(x4, 4)
+#define FMT_FOR_EACH6(f, x0, x1, x2, x3, x4, x5) \
+  FMT_FOR_EACH5(f, x0, x1, x2, x3, x4), f(x5, 5)
+#define FMT_FOR_EACH7(f, x0, x1, x2, x3, x4, x5, x6) \
+  FMT_FOR_EACH6(f, x0, x1, x2, x3, x4, x5), f(x6, 6)
+#define FMT_FOR_EACH8(f, x0, x1, x2, x3, x4, x5, x6, x7) \
+  FMT_FOR_EACH7(f, x0, x1, x2, x3, x4, x5, x6), f(x7, 7)
+#define FMT_FOR_EACH9(f, x0, x1, x2, x3, x4, x5, x6, x7, x8) \
+  FMT_FOR_EACH8(f, x0, x1, x2, x3, x4, x5, x6, x7), f(x8, 8)
+#define FMT_FOR_EACH10(f, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9) \
+  FMT_FOR_EACH9(f, x0, x1, x2, x3, x4, x5, x6, x7, x8), f(x9, 9)
+
+/**
+ An error returned by an operating system or a language runtime,
+ for example a file opening error.
+*/
+class SystemError : public internal::RuntimeError {
+ private:
+  void init(int err_code, StringRef format_str, ArgList args);
+
+ protected:
+  int error_code_;
+
+  typedef char Char;  // For FMT_VARIADIC_CTOR.
+
+  SystemError() {}
+
+ public:
+  /**
+   \rst
+   Constructs a :class:`fmt::SystemError` object with the description
+   of the form
+
+   .. parsed-literal::
+     *<message>*: *<system-message>*
+
+   where *<message>* is the formatted message and *<system-message>* is
+   the system message corresponding to the error code.
+   *error_code* is a system error code as given by ``errno``.
+   If *error_code* is not a valid error code such as -1, the system message
+   may look like "Unknown error -1" and is platform-dependent.
+   
+   **Example**::
+
+     // This throws a SystemError with the description
+     //   cannot open file 'madeup': No such file or directory
+     // or similar (system message may vary).
+     const char *filename = "madeup";
+     std::FILE *file = std::fopen(filename, "r");
+     if (!file)
+       throw fmt::SystemError(errno, "cannot open file '{}'", filename);
+   \endrst
+  */
+  SystemError(int error_code, StringRef message) {
+    init(error_code, message, ArgList());
+  }
+  FMT_VARIADIC_CTOR(SystemError, init, int, StringRef)
+
+  int error_code() const { return error_code_; }
+};
+
+/**
+  \rst
+  This template provides operations for formatting and writing data into
+  a character stream. The output is stored in a buffer provided by a subclass
+  such as :class:`fmt::BasicMemoryWriter`.
+
+  You can use one of the following typedefs for common character types:
+
+  +---------+----------------------+
+  | Type    | Definition           |
+  +=========+======================+
+  | Writer  | BasicWriter<char>    |
+  +---------+----------------------+
+  | WWriter | BasicWriter<wchar_t> |
+  +---------+----------------------+
+
+  \endrst
+ */
+template <typename Char>
+class BasicWriter {
+ private:
+  // Output buffer.
+  Buffer<Char> &buffer_;
+
+  FMT_DISALLOW_COPY_AND_ASSIGN(BasicWriter);
+
+  typedef typename internal::CharTraits<Char>::CharPtr CharPtr;
+
+#if _SECURE_SCL
+  // Returns pointer value.
+  static Char *get(CharPtr p) { return p.base(); }
+#else
+  static Char *get(Char *p) { return p; }
+#endif
+
+  // Fills the padding around the content and returns the pointer to the
+  // content area.
+  static CharPtr fill_padding(CharPtr buffer,
+      unsigned total_size, std::size_t content_size, wchar_t fill);
+
+  // Grows the buffer by n characters and returns a pointer to the newly
+  // allocated area.
+  CharPtr grow_buffer(std::size_t n) {
+    std::size_t size = buffer_.size();
+    buffer_.resize(size + n);
+    return internal::make_ptr(&buffer_[size], n);
+  }
+
+  // Prepare a buffer for integer formatting.
+  CharPtr prepare_int_buffer(unsigned num_digits,
+      const EmptySpec &, const char *prefix, unsigned prefix_size) {
+    unsigned size = prefix_size + num_digits;
+    CharPtr p = grow_buffer(size);
+    std::copy(prefix, prefix + prefix_size, p);
+    return p + size - 1;
+  }
+
+  template <typename Spec>
+  CharPtr prepare_int_buffer(unsigned num_digits,
+    const Spec &spec, const char *prefix, unsigned prefix_size);
+
+  // Formats an integer.
+  template <typename T, typename Spec>
+  void write_int(T value, Spec spec);
+
+  // Formats a floating-point number (double or long double).
+  template <typename T>
+  void write_double(T value, const FormatSpec &spec);
+
+  // Writes a formatted string.
+  template <typename StrChar>
+  CharPtr write_str(
+      const StrChar *s, std::size_t size, const AlignSpec &spec);
+
+  template <typename StrChar>
+  void write_str(
+      const internal::Arg::StringValue<StrChar> &str, const FormatSpec &spec);
+
+  // This following methods are private to disallow writing wide characters
+  // and strings to a char stream. If you want to print a wide string as a
+  // pointer as std::ostream does, cast it to const void*.
+  // Do not implement!
+  void operator<<(typename internal::WCharHelper<wchar_t, Char>::Unsupported);
+  void operator<<(
+      typename internal::WCharHelper<const wchar_t *, Char>::Unsupported);
+
+  // Appends floating-point length specifier to the format string.
+  // The second argument is only used for overload resolution.
+  void append_float_length(Char *&format_ptr, long double) {
+    *format_ptr++ = 'L';
+  }
+
+  template<typename T>
+  void append_float_length(Char *&, T) {}
+
+  friend class internal::ArgFormatter<Char>;
+  friend class internal::PrintfFormatter<Char>;
+
+ protected:
+  /**
+    Constructs a ``BasicWriter`` object.
+   */
+  explicit BasicWriter(Buffer<Char> &b) : buffer_(b) {}
+
+ public:
+  /**
+    Destroys a ``BasicWriter`` object.
+   */
+  virtual ~BasicWriter() {}
+
+  /**
+    Returns the total number of characters written.
+   */
+  std::size_t size() const { return buffer_.size(); }
+
+  /**
+    Returns a pointer to the output buffer content. No terminating null
+    character is appended.
+   */
+  const Char *data() const FMT_NOEXCEPT { return &buffer_[0]; }
+
+  /**
+    Returns a pointer to the output buffer content with terminating null
+    character appended.
+   */
+  const Char *c_str() const {
+    std::size_t size = buffer_.size();
+    buffer_.reserve(size + 1);
+    buffer_[size] = '\0';
+    return &buffer_[0];
+  }
+
+  /**
+    Returns the content of the output buffer as an `std::string`.
+   */
+  std::basic_string<Char> str() const {
+    return std::basic_string<Char>(&buffer_[0], buffer_.size());
+  }
+
+  /**
+    \rst
+    Writes formatted data.
+    
+    *args* is an argument list representing arbitrary arguments.
+
+    **Example**::
+
+       MemoryWriter out;
+       out.write("Current point:\n");
+       out.write("({:+f}, {:+f})", -3.14, 3.14);
+
+    This will write the following output to the ``out`` object:
+
+    .. code-block:: none
+
+       Current point:
+       (-3.140000, +3.140000)
+
+    The output can be accessed using :func:`data()`, :func:`c_str` or
+    :func:`str` methods.
+
+    See also :ref:`syntax`.
+    \endrst
+   */
+  void write(BasicStringRef<Char> format, ArgList args) {
+    BasicFormatter<Char>(*this).format(format, args);
+  }
+  FMT_VARIADIC_VOID(write, BasicStringRef<Char>)
+
+  BasicWriter &operator<<(int value) {
+    return *this << IntFormatSpec<int>(value);
+  }
+  BasicWriter &operator<<(unsigned value) {
+    return *this << IntFormatSpec<unsigned>(value);
+  }
+  BasicWriter &operator<<(long value) {
+    return *this << IntFormatSpec<long>(value);
+  }
+  BasicWriter &operator<<(unsigned long value) {
+    return *this << IntFormatSpec<unsigned long>(value);
+  }
+  BasicWriter &operator<<(LongLong value) {
+    return *this << IntFormatSpec<LongLong>(value);
+  }
+
+  /**
+    Formats *value* and writes it to the stream.
+   */
+  BasicWriter &operator<<(ULongLong value) {
+    return *this << IntFormatSpec<ULongLong>(value);
+  }
+
+  BasicWriter &operator<<(double value) {
+    write_double(value, FormatSpec());
+    return *this;
+  }
+
+  /**
+    Formats *value* using the general format for floating-point numbers
+    (``'g'``) and writes it to the stream.
+   */
+  BasicWriter &operator<<(long double value) {
+    write_double(value, FormatSpec());
+    return *this;
+  }
+
+  /**
+    Writes a character to the stream.
+   */
+  BasicWriter &operator<<(char value) {
+    buffer_.push_back(value);
+    return *this;
+  }
+
+  BasicWriter &operator<<(
+      typename internal::WCharHelper<wchar_t, Char>::Supported value) {
+    buffer_.push_back(value);
+    return *this;
+  }
+
+  /**
+    Writes *value* to the stream.
+   */
+  BasicWriter &operator<<(fmt::BasicStringRef<Char> value) {
+    const Char *str = value.c_str();
+    buffer_.append(str, str + value.size());
+    return *this;
+  }
+
+  template <typename T, typename Spec, typename FillChar>
+  BasicWriter &operator<<(IntFormatSpec<T, Spec, FillChar> spec) {
+    internal::CharTraits<Char>::convert(FillChar());
+    write_int(spec.value(), spec);
+    return *this;
+  }
+
+  template <typename StrChar>
+  BasicWriter &operator<<(const StrFormatSpec<StrChar> &spec) {
+    const StrChar *s = spec.str();
+    // TODO: error if fill is not convertible to Char
+    write_str(s, std::char_traits<Char>::length(s), spec);
+    return *this;
+  }
+
+  void clear() FMT_NOEXCEPT { buffer_.clear(); }
+};
+
+template <typename Char>
+template <typename StrChar>
+typename BasicWriter<Char>::CharPtr BasicWriter<Char>::write_str(
+      const StrChar *s, std::size_t size, const AlignSpec &spec) {
+  CharPtr out = CharPtr();
+  if (spec.width() > size) {
+    out = grow_buffer(spec.width());
+    Char fill = static_cast<Char>(spec.fill());
+    if (spec.align() == ALIGN_RIGHT) {
+      std::fill_n(out, spec.width() - size, fill);
+      out += spec.width() - size;
+    } else if (spec.align() == ALIGN_CENTER) {
+      out = fill_padding(out, spec.width(), size, fill);
+    } else {
+      std::fill_n(out + size, spec.width() - size, fill);
+    }
+  } else {
+    out = grow_buffer(size);
+  }
+  std::copy(s, s + size, out);
+  return out;
+}
+
+template <typename Char>
+typename BasicWriter<Char>::CharPtr
+  BasicWriter<Char>::fill_padding(
+    CharPtr buffer, unsigned total_size,
+    std::size_t content_size, wchar_t fill) {
+  std::size_t padding = total_size - content_size;
+  std::size_t left_padding = padding / 2;
+  Char fill_char = static_cast<Char>(fill);
+  std::fill_n(buffer, left_padding, fill_char);
+  buffer += left_padding;
+  CharPtr content = buffer;
+  std::fill_n(buffer + content_size, padding - left_padding, fill_char);
+  return content;
+}
+
+template <typename Char>
+template <typename Spec>
+typename BasicWriter<Char>::CharPtr
+  BasicWriter<Char>::prepare_int_buffer(
+    unsigned num_digits, const Spec &spec,
+    const char *prefix, unsigned prefix_size) {
+  unsigned width = spec.width();
+  Alignment align = spec.align();
+  Char fill = static_cast<Char>(spec.fill());
+  if (spec.precision() > static_cast<int>(num_digits)) {
+    // Octal prefix '0' is counted as a digit, so ignore it if precision
+    // is specified.
+    if (prefix_size > 0 && prefix[prefix_size - 1] == '0')
+      --prefix_size;
+    unsigned number_size = prefix_size + spec.precision();
+    AlignSpec subspec(number_size, '0', ALIGN_NUMERIC);
+    if (number_size >= width)
+      return prepare_int_buffer(num_digits, subspec, prefix, prefix_size);
+    buffer_.reserve(width);
+    unsigned fill_size = width - number_size;
+    if (align != ALIGN_LEFT) {
+      CharPtr p = grow_buffer(fill_size);
+      std::fill(p, p + fill_size, fill);
+    }
+    CharPtr result = prepare_int_buffer(
+        num_digits, subspec, prefix, prefix_size);
+    if (align == ALIGN_LEFT) {
+      CharPtr p = grow_buffer(fill_size);
+      std::fill(p, p + fill_size, fill);
+    }
+    return result;
+  }
+  unsigned size = prefix_size + num_digits;
+  if (width <= size) {
+    CharPtr p = grow_buffer(size);
+    std::copy(prefix, prefix + prefix_size, p);
+    return p + size - 1;
+  }
+  CharPtr p = grow_buffer(width);
+  CharPtr end = p + width;
+  if (align == ALIGN_LEFT) {
+    std::copy(prefix, prefix + prefix_size, p);
+    p += size;
+    std::fill(p, end, fill);
+  } else if (align == ALIGN_CENTER) {
+    p = fill_padding(p, width, size, fill);
+    std::copy(prefix, prefix + prefix_size, p);
+    p += size;
+  } else {
+    if (align == ALIGN_NUMERIC) {
+      if (prefix_size != 0) {
+        p = std::copy(prefix, prefix + prefix_size, p);
+        size -= prefix_size;
+      }
+    } else {
+      std::copy(prefix, prefix + prefix_size, end - size);
+    }
+    std::fill(p, end - size, fill);
+    p = end;
+  }
+  return p - 1;
+}
+
+template <typename Char>
+template <typename T, typename Spec>
+void BasicWriter<Char>::write_int(T value, Spec spec) {
+  unsigned prefix_size = 0;
+  typedef typename internal::IntTraits<T>::MainType UnsignedType;
+  UnsignedType abs_value = value;
+  char prefix[4] = "";
+  if (internal::is_negative(value)) {
+    prefix[0] = '-';
+    ++prefix_size;
+    abs_value = 0 - abs_value;
+  } else if (spec.flag(SIGN_FLAG)) {
+    prefix[0] = spec.flag(PLUS_FLAG) ? '+' : ' ';
+    ++prefix_size;
+  }
+  switch (spec.type()) {
+  case 0: case 'd': {
+    unsigned num_digits = internal::count_digits(abs_value);
+    CharPtr p = prepare_int_buffer(
+      num_digits, spec, prefix, prefix_size) + 1 - num_digits;
+    internal::format_decimal(get(p), abs_value, num_digits);
+    break;
+  }
+  case 'x': case 'X': {
+    UnsignedType n = abs_value;
+    if (spec.flag(HASH_FLAG)) {
+      prefix[prefix_size++] = '0';
+      prefix[prefix_size++] = spec.type();
+    }
+    unsigned num_digits = 0;
+    do {
+      ++num_digits;
+    } while ((n >>= 4) != 0);
+    Char *p = get(prepare_int_buffer(
+      num_digits, spec, prefix, prefix_size));
+    n = abs_value;
+    const char *digits = spec.type() == 'x' ?
+        "0123456789abcdef" : "0123456789ABCDEF";
+    do {
+      *p-- = digits[n & 0xf];
+    } while ((n >>= 4) != 0);
+    break;
+  }
+  case 'b': case 'B': {
+    UnsignedType n = abs_value;
+    if (spec.flag(HASH_FLAG)) {
+      prefix[prefix_size++] = '0';
+      prefix[prefix_size++] = spec.type();
+    }
+    unsigned num_digits = 0;
+    do {
+      ++num_digits;
+    } while ((n >>= 1) != 0);
+    Char *p = get(prepare_int_buffer(num_digits, spec, prefix, prefix_size));
+    n = abs_value;
+    do {
+      *p-- = '0' + (n & 1);
+    } while ((n >>= 1) != 0);
+    break;
+  }
+  case 'o': {
+    UnsignedType n = abs_value;
+    if (spec.flag(HASH_FLAG))
+      prefix[prefix_size++] = '0';
+    unsigned num_digits = 0;
+    do {
+      ++num_digits;
+    } while ((n >>= 3) != 0);
+    Char *p = get(prepare_int_buffer(num_digits, spec, prefix, prefix_size));
+    n = abs_value;
+    do {
+      *p-- = '0' + (n & 7);
+    } while ((n >>= 3) != 0);
+    break;
+  }
+  default:
+    internal::report_unknown_type(
+      spec.type(), spec.flag(CHAR_FLAG) ? "char" : "integer");
+    break;
+  }
+}
+
+template <typename Char>
+template <typename T>
+void BasicWriter<Char>::write_double(
+    T value, const FormatSpec &spec) {
+  // Check type.
+  char type = spec.type();
+  bool upper = false;
+  switch (type) {
+  case 0:
+    type = 'g';
+    break;
+  case 'e': case 'f': case 'g': case 'a':
+    break;
+  case 'F':
+#ifdef _MSC_VER
+    // MSVC's printf doesn't support 'F'.
+    type = 'f';
+#endif
+    // Fall through.
+  case 'E': case 'G': case 'A':
+    upper = true;
+    break;
+  default:
+    internal::report_unknown_type(type, "double");
+    break;
+  }
+
+  char sign = 0;
+  // Use getsign instead of value < 0 because the latter is always
+  // false for NaN.
+  if (internal::getsign(static_cast<double>(value))) {
+    sign = '-';
+    value = -value;
+  } else if (spec.flag(SIGN_FLAG)) {
+    sign = spec.flag(PLUS_FLAG) ? '+' : ' ';
+  }
+
+  if (value != value) {
+    // Format NaN ourselves because sprintf's output is not consistent
+    // across platforms.
+    std::size_t nan_size = 4;
+    const char *nan = upper ? " NAN" : " nan";
+    if (!sign) {
+      --nan_size;
+      ++nan;
+    }
+    CharPtr out = write_str(nan, nan_size, spec);
+    if (sign)
+      *out = sign;
+    return;
+  }
+
+  if (internal::isinfinity(value)) {
+    // Format infinity ourselves because sprintf's output is not consistent
+    // across platforms.
+    std::size_t inf_size = 4;
+    const char *inf = upper ? " INF" : " inf";
+    if (!sign) {
+      --inf_size;
+      ++inf;
+    }
+    CharPtr out = write_str(inf, inf_size, spec);
+    if (sign)
+      *out = sign;
+    return;
+  }
+
+  std::size_t offset = buffer_.size();
+  unsigned width = spec.width();
+  if (sign) {
+    buffer_.reserve(buffer_.size() + (std::max)(width, 1u));
+    if (width > 0)
+      --width;
+    ++offset;
+  }
+
+  // Build format string.
+  enum { MAX_FORMAT_SIZE = 10}; // longest format: %#-*.*Lg
+  Char format[MAX_FORMAT_SIZE];
+  Char *format_ptr = format;
+  *format_ptr++ = '%';
+  unsigned width_for_sprintf = width;
+  if (spec.flag(HASH_FLAG))
+    *format_ptr++ = '#';
+  if (spec.align() == ALIGN_CENTER) {
+    width_for_sprintf = 0;
+  } else {
+    if (spec.align() == ALIGN_LEFT)
+      *format_ptr++ = '-';
+    if (width != 0)
+      *format_ptr++ = '*';
+  }
+  if (spec.precision() >= 0) {
+    *format_ptr++ = '.';
+    *format_ptr++ = '*';
+  }
+
+  append_float_length(format_ptr, value);
+  *format_ptr++ = type;
+  *format_ptr = '\0';
+
+  // Format using snprintf.
+  Char fill = static_cast<Char>(spec.fill());
+  for (;;) {
+    std::size_t buffer_size = buffer_.capacity() - offset;
+#if _MSC_VER
+    // MSVC's vsnprintf_s doesn't work with zero size, so reserve
+    // space for at least one extra character to make the size non-zero.
+    // Note that the buffer's capacity will increase by more than 1.
+    if (buffer_size == 0) {
+      buffer_.reserve(offset + 1);
+      buffer_size = buffer_.capacity() - offset;
+    }
+#endif
+    Char *start = &buffer_[offset];
+    int n = internal::CharTraits<Char>::format_float(
+        start, buffer_size, format, width_for_sprintf, spec.precision(), value);
+    if (n >= 0 && offset + n < buffer_.capacity()) {
+      if (sign) {
+        if ((spec.align() != ALIGN_RIGHT && spec.align() != ALIGN_DEFAULT) ||
+            *start != ' ') {
+          *(start - 1) = sign;
+          sign = 0;
+        } else {
+          *(start - 1) = fill;
+        }
+        ++n;
+      }
+      if (spec.align() == ALIGN_CENTER &&
+          spec.width() > static_cast<unsigned>(n)) {
+        width = spec.width();
+        CharPtr p = grow_buffer(width);
+        std::copy(p, p + n, p + (width - n) / 2);
+        fill_padding(p, spec.width(), n, fill);
+        return;
+      }
+      if (spec.fill() != ' ' || sign) {
+        while (*start == ' ')
+          *start++ = fill;
+        if (sign)
+          *(start - 1) = sign;
+      }
+      grow_buffer(n);
+      return;
+    }
+    // If n is negative we ask to increase the capacity by at least 1,
+    // but as std::vector, the buffer grows exponentially.
+    buffer_.reserve(n >= 0 ? offset + n + 1 : buffer_.capacity() + 1);
+  }
+}
+
+/**
+  \rst
+  This class template provides operations for formatting and writing data
+  into a character stream. The output is stored in a memory buffer that grows
+  dynamically.
+
+  You can use one of the following typedefs for common character types
+  and the standard allocator:
+
+  +---------------+-----------------------------------------------------+
+  | Type          | Definition                                          |
+  +===============+=====================================================+
+  | MemoryWriter  | BasicMemoryWriter<char, std::allocator<char>>       |
+  +---------------+-----------------------------------------------------+
+  | WMemoryWriter | BasicMemoryWriter<wchar_t, std::allocator<wchar_t>> |
+  +---------------+-----------------------------------------------------+
+
+  **Example**::
+
+     MemoryWriter out;
+     out << "The answer is " << 42 << "\n";
+     out.write("({:+f}, {:+f})", -3.14, 3.14);
+
+  This will write the following output to the ``out`` object:
+
+  .. code-block:: none
+
+     The answer is 42
+     (-3.140000, +3.140000)
+
+  The output can be converted to an ``std::string`` with ``out.str()`` or
+  accessed as a C string with ``out.c_str()``.
+  \endrst
+ */
+template <typename Char, typename Allocator = std::allocator<Char> >
+class BasicMemoryWriter : public BasicWriter<Char> {
+ private:
+  internal::MemoryBuffer<Char, internal::INLINE_BUFFER_SIZE, Allocator> buffer_;
+
+ public:
+  explicit BasicMemoryWriter(const Allocator& alloc = Allocator())
+    : BasicWriter<Char>(buffer_), buffer_(alloc) {}
+
+#if FMT_USE_RVALUE_REFERENCES
+  /**
+    \rst
+    Constructs a :class:`fmt::BasicMemoryWriter` object moving the content
+    of the other object to it.
+    \endrst
+   */
+  BasicMemoryWriter(BasicMemoryWriter &&other)
+    : BasicWriter<Char>(buffer_), buffer_(std::move(other.buffer_)) {
+  }
+
+  /**
+    \rst
+    Moves the content of the other ``BasicMemoryWriter`` object to this one.
+    \endrst
+   */
+  BasicMemoryWriter &operator=(BasicMemoryWriter &&other) {
+    buffer_ = std::move(other.buffer_);
+    return *this;
+  }
+#endif
+};
+
+typedef BasicMemoryWriter<char> MemoryWriter;
+typedef BasicMemoryWriter<wchar_t> WMemoryWriter;
+
+/**
+  \rst
+  This class template provides operations for formatting and writing data
+  into a fixed-size array. For writing into a dynamically growing buffer
+  use :class:`fmt::BasicMemoryWriter`.
+  
+  Any write method will throw ``std::runtime_error`` if the output doesn't fit
+  into the array.
+
+  You can use one of the following typedefs for common character types:
+
+  +--------------+---------------------------+
+  | Type         | Definition                |
+  +==============+===========================+
+  | ArrayWriter  | BasicArrayWriter<char>    |
+  +--------------+---------------------------+
+  | WArrayWriter | BasicArrayWriter<wchar_t> |
+  +--------------+---------------------------+
+  \endrst
+ */
+template <typename Char>
+class BasicArrayWriter : public BasicWriter<Char> {
+ private:
+  internal::FixedBuffer<Char> buffer_;
+
+ public:
+  /**
+   \rst
+   Constructs a :class:`fmt::BasicArrayWriter` object for *array* of the
+   given size.
+   \endrst
+   */
+  BasicArrayWriter(Char *array, std::size_t size)
+    : BasicWriter<Char>(buffer_), buffer_(array, size) {}
+
+  // FIXME: this is temporary undocumented due to a bug in Sphinx
+  /*
+   \rst
+   Constructs a :class:`fmt::BasicArrayWriter` object for *array* of the
+   size known at compile time.
+   \endrst
+   */
+  template <std::size_t SIZE>
+  explicit BasicArrayWriter(Char (&array)[SIZE])
+    : BasicWriter<Char>(buffer_), buffer_(array, SIZE) {}
+};
+
+typedef BasicArrayWriter<char> ArrayWriter;
+typedef BasicArrayWriter<wchar_t> WArrayWriter;
+
+// Formats a value.
+template <typename Char, typename T>
+void format(BasicFormatter<Char> &f, const Char *&format_str, const T &value) {
+  std::basic_ostringstream<Char> os;
+  os << value;
+  std::basic_string<Char> str = os.str();
+  internal::Arg arg = internal::MakeValue<Char>(str);
+  arg.type = static_cast<internal::Arg::Type>(
+        internal::MakeValue<Char>::type(str));
+  format_str = f.format(format_str, arg);
+}
+
+// Reports a system error without throwing an exception.
+// Can be used to report errors from destructors.
+void report_system_error(int error_code, StringRef message) FMT_NOEXCEPT;
+
+#ifdef _WIN32
+
+/** A Windows error. */
+class WindowsError : public SystemError {
+ private:
+  void init(int error_code, StringRef format_str, ArgList args);
+
+ public:
+  /**
+   \rst
+   Constructs a :class:`fmt::WindowsError` object with the description
+   of the form
+
+   .. parsed-literal::
+     *<message>*: *<system-message>*
+
+   where *<message>* is the formatted message and *<system-message>* is the
+   system message corresponding to the error code.
+   *error_code* is a Windows error code as given by ``GetLastError``.
+   If *error_code* is not a valid error code such as -1, the system message
+   will look like "error -1".
+
+   **Example**::
+
+     // This throws a WindowsError with the description
+     //   cannot open file 'madeup': The system cannot find the file specified.
+     // or similar (system message may vary).
+     const char *filename = "madeup";
+     LPOFSTRUCT of = LPOFSTRUCT();
+     HFILE file = OpenFile(filename, &of, OF_READ);
+     if (file == HFILE_ERROR) {
+       throw fmt::WindowsError(GetLastError(),
+                               "cannot open file '{}'", filename);
+     }
+   \endrst
+  */
+  WindowsError(int error_code, StringRef message) {
+    init(error_code, message, ArgList());
+  }
+  FMT_VARIADIC_CTOR(WindowsError, init, int, StringRef)
+};
+
+// Reports a Windows error without throwing an exception.
+// Can be used to report errors from destructors.
+void report_windows_error(int error_code, StringRef message) FMT_NOEXCEPT;
+
+#endif
+
+enum Color { BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE };
+
+/**
+  Formats a string and prints it to stdout using ANSI escape sequences
+  to specify color (experimental).
+  Example:
+    PrintColored(fmt::RED, "Elapsed time: {0:.2f} seconds") << 1.23;
+ */
+void print_colored(Color c, StringRef format, ArgList args);
+
+/**
+  \rst
+  Formats arguments and returns the result as a string.
+
+  **Example**::
+
+    std::string message = format("The answer is {}", 42);
+  \endrst
+*/
+inline std::string format(StringRef format_str, ArgList args) {
+  MemoryWriter w;
+  w.write(format_str, args);
+  return w.str();
+}
+
+inline std::wstring format(WStringRef format_str, ArgList args) {
+  WMemoryWriter w;
+  w.write(format_str, args);
+  return w.str();
+}
+
+/**
+  \rst
+  Prints formatted data to the file *f*.
+
+  **Example**::
+
+    print(stderr, "Don't {}!", "panic");
+  \endrst
+ */
+void print(std::FILE *f, StringRef format_str, ArgList args);
+
+/**
+  \rst
+  Prints formatted data to ``stdout``.
+
+  **Example**::
+
+    print("Elapsed time: {0:.2f} seconds", 1.23);
+  \endrst
+ */
+void print(StringRef format_str, ArgList args);
+
+/**
+  \rst
+  Prints formatted data to the stream *os*.
+
+  **Example**::
+
+    print(cerr, "Don't {}!", "panic");
+  \endrst
+ */
+void print(std::ostream &os, StringRef format_str, ArgList args);
+
+template <typename Char>
+void printf(BasicWriter<Char> &w, BasicStringRef<Char> format, ArgList args) {
+  internal::PrintfFormatter<Char>().format(w, format, args);
+}
+
+/**
+  \rst
+  Formats arguments and returns the result as a string.
+
+  **Example**::
+
+    std::string message = fmt::sprintf("The answer is %d", 42);
+  \endrst
+*/
+inline std::string sprintf(StringRef format, ArgList args) {
+  MemoryWriter w;
+  printf(w, format, args);
+  return w.str();
+}
+
+/**
+  \rst
+  Prints formatted data to the file *f*.
+
+  **Example**::
+
+    fmt::fprintf(stderr, "Don't %s!", "panic");
+  \endrst
+ */
+int fprintf(std::FILE *f, StringRef format, ArgList args);
+
+/**
+  \rst
+  Prints formatted data to ``stdout``.
+
+  **Example**::
+
+    fmt::printf("Elapsed time: %.2f seconds", 1.23);
+  \endrst
+ */
+inline int printf(StringRef format, ArgList args) {
+  return fprintf(stdout, format, args);
+}
+
+/**
+  Fast integer formatter.
+ */
+class FormatInt {
+ private:
+  // Buffer should be large enough to hold all digits (digits10 + 1),
+  // a sign and a null character.
+  enum {BUFFER_SIZE = std::numeric_limits<ULongLong>::digits10 + 3};
+  mutable char buffer_[BUFFER_SIZE];
+  char *str_;
+
+  // Formats value in reverse and returns the number of digits.
+  char *format_decimal(ULongLong value) {
+    char *buffer_end = buffer_ + BUFFER_SIZE - 1;
+    while (value >= 100) {
+      // Integer division is slow so do it for a group of two digits instead
+      // of for every digit. The idea comes from the talk by Alexandrescu
+      // "Three Optimization Tips for C++". See speed-test for a comparison.
+      unsigned index = (value % 100) * 2;
+      value /= 100;
+      *--buffer_end = internal::Data::DIGITS[index + 1];
+      *--buffer_end = internal::Data::DIGITS[index];
+    }
+    if (value < 10) {
+      *--buffer_end = static_cast<char>('0' + value);
+      return buffer_end;
+    }
+    unsigned index = static_cast<unsigned>(value * 2);
+    *--buffer_end = internal::Data::DIGITS[index + 1];
+    *--buffer_end = internal::Data::DIGITS[index];
+    return buffer_end;
+  }
+
+  void FormatSigned(LongLong value) {
+    ULongLong abs_value = static_cast<ULongLong>(value);
+    bool negative = value < 0;
+    if (negative)
+      abs_value = 0 - abs_value;
+    str_ = format_decimal(abs_value);
+    if (negative)
+      *--str_ = '-';
+  }
+
+ public:
+  explicit FormatInt(int value) { FormatSigned(value); }
+  explicit FormatInt(long value) { FormatSigned(value); }
+  explicit FormatInt(LongLong value) { FormatSigned(value); }
+  explicit FormatInt(unsigned value) : str_(format_decimal(value)) {}
+  explicit FormatInt(unsigned long value) : str_(format_decimal(value)) {}
+  explicit FormatInt(ULongLong value) : str_(format_decimal(value)) {}
+
+  /**
+    Returns the number of characters written to the output buffer.
+   */
+  std::size_t size() const { return buffer_ - str_ + BUFFER_SIZE - 1; }
+
+  /**
+    Returns a pointer to the output buffer content. No terminating null
+    character is appended.
+   */
+  const char *data() const { return str_; }
+
+  /**
+    Returns a pointer to the output buffer content with terminating null
+    character appended.
+   */
+  const char *c_str() const {
+    buffer_[BUFFER_SIZE - 1] = '\0';
+    return str_;
+  }
+
+  /**
+    Returns the content of the output buffer as an `std::string`.
+   */
+  std::string str() const { return std::string(str_, size()); }
+};
+
+// Formats a decimal integer value writing into buffer and returns
+// a pointer to the end of the formatted string. This function doesn't
+// write a terminating null character.
+template <typename T>
+inline void format_decimal(char *&buffer, T value) {
+  typename internal::IntTraits<T>::MainType abs_value = value;
+  if (internal::is_negative(value)) {
+    *buffer++ = '-';
+    abs_value = 0 - abs_value;
+  }
+  if (abs_value < 100) {
+    if (abs_value < 10) {
+      *buffer++ = static_cast<char>('0' + abs_value);
+      return;
+    }
+    unsigned index = static_cast<unsigned>(abs_value * 2);
+    *buffer++ = internal::Data::DIGITS[index];
+    *buffer++ = internal::Data::DIGITS[index + 1];
+    return;
+  }
+  unsigned num_digits = internal::count_digits(abs_value);
+  internal::format_decimal(buffer, abs_value, num_digits);
+  buffer += num_digits;
+}
+}
+
+#if FMT_GCC_VERSION
+// Use the system_header pragma to suppress warnings about variadic macros
+// because suppressing -Wvariadic-macros with the diagnostic pragma doesn't
+// work. It is used at the end because we want to suppress as little warnings
+// as possible.
+# pragma GCC system_header
+#endif
+
+// This is used to work around VC++ bugs in handling variadic macros.
+#define FMT_EXPAND(args) args
+
+// Returns the number of arguments.
+// Based on https://groups.google.com/forum/#!topic/comp.std.c/d-6Mj5Lko_s.
+#define FMT_NARG(...) FMT_NARG_(__VA_ARGS__, FMT_RSEQ_N())
+#define FMT_NARG_(...) FMT_EXPAND(FMT_ARG_N(__VA_ARGS__))
+#define FMT_ARG_N(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N
+#define FMT_RSEQ_N() 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+#define FMT_CONCAT(a, b) a##b
+#define FMT_FOR_EACH_(N, f, ...) \
+  FMT_EXPAND(FMT_CONCAT(FMT_FOR_EACH, N)(f, __VA_ARGS__))
+#define FMT_FOR_EACH(f, ...) \
+  FMT_EXPAND(FMT_FOR_EACH_(FMT_NARG(__VA_ARGS__), f, __VA_ARGS__))
+
+#define FMT_ADD_ARG_NAME(type, index) type arg##index
+#define FMT_GET_ARG_NAME(type, index) arg##index
+
+#if FMT_USE_VARIADIC_TEMPLATES
+
+namespace fmt {
+namespace internal {
+inline void do_set_types(Arg *) {}
+
+template <typename T, typename... Args>
+inline void do_set_types(Arg *args, const T &arg, const Args & ... tail) {
+  args->type = static_cast<Arg::Type>(MakeValue<T>::type(arg));
+  do_set_types(args + 1, tail...);
+}
+
+template <typename... Args>
+inline void set_types(Arg *array, const Args & ... args) {
+  do_set_types(array, args...);
+  array[sizeof...(Args)].type = Arg::NONE;
+}
+
+template <typename... Args>
+inline void set_types(Value *, const Args & ...) {
+  // Do nothing as types are passed separately from values.
+}
+}
+}
+
+# define FMT_VARIADIC_(Char, ReturnType, func, call, ...) \
+  template <typename... Args> \
+  ReturnType func(FMT_FOR_EACH(FMT_ADD_ARG_NAME, __VA_ARGS__), \
+      const Args & ... args) { \
+    typename fmt::internal::ArgArray<sizeof...(Args)>::Type array = { \
+      fmt::internal::MakeValue<Char>(args)... \
+    }; \
+    if (fmt::internal::check(sizeof...(Args) > fmt::ArgList::MAX_PACKED_ARGS)) \
+      set_types(array, args...); \
+    call(FMT_FOR_EACH(FMT_GET_ARG_NAME, __VA_ARGS__), \
+      fmt::ArgList(fmt::internal::make_type(args...), array)); \
+  }
+#else
+// Defines a wrapper for a function taking __VA_ARGS__ arguments
+// and n additional arguments of arbitrary types.
+# define FMT_WRAP(Char, ReturnType, func, call, n, ...) \
+  template <FMT_GEN(n, FMT_MAKE_TEMPLATE_ARG)> \
+  inline ReturnType func(FMT_FOR_EACH(FMT_ADD_ARG_NAME, __VA_ARGS__), \
+      FMT_GEN(n, FMT_MAKE_ARG)) { \
+    const fmt::internal::Value values[] = {FMT_GEN(n, FMT_MAKE_REF_##Char)}; \
+    call(FMT_FOR_EACH(FMT_GET_ARG_NAME, __VA_ARGS__), fmt::ArgList( \
+      fmt::internal::make_type(FMT_GEN(n, FMT_MAKE_REF2)), values)); \
+  }
+
+# define FMT_VARIADIC_(Char, ReturnType, func, call, ...) \
+  inline ReturnType func(FMT_FOR_EACH(FMT_ADD_ARG_NAME, __VA_ARGS__)) { \
+    call(FMT_FOR_EACH(FMT_GET_ARG_NAME, __VA_ARGS__), fmt::ArgList()); \
+  } \
+  FMT_WRAP(Char, ReturnType, func, call, 1, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 2, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 3, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 4, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 5, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 6, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 7, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 8, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 9, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 10, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 11, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 12, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 13, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 14, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 15, __VA_ARGS__)
+#endif  // FMT_USE_VARIADIC_TEMPLATES
+
+/**
+  \rst
+  Defines a variadic function with the specified return type, function name
+  and argument types passed as variable arguments to this macro.
+
+  **Example**::
+
+    void print_error(const char *file, int line, const char *format,
+                     fmt::ArgList args) {
+      fmt::print("{}: {}: ", file, line);
+      fmt::print(format, args);
+    }
+    FMT_VARIADIC(void, print_error, const char *, int, const char *)
+
+  ``FMT_VARIADIC`` is used for compatibility with legacy C++ compilers that
+  don't implement variadic templates. You don't have to use this macro if
+  you don't need legacy compiler support and can use variadic templates
+  directly::
+
+    template <typename... Args>
+    void print_error(const char *file, int line, const char *format,
+                     const Args & ... args) {
+      fmt::print("{}: {}: ", file, line);
+      fmt::print(format, args...);
+    }
+  \endrst
+ */
+#define FMT_VARIADIC(ReturnType, func, ...) \
+  FMT_VARIADIC_(char, ReturnType, func, return func, __VA_ARGS__)
+
+#define FMT_VARIADIC_W(ReturnType, func, ...) \
+  FMT_VARIADIC_(wchar_t, ReturnType, func, return func, __VA_ARGS__)
+
+namespace fmt {
+FMT_VARIADIC(std::string, format, StringRef)
+FMT_VARIADIC_W(std::wstring, format, WStringRef)
+FMT_VARIADIC(void, print, StringRef)
+FMT_VARIADIC(void, print, std::FILE *, StringRef)
+FMT_VARIADIC(void, print, std::ostream &, StringRef)
+FMT_VARIADIC(void, print_colored, Color, StringRef)
+FMT_VARIADIC(std::string, sprintf, StringRef)
+FMT_VARIADIC(int, printf, StringRef)
+FMT_VARIADIC(int, fprintf, std::FILE *, StringRef)
+}
+
+// Restore warnings.
+#if FMT_GCC_VERSION >= 406
+# pragma GCC diagnostic pop
+#endif
+
+#ifdef __clang__
+# pragma clang diagnostic pop
+#endif
+
+#ifdef FMT_HEADER_ONLY
+# include "format.cc"
+#endif
+
+#endif  // FMT_FORMAT_H_
diff --git a/include/kseq.h b/include/kseq.h
new file mode 100644
index 0000000..71a8589
--- /dev/null
+++ b/include/kseq.h
@@ -0,0 +1,235 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor at live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Last Modified: 05MAR2012 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB   1 // isspace() && !' '
+#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX   2
+
+#define __KS_TYPE(type_t)						\
+        typedef struct __kstream_t {				\
+                unsigned char *buf;						\
+                int begin, end, is_eof;					\
+                type_t f;								\
+        } kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+#define __KS_BASIC(type_t, __bufsize)								\
+        static inline kstream_t *ks_init(type_t f)						\
+        {																\
+                kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));	\
+                ks->f = f;													\
+                ks->buf = (unsigned char*)malloc(__bufsize);				\
+                return ks;													\
+        }																\
+        static inline void ks_destroy(kstream_t *ks)					\
+        {																\
+                if (ks) {													\
+                        free(ks->buf);											\
+                        free(ks);												\
+                }															\
+        }
+
+#define __KS_GETC(__read, __bufsize)						\
+        static inline int ks_getc(kstream_t *ks)				\
+        {														\
+                if (ks->is_eof && ks->begin >= ks->end) return -1;	\
+                if (ks->begin >= ks->end) {							\
+                        ks->begin = 0;									\
+                        ks->end = __read(ks->f, ks->buf, __bufsize);	\
+                        if (ks->end < __bufsize) { ks->is_eof = 1; } \
+                        if (ks->end == 0) { return -1; } \
+                }													\
+                return (int)ks->buf[ks->begin++];					\
+        }
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+        size_t l, m;
+        char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(__read, __bufsize)								\
+        static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
+        {																	\
+                if (dret) *dret = 0;											\
+                str->l = append? str->l : 0;									\
+                if (ks->begin >= ks->end && ks->is_eof) return -1;				\
+                for (;;) {														\
+                        int i;														\
+                        if (ks->begin >= ks->end) {									\
+                                if (!ks->is_eof) {										\
+                                        ks->begin = 0;										\
+                                        ks->end = __read(ks->f, ks->buf, __bufsize);		\
+                                        if (ks->end < __bufsize) { ks->is_eof = 1; } \
+                                        if (ks->end == 0) { break;} \
+                                } else break;											\
+                        }															\
+                        if (delimiter == KS_SEP_LINE) { \
+                                for (i = ks->begin; i < ks->end; ++i) \
+                                        if (ks->buf[i] == '\n') break; \
+                        } else if (delimiter > KS_SEP_MAX) {						\
+                                for (i = ks->begin; i < ks->end; ++i)					\
+                                        if (ks->buf[i] == delimiter) break;					\
+                        } else if (delimiter == KS_SEP_SPACE) {						\
+                                for (i = ks->begin; i < ks->end; ++i)					\
+                                        if (isspace(ks->buf[i])) break;						\
+                        } else if (delimiter == KS_SEP_TAB) {						\
+                                for (i = ks->begin; i < ks->end; ++i)					\
+                                        if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+                        } else i = 0; /* never come to here! */						\
+                        if (str->m - str->l < (size_t)(i - ks->begin + 1)) {		\
+                                str->m = str->l + (i - ks->begin) + 1;					\
+                                kroundup32(str->m);										\
+                                str->s = (char*)realloc(str->s, str->m);				\
+                        }															\
+                        memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+                        str->l = str->l + (i - ks->begin);							\
+                        ks->begin = i + 1;											\
+                        if (i < ks->end) {											\
+                                if (dret) *dret = ks->buf[i];							\
+                                break;													\
+                        }															\
+                }																\
+                if (str->s == 0) {												\
+                        str->m = 1;													\
+                        str->s = (char*)calloc(1, 1);								\
+                } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
+                str->s[str->l] = '\0';											\
+                return str->l;													\
+        } \
+        static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+        { return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) \
+        __KS_TYPE(type_t)							\
+        __KS_BASIC(type_t, __bufsize)				\
+        __KS_GETC(__read, __bufsize)				\
+        __KS_GETUNTIL(__read, __bufsize)
+
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t)										\
+        SCOPE kseq_t *kseq_init(type_t fd)									\
+        {																	\
+                kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));					\
+                s->f = ks_init(fd);												\
+                return s;														\
+        }																	\
+        SCOPE void kseq_destroy(kseq_t *ks)									\
+        {																	\
+                if (!ks) return;												\
+                free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
+                ks_destroy(ks->f);												\
+                free(ks);														\
+        }
+
+/* Return value:
+   >=0  length of the sequence (normal)
+   -1   end-of-file
+   -2   truncated quality string
+ */
+#define __KSEQ_READ(SCOPE) \
+        SCOPE int kseq_read(kseq_t *seq) \
+        { \
+                int c; \
+                kstream_t *ks = seq->f; \
+                if (seq->last_char == 0) { /* then jump to the next header line */ \
+                        while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+                        if (c == -1) return -1; /* end of file */ \
+                        seq->last_char = c; \
+                } /* else: the first header char has been read in the previous call */ \
+                seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+                if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+                if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+                if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+                        seq->seq.m = 256; \
+                        seq->seq.s = (char*)malloc(seq->seq.m); \
+                } \
+                while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+                        if (c == '\n') continue; /* skip empty lines */ \
+                        seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+                        ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+                } \
+                if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */	\
+                if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+                        seq->seq.m = seq->seq.l + 2; \
+                        kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+                        seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+                } \
+                seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
+                if (c != '+') return seq->seq.l; /* FASTA */ \
+                if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
+                        seq->qual.m = seq->seq.m; \
+                        seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+                } \
+                while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+                if (c == -1) return -2; /* error: no quality string */ \
+                while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
+                seq->last_char = 0;	/* we have not come to the next header line */ \
+                if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+                return seq->seq.l; \
+        }
+
+#define __KSEQ_TYPE(type_t)						\
+        typedef struct {							\
+                kstring_t name, comment, seq, qual;		\
+                int last_char;							\
+                kstream_t *f;							\
+        } kseq_t;
+
+#define KSEQ_INIT2(SCOPE, type_t, __read)		\
+        KSTREAM_INIT(type_t, __read, 16384)			\
+        __KSEQ_TYPE(type_t)							\
+        __KSEQ_BASIC(SCOPE, type_t)					\
+        __KSEQ_READ(SCOPE)
+
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) \
+        __KS_TYPE(type_t) \
+        __KSEQ_TYPE(type_t) \
+        extern kseq_t *kseq_init(type_t fd); \
+        void kseq_destroy(kseq_t *ks); \
+        int kseq_read(kseq_t *seq);
+
+#endif
diff --git a/include/merge_files.hpp b/include/merge_files.hpp
new file mode 100644
index 0000000..d717975
--- /dev/null
+++ b/include/merge_files.hpp
@@ -0,0 +1,30 @@
+/*  This file is part of Jellyfish.
+
+    Jellyfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Jellyfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Jellyfish.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __JELLYFISH_MERGE_FILES_HPP__
+#define __JELLYFISH_MERGE_FILES_HPP__
+
+#include <vector>
+#include <jellyfish/err.hpp>
+#include <jellyfish/file_header.hpp>
+
+define_error_class(MergeError);
+
+/// Merge files. Throw a MergeError in case of error.
+void merge_files(std::vector<const char*> input_files, const char* out_file,
+                 jellyfish::file_header& h, uint64_t min, uint64_t max);
+
+#endif /* __JELLYFISH_MERGE_FILES_HPP__ */
diff --git a/include/obsolete/IterativeOptimizer.hpp b/include/obsolete/IterativeOptimizer.hpp
new file mode 100755
index 0000000..bda3ab3
--- /dev/null
+++ b/include/obsolete/IterativeOptimizer.hpp
@@ -0,0 +1,1198 @@
+#ifndef ITERATIVE_OPTIMIZER_HPP
+#define ITERATIVE_OPTIMIZER_HPP
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <unordered_map>
+#include <map>
+#include <vector>
+#include <unordered_set>
+#include <mutex>
+#include <thread>
+#include <sstream>
+#include <exception>
+#include <random>
+#include <queue>
+#include "btree_map.h"
+
+/** Boost Includes */
+#include <boost/range/irange.hpp>
+#include <boost/program_options.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <boost/heap/fibonacci_heap.hpp>
+#include <boost/accumulators/statistics/stats.hpp>
+#include <boost/accumulators/framework/accumulator_set.hpp>
+#include <boost/accumulators/statistics/p_square_quantile.hpp>
+#include <boost/accumulators/statistics/count.hpp>
+#include <boost/accumulators/statistics/weighted_mean.hpp>
+#include <boost/lockfree/queue.hpp>
+#include <boost/thread/thread.hpp>
+
+//#include <Eigen/Core>
+#include <jellyfish/sequence_parser.hpp>
+#include <jellyfish/parse_read.hpp>
+#include <jellyfish/mer_counting.hpp>
+#include <jellyfish/misc.hpp>
+#include <jellyfish/compacted_hash.hpp>
+
+#include "tbb/concurrent_unordered_set.h"
+#include "tbb/concurrent_vector.h"
+#include "tbb/concurrent_unordered_map.h"
+#include "tbb/parallel_for.h"
+#include "tbb/parallel_for_each.h"
+#include "tbb/task_scheduler_init.h"
+
+//#include "nnls.h"
+#include "BiasIndex.hpp"
+#include "poisson_solver.hpp"
+#include "matrix_tools.hpp"
+#include "ezETAProgressBar.hpp"
+//#include "CompatibilityGraph.hpp"
+
+
+template <typename ReadHashT, typename TranscriptHashT>
+class IterativeOptimizer {
+
+private:
+    /**
+    * Typedefs
+    */
+    // A KmerMapT is a map from a kmer (encoded as an integer) to the set
+    // of transcripts where it occurs
+    typedef uint32_t TranscriptIDT;
+    typedef uint64_t KmerIDT;
+    typedef double KmerQuantityT;
+    typedef double PromiscutityT;
+    //typedef std::unordered_map< KmerIDT, std::vector<TranscriptIDT> > KmerMapT;
+    typedef tbb::concurrent_unordered_map< uint64_t, tbb::concurrent_vector<uint32_t> > KmerMapT;
+
+    struct TranscriptGeneVectors;
+    typedef std::vector<TranscriptGeneVectors> KmerIDMap;
+
+
+    typedef std::tuple<TranscriptIDT, std::vector<KmerIDT>> TranscriptKmerSet;
+    typedef std::string *StringPtrT;
+    typedef uint64_t TranscriptScoreT;
+    typedef jellyfish::invertible_hash::array<uint64_t, atomic::gcc, allocators::mmap> HashArrayT;
+    typedef size_t ReadLengthT;
+
+    // Necessary forward declaration
+    struct TranscriptDataT;
+    typedef std::tuple<TranscriptScoreT, TranscriptIDT> HeapPair;
+    typedef typename boost::heap::fibonacci_heap<HeapPair>::handle_type HandleT;
+
+    struct TranscriptGeneVectors {
+        tbb::concurrent_vector<uint32_t> transcripts;
+        tbb::concurrent_vector<uint32_t> genes;
+    };
+
+    struct TranscriptDataT {
+        TranscriptIDT id;
+        StringPtrT header;
+        std::map<KmerIDT, KmerQuantityT> binMers;
+        KmerQuantityT mean;
+        size_t length;
+    };
+
+    struct TranscriptInfo {
+        btree::btree_map<KmerIDT, KmerQuantityT> binMers;
+        KmerQuantityT mean;
+        ReadLengthT length;
+        ReadLengthT effectiveLength;
+    };
+
+    // This struct represents a "job" (transcript) that needs to be processed
+    struct TranscriptJob {
+        StringPtrT header;
+        StringPtrT seq;
+        TranscriptIDT id;
+    };
+
+    struct TranscriptResult {
+        TranscriptDataT *data;
+        TranscriptKmerSet *ks;
+    };
+
+    struct BinmerUpdates {
+        std::vector<KmerIDT> zeroedBinmers;
+        std::vector<KmerIDT> updatedBinmers;
+    };
+
+    size_t _merLen;
+    ReadHashT &_readHash;
+    TranscriptHashT &_transcriptHash;
+    BiasIndex& biasIndex_;
+
+    // The number of occurences above whcih a kmer is considered promiscuous
+    size_t _promiscuousKmerCutoff {15};
+
+    // Map each kmer to the set of transcripts it occurs in
+    //KmerMapT _transcriptsForKmer;
+    KmerIDMap _transcriptsForKmer;
+
+    // The actual data for each transcript
+    std::vector<TranscriptInfo *> _transcripts;
+
+    TranscriptGeneMap& _transcriptGeneMap;
+
+    tbb::concurrent_unordered_set<uint64_t> _genePromiscuousKmers;
+
+    std::vector<PromiscutityT> kmerPromiscuities_;
+    std::vector<PromiscutityT> kmerBiases_;
+
+
+    /******** REFACTOR ME *************/
+    void extractKmerTranscriptCompatibilityGraph() {
+        std::vector<int> kmerComponents(_transcriptsForKmer.size(), -1);
+        std::vector<int> transcriptComponents(_transcripts.size(), -1);
+
+        int component = 0;
+        size_t tstart = 0;
+        size_t numAssigned = 0;
+        std::queue<size_t> toProcess;
+
+        while (numAssigned < transcriptComponents.size()) {
+            // Find the first unassigned transcript
+            while ( transcriptComponents[tstart] != -1 and tstart < transcriptComponents.size()) { 
+                ++tstart; 
+            }
+
+            if ( tstart == transcriptComponents.size() ) { std::cerr << "no unassigned transcripts\n"; }
+
+            toProcess.push(tstart);
+            while( !toProcess.empty() ) {
+                auto tid = toProcess.front();
+                toProcess.pop();
+                if ( transcriptComponents[tid] == -1 ) {
+                    transcriptComponents[tid] = component;    
+                    ++numAssigned;
+
+                    // Find all kmers in this transcript
+                    for( auto binmer : _transcripts[tid]->binMers ) {
+
+                        auto existingComponent = kmerComponents[binmer.first];
+
+                        if ( existingComponent == -1 ) {
+                            kmerComponents[binmer.first] = component;
+                            for( auto ts : _transcriptsForKmer[binmer.first].transcripts ) {
+                                if ( transcriptComponents[ts] == -1 ) {
+                                    toProcess.push(ts);
+                                } else {
+                                    assert(transcriptComponents[ts] == component);
+                                }
+                            }
+                        }
+                    }
+
+                } else {
+                    assert( transcriptComponents[tid] == component );
+                }                    
+            }
+            ++component;
+        }
+
+        btree::btree_map<int, size_t> components;
+        for( auto tc : transcriptComponents ) {
+            assert(tc != -1);
+            components[tc] += 1;
+            //components.insert(tc);
+        }
+        for( auto kc : kmerComponents ) {
+            assert(kc != -1);
+        }
+        for ( auto cit : components ) {
+            std::cerr << "component " << cit.first << " contains " << cit.second << " transcripts\n";
+        }
+        std::cerr << "there were " << components.size() << " components\n";
+    }
+
+
+    /******** END REFACTOR ME *************/
+
+
+    inline double _idf( uint64_t k ) {
+        double df = _transcriptsForKmer[k].transcripts.size();
+        return (df > 0.0) ? std::log(_transcripts.size() / df) : 0.0;
+    }
+
+    // Should the given kmer be considered?
+    inline bool _considered( uint64_t mer ) {
+        // The kmer is only considered if it exists in the transcript set
+        // (i.e. it's possible to cover) and it's less prmiscuous than the
+        // cutoff.
+        return ( _transcriptHash.atIndex(mer) > 0 and
+                 _transcriptHash.atIndex(mer) < _promiscuousKmerCutoff );
+    }
+
+    KmerQuantityT _weight( KmerIDT k ) {
+        //return 1.0 / (_transcriptHash[k]);
+        return 1.0 / (_transcriptHash.atIndex(k) );
+    }
+
+    KmerQuantityT _computeMedian( TranscriptInfo* ti ) {
+
+        KmerQuantityT median = 0.0;
+        auto& binMers = ti->binMers;
+        auto len = binMers.size();
+        if ( len > 0 ) {
+            if ( len % 2 == 0 ) {
+                auto it = binMers.begin();
+                for ( size_t i = 0; i < (len / 2); ++i, ++it ) {}
+                median = it->second; ++it;
+                median += it->second;
+                median /= 2.0;
+            } else {
+                auto it = binMers.begin();
+                for ( size_t i = 0; i < (len / 2) + 1; ++i, ++it ) {}
+                median = it->second;
+            }
+        }
+        return median;
+    }
+
+    /**
+     * Computes the sum of kmer counts within the transcript given by ti, but clamping
+     * all non-zero counts to the given quantile.  For example, if quantile was 0.25, and
+     * x and y represented the 1st and 3rd quantile of kmer counts, then every nonzero count c 
+     * would be transformed as c = max(x, min(y,c));
+     * 
+     * @param  ti       [description]
+     * @param  quantile [description]
+     * @return          [description]
+     */
+    KmerQuantityT _computeSumQuantile( TranscriptInfo* ti, double quantile ) {
+        using namespace boost::accumulators;
+        typedef accumulator_set<double, stats<tag::p_square_quantile> > accumulator_t;
+        KmerQuantityT sum = 0.0;
+        
+        accumulator_t accLow(quantile_probability = quantile);
+        accumulator_t accHigh(quantile_probability = 1.0-quantile);
+        for ( auto binmer : ti->binMers ) {
+            if ( this->_genePromiscuousKmers.find(binmer.first) == this->_genePromiscuousKmers.end() ){
+                accLow(binmer.second);
+                accHigh(binmer.second);
+            }        
+        }
+
+        auto cutLow = p_square_quantile(accLow);
+        auto cutHigh = p_square_quantile(accHigh);
+
+        for ( auto binmer : ti->binMers ) {
+            if ( this->_genePromiscuousKmers.find(binmer.first) == this->_genePromiscuousKmers.end() ){
+                sum += std::min( cutHigh, std::max( cutLow, binmer.second ) );
+            }
+        }
+        return sum;
+    }
+
+    KmerQuantityT _computeSum( TranscriptInfo* ti ) {
+        KmerQuantityT sum = 0.0;
+        for ( auto binmer : ti->binMers ) {
+            if ( this->_genePromiscuousKmers.find(binmer.first) == this->_genePromiscuousKmers.end() ){
+                sum += kmerBiases_[binmer.first] * binmer.second;
+            }
+        }
+        return sum;
+    }
+
+    bool _discard(TranscriptInfo* ti) {
+        if ( ti->mean == 0.0 ) { 
+            return false; 
+        } else {
+            ti->mean = 0.0;
+            ti->binMers.clear();
+            return true;
+        }
+    }
+
+    KmerQuantityT _computeMean( TranscriptInfo* ti ) {
+        //return (ti->effectiveLength > 0.0) ? _computeSumQuantile(ti, 0.25) / ti->effectiveLength : 0.0;
+        return (ti->effectiveLength > 0.0) ? (_computeSum(ti) / ti->effectiveLength) : 0.0;
+    }
+
+    KmerQuantityT _computeWeightedMean( TranscriptInfo* ti ) {
+        using namespace boost::accumulators;
+        accumulator_set<double, stats<tag::count, tag::weighted_mean>, double> acc;
+
+        for ( auto binmer : ti->binMers ) {
+          if ( this->_genePromiscuousKmers.find(binmer.first) == this->_genePromiscuousKmers.end() ){
+            acc(binmer.second, weight=kmerBiases_[binmer.first] * kmerPromiscuities_[binmer.first]);
+          }
+        }
+
+        auto nnz = count(acc);
+        
+        if ( nnz < ti->effectiveLength ) {
+            acc(0.0, weight=ti->effectiveLength-nnz);
+        }
+       
+        auto sum = sum_of_weights(acc);
+        return sum > 0.0 ? weighted_mean(acc) : 0.0;
+    }
+
+    double _effectiveLength( TranscriptInfo* ts ) {
+        double length = 0.0;
+        for ( auto binmer : ts->binMers ) {
+            length += _weight(binmer.first);
+        }
+        return length;
+    }
+
+    void normalizeTranscriptMeans_(){
+        auto sumMean = 0.0;
+        for ( auto ti : _transcripts ) { sumMean += ti->mean; }
+    
+        // compute the new mean for each transcript
+        tbb::parallel_for( size_t(0), size_t(_transcripts.size()),
+            [this, sumMean]( size_t tid ) -> void { this->_transcripts[tid]->mean /= sumMean; });
+
+    }
+
+    double averageCount(TranscriptInfo* ts){
+        if ( ts->binMers.size() == 0 ) { return 0.0; }
+        double sum = 0.0;
+        for ( auto binmer : ts->binMers ) {
+            sum += kmerBiases_[binmer.first] * binmer.second;
+        }
+        return sum / ts->binMers.size();
+
+    }
+
+    /**
+     * This function should be run only after <b>after</b> an EM loop.
+     * It estimates kmer specific biases based on how much a kmer's count
+     * deviates from it's corresponding transcript's mean 
+     */
+    void computeKmerFidelities_() {
+
+        std::vector<double> transcriptFidelities(_transcripts.size(), 0.0);
+        tbb::parallel_for( size_t(0), _transcripts.size(),
+            [this, &transcriptFidelities]( TranscriptIDT tid ) {
+                double sumDiff = 0.0;
+                auto ts = this->_transcripts[tid];
+                for ( auto& b : ts->binMers ) {
+                    auto diff = (this->kmerBiases_[b.first] * b.second) - ts->mean;
+                    //auto diff = (this->kmerBiases_[b.first] * b.second) - this->averageCount(ts);
+                    sumDiff += diff*diff;
+                }
+                transcriptFidelities[tid] = std::sqrt(sumDiff / ts->binMers.size());
+            });
+        
+        auto totalFidelity = std::accumulate(transcriptFidelities.begin(), transcriptFidelities.end(), 0.0);
+        auto maxFidelity = *std::max_element(transcriptFidelities.begin(), transcriptFidelities.end());
+        auto averageFidelity = totalFidelity / transcriptFidelities.size();
+
+        std::cerr << "max fidelity = " << maxFidelity << "\n";
+
+        tbb::parallel_for( size_t(0), _transcriptsForKmer.size(),
+            [this, &transcriptFidelities, averageFidelity, maxFidelity]( KmerIDT kid ) -> void {
+                double sumT = 0.0; double sumK = 0.0;
+                double confidence = 0.0;
+                for( auto tid : this->_transcriptsForKmer[kid].transcripts ) {
+                    sumT += this->_transcripts[tid]->mean;
+                    sumK += this->_transcripts[tid]->binMers[kid];   
+                    confidence += transcriptFidelities[tid];// / averageFidelity;
+                }
+
+                confidence /= this->_transcriptsForKmer[kid].transcripts.size();
+                double alpha = 0.5; //std::min( 1.0, 10.0*averageFidelity / confidence);
+                double bias = sumK > 0.0 ? (sumT / sumK) : 0.0;
+                double prevBias = this->kmerBiases_[kid];
+                this->kmerBiases_[kid] = alpha * bias + (1.0 - alpha) * prevBias;
+            }
+        );
+
+    }
+
+    size_t _initialize( const std::vector<std::string> &transcriptFiles ) { 
+
+        char** fnames = new char*[transcriptFiles.size()];
+        size_t z{0};
+        size_t numFnames{0};
+        for ( auto& s : transcriptFiles ){
+            // Ugly, yes?  But this is not as ugly as the alternatives.
+            // The char*'s contained in fname are owned by the transcriptFiles
+            // vector and need not be manually freed.
+            fnames[numFnames] = const_cast<char*>(s.c_str());
+            ++numFnames;
+        }
+
+        // Create a jellyfish parser
+        jellyfish::parse_read parser( fnames, fnames+numFnames, 1000);
+
+        // So we can concisely identify each transcript
+        TranscriptIDT transcriptIndex {0};
+
+        size_t numActors = 12;
+        std::vector<std::thread> threads;
+
+        _transcripts.resize( _transcriptGeneMap.numTranscripts(), nullptr );
+
+        tbb::parallel_for( size_t{0}, _transcriptGeneMap.numTranscripts(),
+            [this](size_t tid) -> void {
+             auto procRead = new TranscriptInfo {
+                btree::btree_map<KmerIDT, KmerQuantityT>(),
+                0.0,
+                0,
+                0
+                };
+              this->_transcripts[tid] = procRead;
+            }
+        );
+
+        _transcriptsForKmer.resize( _transcriptHash.size() );
+        kmerBiases_.resize(_transcriptHash.size(), 1.0);
+
+        bool done {false};
+        std::atomic<size_t> numRes {0};
+        std::atomic<size_t> nworking{numActors-1};
+
+        // Start the thread that will print the progress bar
+        threads.push_back( std::thread( [&numRes, &nworking, this] () {
+            auto numJobs = this->_transcriptGeneMap.numTranscripts();
+            ez::ezETAProgressBar show_progress(numJobs);
+            //boost::progress_display show_progress( numJobs );
+            size_t lastCount = numRes;
+            show_progress.start();
+            while ( lastCount < numJobs and nworking > 0 ) {
+                auto diff = numRes - lastCount;
+                if ( diff > 0 ) {
+                    show_progress += static_cast<unsigned int>(diff);
+                    lastCount += diff;
+                }
+                boost::this_thread::sleep_for(boost::chrono::milliseconds(1010));
+            }
+        }) );
+
+        std::cerr << "Processing transcripts\n";
+
+        // Start the desired number of threads to parse the transcripts
+        // and build our data structure.
+        for (size_t i = 0; i < numActors - 1; ++i) {
+
+            threads.push_back( std::thread( 
+                [&numRes, &parser, &nworking, this]() -> void {
+
+                    // Each thread gets it's own stream
+                    jellyfish::parse_read::read_t* read;
+                    jellyfish::parse_read::thread stream = parser.new_thread();
+
+                    while ( (read = stream.next_read()) ) {
+                        ++numRes;
+                        
+                        std::string header(read->header, read->hlen);
+                        size_t d = std::min(header.length()-1, header.find(' '));
+                        std::string tname( header.substr(0,d) );
+
+                        auto biases = this->biasIndex_.getBiases(tname);
+
+                        // Strip the newlines from the read and put it into a string (newSeq)
+                        std::string seq(read->seq_s, std::distance(read->seq_s, read->seq_e) - 1 );
+                        auto newEnd  = std::remove( seq.begin(), seq.end(), '\n' );
+                        auto readLen = std::distance( seq.begin(), newEnd );
+                        std::string newSeq(seq.begin(), seq.begin() + readLen);
+
+                        // The set of kmers in this transcript
+                        //auto ts = new TranscriptKmerSet {transcriptIndex, {}};
+                        // We know how many kmers there will be, so allocate the space
+                        /*
+                        auto procRead = new TranscriptInfo {
+                            btree::btree_map<KmerIDT, KmerQuantityT>(),
+                            0.0,
+                            readLen,
+                            0
+                        };
+                        */
+                        
+                        // Lookup the ID of this transcript in our transcript -> gene map
+                        auto transcriptIndex = this->_transcriptGeneMap.findTranscriptID(tname); 
+                        bool valid = ((transcriptIndex != this->_transcriptGeneMap.INVALID) and 
+                                      (readLen > this->_merLen));
+                        
+                        if ( not valid ) {
+                            //this->_transcripts[transcriptIndex] = procRead;
+                            continue; 
+                        }
+                        
+                        auto procRead = this->_transcripts[transcriptIndex];
+                        procRead->length = readLen;
+                        auto geneIndex = this->_transcriptGeneMap.gene(transcriptIndex);
+
+                        size_t numKmers {readLen - this->_merLen + 1};
+                        // The binary representation of the kmers in this transcript
+                        //btree::btree_map<KmerIDT, KmerQuantityT> binMers;
+
+                        // Iterate over the kmers
+                        ReadLengthT effectiveLength(0);
+                        float weightedLen = 0.0;
+                        size_t coverage = _merLen;
+                        auto INVALID = this->_transcriptHash.INVALID;
+                        for ( auto offset : boost::irange( size_t(0), numKmers ) ) { 
+                            // the kmer and it's uint64_t representation
+                            try {
+                                auto mer = newSeq.substr( offset, this->_merLen );                            
+                            auto binMer = jellyfish::parse_dna::mer_string_to_binary( mer.c_str(), _merLen );
+                            auto rcBinMer = jellyfish::parse_dna::reverse_complement( binMer, _merLen );
+                            binMer = std::min(binMer, rcBinMer);
+
+                            auto binMerId = this->_transcriptHash.id(binMer);
+                            // Only count and track kmers which should be considered
+                            //if ( this->_considered(binMer) ) {
+                            if ( binMerId != INVALID and this->_considered(binMerId) ) {
+                                weightedLen += (1.0 / this->_transcriptHash.atIndex(binMerId));
+                                effectiveLength += coverage;
+                                coverage = 1;
+                                //binMers[binMer] += 1.0;
+                                //procRead->binMers[binMerId] += 1.0;
+                                procRead->binMers[binMerId] += 1.0 / biases[offset];
+                                
+                                this->_transcriptsForKmer[binMerId].transcripts.push_back(transcriptIndex);
+                                this->_transcriptsForKmer[binMerId].genes.push_back(geneIndex);
+                            } else {
+                                coverage += (coverage < _merLen) ? 1 : 0;
+                            }
+
+                            } catch ( std::exception& e ) { 
+                                std::cerr << "trying to take kmer starting at " << offset << 
+                                ", length of read is " << readLen << " merLen is " << this->_merLen << "\n";
+                                std::cerr << "read is " << newSeq << "\n";
+                                std::cerr << e.what() << "\n";
+                                std::exit(1);
+                            }
+
+                        }
+                        procRead->effectiveLength = numKmers;
+
+                        /*
+                        auto procRead = new TranscriptInfo {
+                            binMers, // the set of kmers
+                            0.0, // initial desired mean
+                            readLen, // real transcript length
+                            effectiveLength // effective transcript length
+                        };*/
+
+                        //this->_transcripts[transcriptIndex] = procRead;
+                    }
+                    nworking--;
+            }) );
+
+        }
+
+        // Wait for all of the threads to finish
+        for ( auto& thread : threads ){ thread.join(); }
+
+        numRes = 0;
+        tbb::task_scheduler_init tbb_init;
+
+        std::cerr << "\n\nDetermining gene-promiscuous kmers ... ";
+        
+        //tbb::parallel_for_each( _transcriptsForKmer.begin(), _transcriptsForKmer.end(),
+        tbb::parallel_for( size_t(0), size_t(_transcriptsForKmer.size()),
+            [&numRes, this]( size_t idx ) { //KmerMapT::value_type& kt ) {
+
+                /*
+                //auto& transcripts = kt.second;
+                */
+                
+                // Uniqify the transcripts
+                auto& transcripts = this->_transcriptsForKmer[idx].transcripts;
+                std::sort(transcripts.begin(), transcripts.end());
+                auto it = std::unique(transcripts.begin(), transcripts.end()); 
+                transcripts.resize( it - transcripts.begin() );
+                
+                bool filterGenePromiscuous = false;
+
+                if (filterGenePromiscuous) {
+                    auto numGenes = this->_transcriptsForKmer[idx].genes.size();
+                    if (numGenes > 1) {
+                        auto geneId = this->_transcriptsForKmer[idx].genes[0];
+                        for ( auto i : boost::irange(size_t(1), this->_transcriptsForKmer[idx].genes.size()) ) {
+                          auto id = this->_transcriptsForKmer[idx].genes[i];
+                          if( id != geneId) { 
+                            this->_genePromiscuousKmers.insert(idx);
+                            break; 
+                          }
+                        }
+                    }
+                }
+    
+
+                // auto geneId = this->_transcriptGeneMap.gene(transcripts[0]);
+                // for ( auto i : boost::irange(size_t(1), transcripts.size()) ) {
+                //     auto id = this->_transcriptGeneMap.gene(transcripts[i]);
+                //     if( id != geneId) { this->_genePromiscuousKmers.insert(kt.first); break; }
+                // }
+
+                /*
+                std::unordered_set<uint32_t> geneSet;
+                for ( auto t : transcripts ){
+                    geneSet.insert(this->_transcriptGeneMap.gene(t));
+                }
+                if ( geneSet.size() > 1 ){ this->_genePromiscuousKmers.insert(kt.first); }
+                */
+                ++numRes;
+            }
+        );
+
+        auto gpfrac = _genePromiscuousKmers.size() / static_cast<double>(_transcriptsForKmer.size());
+        std::cerr << "There were " << _genePromiscuousKmers.size() << " (" << 100 * gpfrac << "%) gene promiscuous kmers\n";
+
+        std::cerr << "Computing promiscuity rates\n";
+        kmerPromiscuities_.resize(_transcriptsForKmer.size());
+        tbb::parallel_for( size_t{0}, kmerPromiscuities_.size(),
+            [this]( KmerIDT kid ) -> void { this->kmerPromiscuities_[kid] = this->_weight(kid); }
+        );
+
+        /**
+         * gene-promiscuous kmers can never be added to a transcript's counts, so
+         * it's unfair to consider them in the transcripts effective length. 
+         */
+        std::for_each( _genePromiscuousKmers.begin(), _genePromiscuousKmers.end(),
+            [this]( KmerIDT kmerId ) { 
+                for ( auto tid : _transcriptsForKmer[kmerId].transcripts ) {
+                    _transcripts[tid]->effectiveLength -= 1.0;
+                }
+            });
+
+        std::cerr << "done\n";
+
+        std::cerr << "computing number of mapped (usable) reads\n";
+        size_t mappedReads = 0;
+        for ( auto kidx : boost::irange(size_t(0), _transcriptsForKmer.size()) ) {
+            if ( _genePromiscuousKmers.find(kidx) == _genePromiscuousKmers.end() ) {
+                mappedReads += _readHash.atIndex(kidx);
+            }
+        }
+
+        //std::cerr << "computing kmer compatibility graph\n";
+        //extractKmerTranscriptCompatibilityGraph();
+        //std::cerr << "done\n";
+        return mappedReads;
+    }
+
+    void _dumpCoverage( const std::string &cfname ) {
+        typedef std::string* StringPtr;
+
+        size_t numTrans = _transcripts.size();
+        size_t numProc = 0;
+        std::ofstream ofile(cfname);
+
+        ofile << "# num_transcripts\n";
+        ofile << "# transcript_name_{1} num_kmers_{1} count_1 count_2 ... count_{num_kmers}\n";
+        ofile << "# ... \n";
+        ofile << "# transcript_name_{num_transcripts} num_kmers_{num_transcripts} count_1 count_2 ... count_{num_kmers_{num_transcripts}}\n";
+
+        ofile << _transcripts.size() << "\n";
+
+        std::cerr << "Dumping coverage statistics to " << cfname << "\n";
+
+        boost::lockfree::queue<StringPtr> covQueue(_transcripts.size());
+        
+        tbb::parallel_for( size_t{0}, _transcripts.size(),
+            [this, &covQueue] (size_t index) -> void {
+
+                auto td = this->_transcripts[index];
+                
+                std::stringstream ostream;
+                ostream << this->_transcriptGeneMap.transcriptName(index) << " " << td->binMers.size();
+                for ( auto bm : td->binMers ) {
+                    ostream << " " << bm.second;
+                }
+                ostream << "\n";
+                std::string* ostr = new std::string(ostream.str());
+                while(!covQueue.push(ostr));
+            }
+        );
+
+
+                        
+        ez::ezETAProgressBar pb(_transcripts.size());
+        pb.start();
+
+        std::string* sptr = nullptr;
+        while ( numProc < numTrans ) {
+            while( covQueue.pop(sptr) ) {
+                ofile << (*sptr);
+                ++pb;
+                ++numProc;
+                delete sptr;
+            }
+        }
+
+        ofile.close();
+
+    }
+
+
+
+    std::vector<double> _estimateIsoformNNLS( const size_t geneID, const size_t mappedReads ) {
+
+        typedef std::vector<std::vector<double>> IsoOptMatrix;
+        typedef std::vector<double> IsoOptVector;
+    
+        auto transcripts = _transcriptGeneMap.transcriptsForGene( geneID );
+        size_t numTranscripts = transcripts.size();
+
+        std::unordered_map<uint64_t, size_t> kmerIDs;
+        std::unordered_set<uint64_t> geneKmers;
+
+        // reindex the kmers for this gene
+        for ( size_t i = 0; i < transcripts.size(); ++i ) {
+            for ( auto kc : _transcripts[ transcripts[i] ]->binMers ) {
+                auto kmer = kc.first;
+                auto kmerIt = geneKmers.find(kmer);
+                
+                if ( kmerIt == geneKmers.end() ) {
+                    kmerIDs[kmer] = geneKmers.size();
+                    geneKmers.insert(kmer);
+                }
+            }            
+        }
+
+
+        size_t numKmers = geneKmers.size();
+        if ( numKmers == 0 ) {
+            return IsoOptVector();
+        }
+
+
+        // Create a numTranscripts x numKmers matrix to hold the "rates"
+        IsoOptVector x(numTranscripts, 0.0);
+        IsoOptMatrix A(numTranscripts, IsoOptVector(numKmers, 0.0) );
+        IsoOptVector counts(numKmers, 0.0);
+        double numReads = 0.0;
+
+        std::vector<double> numMappedReads(transcripts.size(), 0.0);
+
+        for ( size_t i = 0; i < transcripts.size(); ++i ) {
+            for ( auto kc : _transcripts[ transcripts[i] ]->binMers ) {
+                auto kmer = kc.first; auto count = kc.second;
+                size_t j = kmerIDs[kmer];
+                A[i][j] = count;
+                numMappedReads[i] += count;
+                if ( counts[j] == 0 ) {
+                    auto promIt = _genePromiscuousKmers.find(kmer);
+                    double normFact = (promIt == _genePromiscuousKmers.end() ) ? 1.0 : 0.0;
+                    counts[j] = _readHash.atIndex(kmer) * normFact;
+                    numReads += counts[j];
+                }
+            }
+        }
+
+
+        std::unique_ptr<IsoOptMatrix> collapsedA;
+        std::unique_ptr<IsoOptVector> collapsedCounts;
+        matrix_tools::collapseIntoCategories(A, counts, collapsedA, collapsedCounts);
+
+        std::vector<double> cc(collapsedCounts->size(), 0.0);
+        for( size_t i = 0; i < cc.size(); ++i ) { cc[i] = (*collapsedCounts)[i]; }
+
+        auto nr = collapsedA->size();
+        auto nc = (*collapsedA)[0].size();
+
+        typedef Eigen::SparseMatrix<double> SpMat; // declares a column-major sparse matrix type of double
+        typedef Eigen::Triplet<double> T;
+
+        auto M = std::unique_ptr<SpMat>(new SpMat(nr,nc));
+        std::vector<T> trips;
+
+        for( size_t r = 0; r < nr; ++r ) {
+            for( size_t c = 0; c < nc; ++c ) {
+                if ( (*collapsedA)[r][c] > 0 ) {
+                    trips.push_back( T(r,c,(*collapsedA)[r][c]) );
+                }
+            }
+        }
+        M->setFromTriplets(trips.begin(), trips.end());
+                
+        /*
+        auto dupCols = matrix_tools::markDependentColumns(*M);
+        if (dupCols.size() > 0) {
+            std::vector<T> triplets;
+
+            auto mCol = M->cols() - dupCols.size();
+            size_t dupPtr = 0;
+            for (size_t k = 0; k < M->outerSize(); ++k) {
+                // If this is a colum we skip, then continue this loop and
+                // move to the next column to skip
+                if ( dupPtr < dupCols.size() && k == dupCols[dupPtr] ) {
+                    ++dupPtr;
+                    continue;
+                }
+                // The column index is the column index of A minus the number of
+                // columns we've skipped so far
+                size_t colInd = k - dupPtr;
+                assert(colInd < mCol);
+                // Add all rows of this column to the new matrix
+                for (SpMat::InnerIterator it(*M, k); it; ++it) {
+                    triplets.push_back( {it.row(), colInd, it.value()} );
+                }
+            }
+            assert(M->cols() > dupCols.size());
+            auto B = std::unique_ptr<SpMat>(new SpMat(M->rows(), M->cols() - dupCols.size()));
+            B->setFromTriplets(triplets.begin(), triplets.end());
+            std::swap(M, B);
+        }
+       */ 
+       
+        auto r = matrix_tools::nnlsSolve(*M, cc);
+        for( auto i : boost::irange(size_t{0}, r.size()) ){
+            r[i] *= numMappedReads[i];
+        }
+
+        return r;
+    }
+
+
+
+    std::vector<double> _estimateIsoformAbundance( const size_t geneID, const size_t mappedReads
+      //std::vector<std::vector<uint64_t>> &transcriptKmers,
+      //std::vector<double>& transcriptLengths,
+      //size_t mappedReads,
+      //tbb::concurrent_unordered_map<uint64_t, uint32_t>& genePromiscuousKmers
+      ) {
+
+        typedef std::vector<std::vector<double>> IsoOptMatrix;
+        typedef std::vector<double> IsoOptVector;
+	
+        auto transcripts = _transcriptGeneMap.transcriptsForGene( geneID );
+        size_t numTranscripts = transcripts.size();
+
+        std::unordered_map<uint64_t, size_t> kmerIDs;
+        std::unordered_set<uint64_t> geneKmers;
+        // reindex the kmers for this gene
+        for ( size_t i = 0; i < transcripts.size(); ++i ) {
+            for ( auto kc : _transcripts[ transcripts[i] ]->binMers ) {
+                auto kmer = kc.first;
+                auto kmerIt = geneKmers.find(kmer);
+                if ( kmerIt == geneKmers.end() ) {
+                    kmerIDs[kmer] = geneKmers.size();
+                    geneKmers.insert(kmer);
+                }
+            }            
+        }
+
+
+	size_t numKmers = geneKmers.size();
+        if ( numKmers == 0 ) {
+            return IsoOptVector();
+        }
+
+
+	// Create a numTranscripts x numKmers matrix to hold the "rates"
+        IsoOptVector x(numTranscripts, 0.0);
+        IsoOptMatrix A(numTranscripts, IsoOptVector(numKmers, 0.0) );
+        IsoOptVector counts(numKmers, 0.0);
+        double numReads = 0.0;
+
+        for ( size_t i = 0; i < transcripts.size(); ++i ) {
+            for ( auto kc : _transcripts[ transcripts[i] ]->binMers ) {
+                auto kmer = kc.first; auto count = kc.second;
+                size_t j = kmerIDs[kmer];
+                A[i][j] = mappedReads;//1.0;
+                if ( counts[j] == 0 ) {
+                    auto promIt = _genePromiscuousKmers.find(kmer);
+                    double normFact = (promIt == _genePromiscuousKmers.end() ) ? 1.0 : 0.0;
+                    counts[j] = _readHash.atIndex(kmer) * normFact;
+                    numReads += counts[j];
+                }
+            }
+        }
+
+
+        std::unique_ptr<IsoOptMatrix> collapsedA;
+        std::unique_ptr<IsoOptVector> collapsedCounts;
+        matrix_tools::collapseIntoCategories(A, counts, collapsedA, collapsedCounts);
+        
+        for (size_t i = 0; i < collapsedA->size(); ++i) {
+            auto tlen = _transcripts[ transcripts[i] ]->length;
+            for (size_t j = 0; j < (*collapsedA)[0].size(); ++j) {
+                (*collapsedA)[i][j] = (*collapsedA)[i][j] / ( tlen * mappedReads / 1000000000.0 );
+                //(*collapsedA)[i][j] = (*collapsedA)[i][j] * tlen / 1000.0 * static_cast<double>(mappedReads) / 1000000.0;  
+            }
+        }
+
+        solve_likelihood( *collapsedA, *collapsedCounts, x, false );
+        return x;
+    }
+
+public:
+    /**
+     * Construct the solver with the read and transcript hashes
+     */
+    IterativeOptimizer( ReadHashT &readHash, TranscriptHashT  &transcriptHash, TranscriptGeneMap& transcriptGeneMap,
+                        BiasIndex& biasIndex ) : 
+        _readHash(readHash), _transcriptHash(transcriptHash), _merLen(transcriptHash.kmerLength()), 
+        _transcriptGeneMap(transcriptGeneMap), biasIndex_(biasIndex) {}
+
+
+    KmerQuantityT optimize( const std::vector<std::string> &transcriptFiles, const std::string &outputFile, size_t numIt, double minMean ) {
+
+        _initialize(transcriptFiles);
+
+        // Holds results that we will use to update the index map
+        size_t numActors = 1;
+
+        KmerQuantityT globalError {0.0};
+        bool done {false};
+        std::atomic<size_t> numJobs {0};
+        std::atomic<size_t> completedJobs {0};
+        std::vector<KmerIDT> kmerList( _transcriptsForKmer.size(), 0 );
+        size_t idx = 0;
+
+        tbb::task_scheduler_init tbb_init;
+
+
+        std::cerr << "Computing initial coverage estimates ... ";
+
+        std::default_random_engine generator;
+        std::normal_distribution<double> distribution(5.0,2.0);
+
+        tbb::parallel_for( size_t(0), size_t(_transcriptGeneMap.numTranscripts()),
+        [this, &distribution, &generator]( size_t tid ) -> void {
+            auto transcriptData = this->_transcripts[tid];
+
+            for ( auto & kv : transcriptData->binMers ) {
+                auto kmer = kv.first;
+                if ( this->_genePromiscuousKmers.find(kmer) == this->_genePromiscuousKmers.end() ){
+                    // count is the number of times kmer appears in transcript (tid)
+                    auto count = kv.second;
+                    kv.second = count * this->_readHash.atIndex(kmer) * this->_weight(kmer);
+                }
+            }
+            transcriptData->mean = this->_computeMean( transcriptData );
+            //transcriptData->mean = distribution(generator);
+            //this->_computeWeightedMean( transcriptData );
+        }
+        );
+
+        //normalizeTranscriptMeans_();
+
+        std::cerr << "done\n";
+
+        size_t outerIterations = 1;
+        for ( size_t oiter = 0; oiter < outerIterations; ++oiter ) {
+
+        for ( size_t iter = 0; iter < numIt; ++iter ) {
+
+            auto reqNumJobs = _transcriptsForKmer.size();
+
+            std::cerr << "iteraton: " << iter << "\n";
+
+            globalError = 0.0;
+            numJobs = 0;
+            completedJobs = 0;
+
+            auto pbthread = std::thread( 
+                [&completedJobs, reqNumJobs]() -> bool {
+                    auto prevNumJobs = 0;
+                    ez::ezETAProgressBar show_progress(reqNumJobs);
+                    //boost::progress_display show_progress( reqNumJobs );
+                    show_progress.start();
+                    while ( prevNumJobs < reqNumJobs ) {
+                        if ( prevNumJobs < completedJobs ) {
+                            show_progress += completedJobs - prevNumJobs;
+                        }
+                        prevNumJobs = completedJobs.load();
+
+                        boost::this_thread::sleep_for(boost::chrono::seconds(1));
+                    }
+                    show_progress.done();
+                    return true;
+                }
+            );
+
+            tbb::parallel_for( size_t(0), size_t(_transcriptsForKmer.size()),
+                // for each kmer    
+                [&kmerList, &completedJobs, this]( size_t kid ) {
+
+                        auto kmer = kid;
+                        if ( this->_genePromiscuousKmers.find(kmer) == this->_genePromiscuousKmers.end() ){
+                        
+                            auto &transcripts = this->_transcriptsForKmer[kmer].transcripts;
+                            if ( transcripts.size() > 1 ) {
+
+                                // for each transcript containing this kmer
+                                double totalMass = 0.0;
+                                for ( auto tid : transcripts ) {
+                                    totalMass += this->_transcripts[tid]->mean;
+                                }
+
+
+                                if ( totalMass > 0.0 ) {
+                                    double norm = 1.0 / totalMass;
+                                    for ( auto tid : transcripts ) {
+                                        if ( this->_transcripts[tid]->mean > 0.0 ) {
+                                            this->_transcripts[tid]->binMers[kmer] =
+                                            this->_transcripts[tid]->mean * norm * kmerBiases_[kmer] * this->_readHash.atIndex(kmer);
+                                        }
+                                    }
+                                }
+
+                            }
+                        }
+                        ++completedJobs;
+                    }
+            );
+
+            pbthread.join();
+            //tp.wait();
+
+            // reset the job counter
+            completedJobs = 0;
+
+            double delta = 0.0;
+            double norm = 1.0 / _transcripts.size();
+
+            std::vector<KmerQuantityT> prevMeans( _transcripts.size(), 0.0 );
+            tbb::parallel_for( size_t(0), size_t(_transcripts.size()),
+                [this, &prevMeans]( size_t tid ) -> void { prevMeans[tid] = this->_transcripts[tid]->mean; });
+
+            std::cerr << "\ncomputing new means ... ";
+            size_t discard = 0;
+            // compute the new mean for each transcript
+            tbb::parallel_for( size_t(0), size_t(_transcripts.size()),
+                [this, iter, numIt, norm, minMean, &discard]( size_t tid ) -> void {
+                        // this thread claimed myJobID;
+                        auto ts = this->_transcripts[tid];
+                        auto tsNorm = 1.0;//(ts->effectiveLength > 0.0) ? 1.0 / std::sqrt(ts->effectiveLength) : 1.0;
+                        //ts->mean = tsNorm * this->_computeWeightedMean( ts );
+                        //ts->mean = tsNorm * this->averageCount( ts );
+                        ts->mean = tsNorm * this->_computeMean( ts );
+                }
+            );
+
+            //normalizeTranscriptMeans_();
+            for( auto tid : boost::irange(size_t{0}, prevMeans.size()) ){
+                delta += std::abs( _transcripts[tid]->mean - prevMeans[tid] );
+            }
+
+            std::cerr << "done\n";
+            std::cerr << "total variation in mean = " << delta << "\n";
+            std::cerr << "discarded " << discard << " transcripts in this round whose mean was below " << minMean << "\n";
+        }
+
+        std::cerr << "end of outer iteration " << oiter << " recomputing biases\n";
+        computeKmerFidelities_();
+    }
+
+        std::cerr << "Writing output\n";
+        ez::ezETAProgressBar pb(_transcripts.size());
+        pb.start();
+
+        std::ofstream ofile( outputFile );
+        size_t index = 0;
+        ofile << "Transcript" << '\t' << "Length" << '\t' << "Effective Length" << '\t' << "Weighted Mapped Reads" << '\n';
+        for ( auto ts : _transcripts ) {
+            ofile << _transcriptGeneMap.transcriptName(index) << '\t' << ts->length << '\t' <<
+                    ts->effectiveLength << '\t' << _computeSum(ts) << "\n";
+            ++index;
+            ++pb;
+        }
+        ofile.close();
+
+        auto writeCoverageInfo = true;
+        if ( writeCoverageInfo ) {
+            std::string cfname("transcriptCoverage.txt");
+            _dumpCoverage( cfname );
+        }
+
+    }
+
+    KmerQuantityT optimizePoisson( const std::vector<std::string> &transcriptFiles, const std::string &outputFile ) {
+
+        auto mappedReads = _initialize(transcriptFiles);
+
+        // Holds results that we will use to update the index map
+        size_t numActors = 1;
+
+        KmerQuantityT globalError {0.0};
+        bool done {false};
+        std::atomic<size_t> numJobs {0};
+        std::atomic<size_t> completedJobs {0};
+        std::mutex nnlsmutex;
+        std::vector<KmerIDT> kmerList( _transcriptsForKmer.size(), 0 );
+        size_t idx = 0;
+
+
+        std::vector<double> abundances( _transcriptGeneMap.numTranscripts(), 0.0 );
+
+        _transcriptGeneMap.needReverse();
+        std::atomic<size_t> numGenesProcessed{0};
+
+        auto abundanceEstimateProgress = std::thread( [&numGenesProcessed, this] () -> void {
+            auto numJobs = this->_transcriptGeneMap.numGenes();
+            ez::ezETAProgressBar pbar(numJobs);
+            pbar.start();
+            size_t lastCount = numGenesProcessed;
+            while ( lastCount < numJobs ) {
+                auto diff = numGenesProcessed - lastCount;
+                if ( diff > 0 ) {
+                    pbar += diff;
+                    lastCount += diff;
+                }
+                boost::this_thread::sleep_for(boost::chrono::milliseconds(1010));
+            }
+        });
+        std::cerr << "Processing abundances\n";
+
+        // process each gene and solve the estimation problem
+        tbb::parallel_for( size_t {0}, _transcriptGeneMap.numGenes(), 
+            [this, mappedReads, &abundances, &numGenesProcessed]( const size_t & geneID ) {
+                //auto abundance = this->_estimateIsoformAbundance(geneID, mappedReads);
+                auto abundance = this->_estimateIsoformNNLS(geneID, mappedReads);
+                auto transcripts = this->_transcriptGeneMap.transcriptsForGene(geneID);
+
+                for ( size_t i = 0; i < abundance.size(); ++i ) {
+                    abundances[ transcripts[i] ] = abundance[i];
+                }
+                ++numGenesProcessed;
+            }
+        );
+
+        /*
+        for( size_t geneID = 0; geneID < _transcriptGeneMap.numGenes(); ++geneID ) {
+                //auto abundance = this->_estimateIsoformAbundance(geneID, mappedReads);
+                std::cerr << "computing abundance for " << geneID << "  . . . ";
+                auto abundance = this->_estimateIsoformNNLS(geneID, mappedReads);
+                std::cerr << "done, size = " << abundance.size() << "\n";
+                auto transcripts = this->_transcriptGeneMap.transcriptsForGene(geneID);
+
+                for ( size_t i = 0; i < abundance.size(); ++i ) {
+                    abundances[ transcripts[i] ] = abundance[i];
+                }
+                ++numGenesProcessed;
+        }
+        */
+       
+        abundanceEstimateProgress.join();
+
+        std::cerr << "Writing output\n";
+        ez::ezETAProgressBar pb(_transcripts.size());
+        pb.start();
+
+        std::ofstream ofile( outputFile );
+        size_t index = 0;
+        ofile << "Transcript" << '\t' << "Length" << '\t' << "Effective Length" << '\t' << "Abundance" << '\n';
+        for ( auto ts : _transcripts ) {
+            ofile << _transcriptGeneMap.transcriptName(index) << '\t' << ts->length << '\t' <<
+                    ts->effectiveLength << '\t' << abundances[index] << "\n";
+            ++index;
+            ++pb;
+        }
+        ofile.close();
+
+        auto writeCoverageInfo = false;
+        if ( writeCoverageInfo ) {
+            std::string cfname("transcriptCoverage.txt");
+            _dumpCoverage( cfname );
+        }
+
+    }
+
+
+};
+
+#endif // ITERATIVE_OPTIMIZER_HPP
diff --git a/include/obsolete/KFITF.hpp b/include/obsolete/KFITF.hpp
new file mode 100755
index 0000000..1f25113
--- /dev/null
+++ b/include/obsolete/KFITF.hpp
@@ -0,0 +1,158 @@
+#ifndef KFITF_HPP
+#define KFITF_HPP
+
+#include <atomic>
+#include <cmath>
+#include <cstdlib>
+#include <cstdint>
+#include <mutex>
+#include <unordered_set>
+
+#include <jellyfish/sequence_parser.hpp>
+#include <jellyfish/parse_read.hpp>
+#include <jellyfish/mer_counting.hpp>
+#include <jellyfish/misc.hpp>
+#include <jellyfish/compacted_hash.hpp>
+
+
+/** Kmer Frequence Inverse Transcript Frequency (KFITF) calculator
+ *
+ * We consider each k-mer as a "term" and each transcript as a "document".
+ * The purpose of this class is to compute the TF-IDF (Term Frequency-Inverse Document Frequency)
+ * of each kmer transcript pair.  Since the kmer frequency (KF) is on a per transcript basis,
+ * it can be computed independently in each transcript and need not be stored in memory
+ * after this transcript has been processed.  The inverse transcript freuqency (ITF), however
+ * requires knowing the number of different transcripts in which a k-mer is located.  This can
+ * be computed in a single pass over the data, but must be stored in memory.
+ *
+ */
+
+
+class KFITFCalculator {
+  private:
+  typedef jellyfish::invertible_hash::array<uint64_t, atomic::gcc, allocators::mmap> hash_array;
+
+  size_t _merLen;
+  size_t _numTranscripts;
+  hash_array _hash;
+
+  public:
+  KFITFCalculator( size_t merLen ) :
+    _merLen( merLen ),
+    _numTranscripts( 0 ),
+    _hash( hash_array(1000000, // Size of hash. Will be rounded up to a power of 2
+                     2*_merLen,      // Key length in bits (2 * mer length)
+                     30,       // Value length in bits
+                     126,     // Max # of reprobe
+                     jellyfish::quadratic_reprobes // Reprobe policy
+                     ) ) {
+
+    //Test out the hash
+    std::string mer{"AAAAAATTGTGTATGGC"};
+    auto binMer = jellyfish::parse_dna::mer_string_to_binary( mer.c_str(), _merLen );
+    for ( size_t i = 0; i < 100; ++i ) {
+      auto k = rand() % 100000 + 1;
+      for ( size_t j = 0; j < k; ++j ) {
+        _hash.add(binMer,1);
+      }
+      uint64_t val{0}; _hash.get_val(binMer, val);
+      if( val != k ) {
+        std::cerr << "val  = " << val << " but it should be  " << k  << "\n";
+        std::abort();
+      }
+      _hash.map(binMer,0);
+    }
+  }
+
+  template <typename StreamT>
+  void computeITF( StreamT& stream ) {
+
+    using std::string;
+
+    std::mutex resLock;
+
+    size_t numActors = 26;
+    // Thread pool to count membership of multiple transcripts simultaneously
+    boost::threadpool::pool tp(numActors);
+
+    // Lock-free queue to hold kmer membership sets
+    boost::lockfree::fifo< std::unordered_set<uint64_t>* > q;
+
+    std::atomic<int> counter{0};
+
+    auto readResults = [this, &tp, &counter] () -> bool {
+
+      while ( tp.active() + tp.pending() > 1 ) {
+        std::cerr << counter.load() << "\n";
+        boost::this_thread::sleep_for(boost::chrono::milliseconds(250));
+      }
+      return true;
+    };
+
+    // schedule the consumer
+    tp.schedule(readResults);
+
+
+    // Read pointer
+    jellyfish::read_parser::read_t *read;
+    // Read the input data
+    while ( (read = stream.next_read()) ) {
+      std::string oseq(read->seq_s, std::distance(read->seq_s, read->seq_e)-1 );
+      std::string header( read->header, read->hlen );
+
+      auto task = [&resLock, &q, &counter, oseq, header, this]() -> void {
+        std::string seq = oseq;
+        auto newEnd  = std::remove( seq.begin(), seq.end(), '\n' );
+        auto readLen = std::distance( seq.begin(), newEnd );
+
+        std::unordered_map<uint64_t, size_t> occurringKmers;// = new std::unordered_set<uint64_t>;
+        auto splitPos = header.find('|');
+        auto geneName = header.substr(0, splitPos);
+        size_t offset = 0;
+        size_t mapped = 0;
+
+        while ( offset < readLen - this->_merLen )  {
+          auto mer = seq.substr( offset, this->_merLen );
+          auto binMer = jellyfish::parse_dna::mer_string_to_binary( mer.c_str(), this->_merLen );
+          // add this kmer to the set for this transcript
+          occurringKmers[binMer] += 1;//.insert( binMer );
+          ++offset;
+        }
+
+        //resLock.lock();
+        for ( const auto& kmer : occurringKmers ) {
+          // Increment the count of this kmer
+          this->_hash.add(kmer.first, kmer.second);
+        }
+        //resLock.unlock();
+
+        ++counter;
+        // Enqueue this transcript's kmer set
+        //resLock.lock();
+        //q.enqueue(occurringKmers);
+        //resLock.unlock();
+      }; // end task closure
+
+      tp.schedule(task);
+
+    } // end reading input
+
+    // Wait for the work to finish
+    tp.wait();
+    _numTranscripts = counter.load();
+  }
+
+  float itf( const std::string& mer ) {
+    auto binMer = jellyfish::parse_dna::mer_string_to_binary( mer.c_str(), _merLen );
+    uint64_t val{0}; _hash.get_val( binMer, val );
+    return 1.0 / (1.0f+val);
+    if( _numTranscripts < val ) {
+      std::cerr << "ERROR! Num Transcripts = " << _numTranscripts << ", but df(t,D) = " << val << "\n";
+      std::abort();
+    }
+    return std::log2( static_cast<float>(_numTranscripts) / (1.0f + val));
+  }
+
+};
+
+#endif //KFITF_HPP
diff --git a/include/obsolete/LinearSystemBuilder.hpp b/include/obsolete/LinearSystemBuilder.hpp
new file mode 100755
index 0000000..69b28f7
--- /dev/null
+++ b/include/obsolete/LinearSystemBuilder.hpp
@@ -0,0 +1,170 @@
+#ifndef LINEARSYSTEMBUILDER_HPP
+#define LINEARSYSTEMBUILDER_HPP
+
+#include <cstdint>
+#include <fstream>
+#include <vector>
+#include <unordered_map>
+#include <limits>
+#include <iomanip>
+#include <sstream>
+
+#include <jellyfish/sequence_parser.hpp>
+#include <jellyfish/parse_read.hpp>
+#include <jellyfish/mer_counting.hpp>
+#include <jellyfish/misc.hpp>
+#include <jellyfish/compacted_hash.hpp>
+
+/** This class builds the linear system that represents the
+ * transcript abundance problem.  Given the transcript database (T)
+ * and the read database (R), let K|R be the set of kmers occurring
+ * in R.  We then construct two quantities, A, a matrix and b, a vector.
+ * Let N = \card{K|R} and M = \card{T}, then A is an NxM matrix and b is
+ * an Nx1 vector.  Eventually, we will solve this system directly, but, for now,
+ * we'll write it to file so we can call an external solver on it.
+ */
+template <typename HashT>
+class LinearSystemBuilder {
+
+  private:
+  std::string _transcriptFile;
+  HashT _readHash;
+  std::unordered_map<uint64_t, uint64_t> _indexMap;
+  size_t _numTranscripts;
+
+  bool buildIndexMap() {
+    uint_t merLen = _readHash.get_mer_len();
+
+    const char* fnames[] = { _transcriptFile.c_str() };
+    jellyfish::parse_read parser( fnames, fnames+1, 100);
+    jellyfish::parse_read::thread stream = parser.new_thread();
+
+    // Read pointer
+    jellyfish::read_parser::read_t *read;
+    // Read the input data
+    while ( (read = stream.next_read()) ) {
+
+      // Ignore header for now
+      //std::string header( read->header, read->hlen );
+      std::string seq(read->seq_s, std::distance(read->seq_s, read->seq_e)-1 );
+
+      // Remove newlines from the read
+      auto newEnd  = std::remove( seq.begin(), seq.end(), '\n' );
+      auto readLen = std::distance( seq.begin(), newEnd );
+
+      size_t offset = 0;
+      size_t mapped = 0;
+      uint64_t indexSize = _indexMap.size();
+
+      while ( offset < readLen - merLen )  {
+        auto mer = seq.substr( offset, merLen );
+        auto binMer = jellyfish::parse_dna::mer_string_to_binary( mer.c_str(), merLen );
+        // add it if we don't find it
+        if ( _indexMap.find(binMer) == _indexMap.end() ) {
+          _indexMap[binMer] = indexSize;
+          // We've grown the index
+          ++indexSize;
+        }
+        ++offset;
+      } // end kmer loop
+
+      std::cerr << "processed " << _numTranscripts << " transcripts\n";
+
+      ++_numTranscripts;
+    } // end transcripts loop
+
+    return true;
+  }
+
+  public:
+  LinearSystemBuilder( std::string& transcriptFile, HashT& readHash) :
+    _transcriptFile( transcriptFile ), _readHash( readHash ), _numTranscripts( 0 ) {}
+
+  void writeToFiles( const std::string& matFileName, const std::string& bFileName) {
+    // Map each occurring kmer to an index
+    buildIndexMap();
+
+    auto numRows = _indexMap.size();
+    auto numCols = _numTranscripts;
+
+    {
+      // Put the right-hand side in a vector since we'll have
+      // to write it out in order
+      std::vector<float> b(numRows,0.0f);
+      for ( auto& rowInd : _indexMap ){ b[rowInd.second] = static_cast<float>(_readHash[rowInd.first]); }
+
+      // write it out
+      std::ofstream bFile(bFileName);
+      for ( auto be : b ) { bFile << be << "\n"; }
+      bFile.close();
+    }
+
+    uint_t merLen = _readHash.get_mer_len();
+    const char* fnames[] = { _transcriptFile.c_str() };
+    jellyfish::parse_read parser( fnames, fnames+1, 100);
+    jellyfish::parse_read::thread stream = parser.new_thread();
+
+    std::ofstream matFile(matFileName);
+
+    std::stringstream ss(std::stringstream::out);
+    ss << "SPARSE\n";
+    ss << numRows << " " << numCols <<  "\n";
+
+    matFile << ss.str();
+
+    size_t nnzFileOffset = ss.str().length();
+
+    ss.clear();
+    ss << std::numeric_limits<size_t>::max();
+    auto nnzLen = ss.str().length();
+    std::string placeHolder(nnzLen, ' ');
+    matFile << "\n";
+    //matFile << placeHolder << "\n";
+
+    size_t nnz{0};
+    size_t col{0};
+    // Read pointer
+    jellyfish::read_parser::read_t *read;
+    // Read the input data
+    while ( (read = stream.next_read()) ) {
+
+      // Ignore header for now
+      std::string header( read->header, read->hlen );
+      std::string seq(read->seq_s, std::distance(read->seq_s, read->seq_e)-1 );
+
+      // Remove newlines from the read
+      auto newEnd  = std::remove( seq.begin(), seq.end(), '\n' );
+      auto readLen = std::distance( seq.begin(), newEnd );
+
+      size_t offset = 0;
+      size_t mapped = 0;
+      uint64_t indexSize = _indexMap.size();
+
+      std::unordered_map<size_t, size_t> rowEntries;
+      while ( offset < readLen - merLen )  {
+        auto mer = seq.substr( offset, merLen );
+        auto binMer = jellyfish::parse_dna::mer_string_to_binary( mer.c_str(), merLen );
+        auto row = _indexMap[binMer];
+        rowEntries[row] += 1;
+        ++offset;
+      } // end kmer loop
+
+      // write out entries
+      for ( auto& rowVal : rowEntries ) {
+        matFile << col+1 << "\t" << rowVal.first+1 << "\t" << rowVal.second << "\n";
+        ++nnz;
+      }
+      std::cerr << "processed " << col << " columns\n";
+      ++col;
+
+    } // end transcripts loop
+
+    matFile.seekp(nnzFileOffset);
+    matFile << nnz << "\n";
+    matFile.close();
+  }
+
+
+};
+
+#endif //LINEARSYSTEMBUILDER_HPP
diff --git a/include/obsolete/PoissonSolver.hpp b/include/obsolete/PoissonSolver.hpp
new file mode 100755
index 0000000..b3b841e
--- /dev/null
+++ b/include/obsolete/PoissonSolver.hpp
@@ -0,0 +1,320 @@
+#ifndef POISSON_SOLVER_HPP
+#define POISSON_SOLVER_HPP
+
+//#include "math_utils.h"
+//#include "exon_junction_extractor.h"
+#include <vector>
+#include <iostream>
+
+class isoform_opt {
+
+const double epsilon = 1e-10;
+const int max_iteration = 10000;
+bool do_EM = false;
+
+/*double sum_matrix(const std::vector<std::vector<double> > &matrix) {
+	double result = 0;
+	for (int i = 0; i < (int)matrix.size(); i++)
+		for (int j = 0; j < (int)matrix[i].size(); j++)
+			result += matrix[i][j];
+	return result;
+}*/
+
+/*std::vector<std::vector<double> > inv_matrix(const std::vector<std::vector<double> > &matrix) {
+	int m = (int)matrix.size(), n = (int)matrix[0].size();
+	MATRIX input, *output;
+	input.init((short)m, (short)n);
+	for (int i = 0; i < m; i++)
+		for (int j = 0; j < n; j++)
+			input.put(i, j, matrix[i][j]);
+	output = input.inverse();
+	std::vector<std::vector<double> > result;
+	result.resize(m);
+	for (int i = 0; i < m; i++) {
+		result[i].resize(m);
+		for (int j = 0; j < m; j++)
+			result[i][j] = output->get(i,j);
+	}
+	delete output;
+	return result;
+}*/
+
+public:
+	bool verbose;
+	int M; //number of categories
+	int K; //number of isoforms
+// for parameters
+//	std::vector<double> N; //category read counts
+//	std::vector<std::vector<double> > A; //category sampling rate matrix; A[i][j] is the sampling rate for isoform i and category j
+
+	isoform_opt() {verbose = false;}
+
+	double log_likelihood(const std::vector<std::vector<double> > &A, const std::vector<double> &N, const std::vector<double> &X) {
+		double result = 0;
+		double temp1 = 0;	
+		for (int i = 0; i < M; i++) {
+			double temp2 = 0;
+			for (int j = 0; j < K; j++) {
+				temp2 += X[j]*A[j][i];
+			}
+			temp1 += temp2;
+		}
+		result -= temp1;
+		temp1 = 0;
+		for (int i = 0; i < M; i++) {
+			double temp2 = 0;
+			for (int j = 0; j < K; j++) {
+				temp2 += X[j] * A[j][i];
+			}
+			temp1 += log(temp2 + epsilon) * N[i];
+		}
+		result += temp1;
+		return result;
+	}
+
+	double grad_log_likelihood(const std::vector<std::vector<double> > &A, const std::vector<double> &N, const std::vector<double> &X, int k) {
+		double result = 0;
+		double temp1 = 0;	
+		for (int i = 0; i < M; i++) {
+			temp1 += A[k][i];
+		}
+		result -= temp1;
+		temp1 = 0;
+		for (int i = 0; i < M; i++) {
+			double temp2 = 0;
+			for (int j = 0; j < K; j++) {
+				temp2 += X[j] * A[j][i];
+			}
+			temp1 += N[i] * A[k][i] / (temp2 + epsilon);
+		}
+		result += temp1;
+		return result;
+	}
+
+	std::vector<double> grad_log_likelihood(const std::vector<std::vector<double> > &A, const std::vector<double> &N, const std::vector<double> &X) {
+		std::vector<double> grad;
+		grad.resize(K);
+		for (int i = 0; i < K; i++) {
+			grad[i] = grad_log_likelihood(A, N, X, i);
+		}
+		return grad;
+	}
+
+	std::vector<std::vector<double> > hessian_log_likelihood(const std::vector<std::vector<double> > &A, const std::vector<double> &N, const std::vector<double> &X) {
+		std::vector<std::vector<double> > hessian;
+		hessian.resize(K);
+		for (int i = 0; i < K; i++) hessian[i].resize(K);
+		std::vector<double> temp1;
+		temp1.resize(M);
+		for (int i = 0; i < M; i++) {
+			double temp2 = 0;
+			for (int j = 0; j < K; j++) {
+				temp2 += X[j] * A[j][i];
+			}
+			temp1[i] = N[i] / (temp2 + epsilon) / (temp2 + epsilon);
+		}
+		for (int k = 0; k < K; k++) {
+			for (int l = 0; l < K; l++) {
+				hessian[k][l] = 0;
+				for (int i = 0; i < M; i++) {
+					hessian[k][l] -= temp1[i] * A[k][i] * A[l][i];
+				}
+				if (k == l) hessian[k][l] -= 1e-6;
+			}
+		}
+		return hessian;
+	}
+
+	std::vector<std::vector<double> > obs_fisher(const std::vector<std::vector<double> > &A, const std::vector<double> &N, const std::vector<double> &X) {
+		std::vector<std::vector<double> > temp = hessian_log_likelihood(A, N, X);
+		for (int i = 0; i < K; i++)
+			for (int j = 0; j < K; j++)
+				temp[i][j] = -temp[i][j];
+		return temp;
+	}
+
+/*	std::vector<std::vector<double> > inv_obs_fisher(const std::vector<double> &X) {
+		return inv_matrix(obs_fisher(X));
+	}*/
+	
+	std::vector<std::vector<double> > fisher(const std::vector<std::vector<double> > &A, const std::vector<double> &N, const std::vector<double> &X) {
+		std::vector<std::vector<double> > fisher;
+		fisher.resize(K);
+		for (int i = 0; i < K; i++) fisher[i].resize(K);
+		std::vector<double> temp1;
+		temp1.resize(M);
+		for (int i = 0; i < M; i++) {
+			double temp2 = 0;
+			for (int j = 0; j < K; j++) {
+				temp2 += X[j] * A[j][i];
+			}
+			temp1[i] = 1 / (temp2 + epsilon);
+		}
+		for (int k = 0; k < K; k++) {
+			for (int l = 0; l < K; l++) {
+				fisher[k][l] = 0;
+				for (int i = 0; i < M; i++) {
+					fisher[k][l] += temp1[i] * A[k][i] * A[l][i];
+				}
+				if (k == l) fisher[k][l] += 1e-6;
+			}
+		}
+		return fisher;
+	}
+
+/*	std::vector<std::vector<double> > inv_fisher(const std::vector<double> &X) {
+		return inv_matrix(fisher(X));
+	}*/
+
+	double fmax_coord(const std::vector<std::vector<double> > &A, const std::vector<double> &N, std::vector<double> &X, int k) {
+		double f_value = log_likelihood(A, N, X);
+		double grad = grad_log_likelihood(A, N, X, k);
+		if (grad > 0) grad = 1; else if (grad < 0) grad = -1; else return f_value;
+		double x_old = X[k];
+		double step = 1e6;
+		int iteration = 0;
+		while (true) {
+			X[k] = x_old + step * grad;
+			if (X[k] < 0) X[k] = 0;
+			double new_f_value = log_likelihood(A, N, X);
+			if (new_f_value > f_value) return new_f_value;
+			if (step > epsilon) step /= 2; else break;
+			if (iteration < max_iteration) iteration++; else break;
+		}
+		//if (verbose && step <= epsilon) cout << "warning: step <= epsilon\n";
+		if (iteration >= max_iteration) std::cout << "warning: iteration >= max_iteration\n";
+		X[k] = x_old;
+		return f_value;
+	}
+
+	double fmax_coord(const std::vector<std::vector<double> > &A, const std::vector<double> &N, std::vector<double> &X, bool use_EM) {
+		double f_value = log_likelihood(A, N, X);
+		int iteration = 0;
+		while (true) {
+			if (verbose) {
+				std::cout << "iteration: " << iteration << " value: " << f_value << " X: ";
+				for (int i = 0; i < K; i++) std::cout << X[i] << ",";
+				std::cout << std::endl;
+			}
+			if (use_EM) {
+				std::vector<double> temp1(M, 0);
+				for (int j = 0; j < M; j++) {
+					for (int i = 0; i < K; i++) {
+						temp1[j] += A[i][j] * X[i];
+					}
+					if (temp1[j] < epsilon) temp1[j] = epsilon;
+				}
+				std::vector<double> X_old = X;
+				for (int i = 0; i < K; i++) {
+					double temp2 = 0, temp3 = 0;
+					for (int j = 0; j < M; j++) {
+						temp2 += A[i][j] * N[j] / temp1[j];
+						temp3 += A[i][j];
+					}
+					if (temp3 < epsilon) temp3 = epsilon;
+					X[i] = X_old[i] * temp2 / temp3;
+					if (X[i] < 1e-4) X[i] = 0;
+				}
+			} else { //coordinate wise descent
+				for (int i = 0; i < K; i++) {
+					fmax_coord(A, N, X, i);
+				}
+			}
+			double new_f_value = log_likelihood(A, N, X);
+			if (fabs(f_value - new_f_value) < epsilon) {
+				if (verbose) std::cout << "number of iteration: " << iteration << std::endl;
+				return new_f_value;
+			}
+			f_value = new_f_value;
+			if (iteration < max_iteration) iteration++; else break;
+		}
+		if (iteration >= max_iteration) std::cout << "warning: iteration >= max_iteration\n";
+		return f_value;
+	}
+};
+
+
+inline void test_solve_likelihood() {
+	isoform_opt iso_opt;
+	bool do_EM = false;
+	int M = 3;
+	int K = 2;
+	iso_opt.M = M;
+	iso_opt.K = K;
+	std::vector<double> N;
+	N.resize(M);
+	N[0] = 100;
+	N[1] = 700;
+	N[2] = 500;
+	std::vector<std::vector<double> > A;
+	A.resize(K);
+	A[0].resize(M);
+	A[0][0] = 1;
+	A[0][1] = 1;
+	A[0][2] = 0;
+	A[1].resize(M);
+	A[1][0] = 0;
+	A[1][1] = 1;
+	A[1][2] = 1;
+	iso_opt.verbose = true;
+	std::vector<double> X;
+	X.resize(2);
+	X[0] = 1;
+	X[1] = 1;
+	iso_opt.fmax_coord(A, N, X, do_EM);
+	std::cout << "X: " << X[0] << "\t" << X[1] << std::endl;
+	std::vector<std::vector<double> > H = iso_opt.hessian_log_likelihood(A, N, X);
+	std::cout << "H: ";
+	std::cout << "\t" << H[0][0] << "\t" << H[0][1] << std::endl;
+	std::cout << "\t" << H[1][0] << "\t" << H[1][1] << std::endl;	
+	std::vector<std::vector<double> > OF = iso_opt.obs_fisher(A, N, X);
+	std::cout << "OF: ";
+	std::cout << "\t" << OF[0][0] << "\t" << OF[0][1] << std::endl;
+	std::cout << "\t" << OF[1][0] << "\t" << OF[1][1] << std::endl;	
+	std::vector<std::vector<double> > F = iso_opt.fisher(A, N, X);
+	std::cout << "F: ";
+	std::cout << "\t" << F[0][0] << "\t" << F[0][1] << std::endl;
+	std::cout << "\t" << F[1][0] << "\t" << F[1][1] << std::endl;	
+/*		std::vector<std::vector<double> > IOF = inv_obs_fisher(X);
+	cout << "IOF: ";
+	cout << "\t" << IOF[0][0] << "\t" << IOF[0][1] << std::endl;
+	cout << "\t" << IOF[1][0] << "\t" << IOF[1][1] << std::endl;	
+	std::vector<std::vector<double> > IF = inv_fisher(X);
+	cout << "IF: ";
+	cout << "\t" << IF[0][0] << "\t" << IF[0][1] << std::endl;
+	cout << "\t" << IF[1][0] << "\t" << IF[1][1] << std::endl;	*/
+}
+
+inline void solve_likelihood(const std::vector<std::vector<double> > &rates, const std::vector<double> &counts, std::vector<double> &exp, bool use_EM = false) {
+	isoform_opt iso_opt;
+	iso_opt.M = (int)counts.size();
+	iso_opt.K = (int)rates.size();
+	exp.resize(iso_opt.K);
+	for (int i = 0; i < iso_opt.K; i++) {
+		exp[i] = 1;
+	}
+	iso_opt.fmax_coord(rates, counts, exp, use_EM);
+}
+
+inline void solve_likelihood_t(const std::vector<std::vector<double> > &rates, const std::vector<double> &counts, std::vector<double> &exp, bool use_EM = false) {
+	isoform_opt iso_opt;
+	iso_opt.M = (int)counts.size();
+	//__ASSERT(iso_opt.M > 0, "internal error: empty rates.\n");
+	iso_opt.K = (int)rates[0].size();
+	std::vector<std::vector<double> > A;
+	A.resize(iso_opt.K);
+	for (int i = 0; i < iso_opt.K; i++) A[i].resize(iso_opt.M);
+	for (int i = 0; i < iso_opt.M; i++) {
+		for (int j = 0; j < iso_opt.K; j++) {
+			A[j][i] = rates[i][j];
+		}
+	}
+	exp.resize(iso_opt.K);
+	for (int i = 0; i < iso_opt.K; i++) {
+		exp[i] = 1;
+	}
+	iso_opt.fmax_coord(A, counts, exp, use_EM);
+}
+
+
+#endif // POISSON_SOLVER_HPP
\ No newline at end of file
diff --git a/include/obsolete/affymetrix_utils.hpp b/include/obsolete/affymetrix_utils.hpp
new file mode 100755
index 0000000..a0aac10
--- /dev/null
+++ b/include/obsolete/affymetrix_utils.hpp
@@ -0,0 +1,62 @@
+
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=2 et sw=2 tw=80: */
+#include <fstream>
+#include <iostream>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/iostreams/device/file.hpp>
+#include <boost/iostreams/filtering_stream.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include <boost/filesystem.hpp>
+
+#include <seqan/sequence.h>
+
+namespace affymetrix {
+
+  namespace utils {
+
+    using std::string;
+    using std::vector;
+
+
+class ProbeTarget{
+public:
+friend std::ostream& operator<< (std::ostream&, const ProbeTarget&);
+ProbeTarget( const string& _geneName, const seqan::DnaString& _probeSeq );
+ProbeTarget();
+  std::string geneName;
+  seqan::DnaString probeSeq;
+};
+
+    typedef std::unordered_map<string, ProbeTarget> StringMapT;
+    typedef std::unordered_set<string> StringSetT;
+
+    /**
+     * Given a file containing the probe information, build a reverse mapping from
+     * probe name to ENSEMBL gene name.  Any probe mapping to > 1 gene will be discarded.
+     */
+    StringMapT ComputeReverseMap( const string& fname );
+
+    /**
+     * Given a file containing a list of strings, return a vector of corresponding
+     * paths.  This is the list of probe files to be read.
+     */
+    vector< boost::filesystem::path > GetProbeFiles( const string& fname );
+
+    class AffyEntry {
+    public:
+      AffyEntry( const string& _probeName, float _expression, float _background );
+
+      std::string probeName;
+      float expression;
+      float background;
+    };
+
+  }
+
+}
diff --git a/include/obsolete/graph_utils.hpp b/include/obsolete/graph_utils.hpp
new file mode 100755
index 0000000..5425a84
--- /dev/null
+++ b/include/obsolete/graph_utils.hpp
@@ -0,0 +1,82 @@
+#ifndef GRAPH_UTILS_HPP
+#define GRAPH_UTILS_HPP
+
+#include <type_traits>
+#include <boost/graph/adjacency_list.hpp>
+
+#include "mlrmcl/metis.h"
+
+namespace utils {
+	namespace graph {
+	
+	template <typename GraphT>//, typename IndexT, typename WeightT>
+	bool boostToMetis( GraphT& G, graphdef* metisG) {
+
+		std::cerr << "calling InitGraph\n";
+	    // Initalize the graph structure
+		InitGraph(metisG);
+		std::cerr << "done\n";
+
+  	    // Check if this graph has weighted edges or not
+		bool hasEdgeWeights = !(std::is_same< typename GraphT::edge_property_type, typename boost::no_property >::value);
+	    // Number of vertices in this graph
+		auto numVerts = metisG->nvtxs = boost::num_vertices(G);
+
+	    // Allocate the arrays for the vertex adj. offsets and adj. structure
+		metisG->xadj = new idxtype[numVerts+1]; metisG->xadj[0] = 0;
+		metisG->adjncy = new idxtype[ 2 * boost::num_edges(G) + numVerts ];
+
+	    // If the original graph has edge weights, then allocate the memory for the
+	    // corresponding array.
+		if ( hasEdgeWeights ) {
+			metisG->adjwgt = new idxtype[ 2 * boost::num_edges(G) + numVerts];
+		}
+
+		size_t edgeNum{0};
+	    
+	    // For each vertex in order
+		for ( size_t i = 0; i < numVerts; ++i ) {
+			bool selfLoop{false};
+
+			// Keep track of the maximum edge weight (this is what
+			// we'll set the self-loop edge weight to).
+			idxtype maxWeight = std::numeric_limits<idxtype>::min();
+
+		    // Iterate over all edges adjacent to vertex i
+			auto se = boost::out_edges(i,G);
+			for ( auto& e = se.first; e != se.second; ++e ) {
+				auto tgt = boost::target(*e,G);
+				if ( tgt == i ) { selfLoop = true; }
+
+                // Output the pair (target, weight)
+				metisG->adjncy[edgeNum] = tgt;
+				if ( hasEdgeWeights ) { 
+					auto weight = G[*e].weight; 
+					metisG->adjwgt[edgeNum] = weight;
+					maxWeight = (weight > maxWeight) ? weight : maxWeight;
+				}
+				++edgeNum;
+			} 
+
+		    // Ensure self-loops
+			if ( !selfLoop ) { 
+				metisG->adjncy[edgeNum] = i; 
+				if ( hasEdgeWeights ) { metisG->adjwgt[edgeNum] = maxWeight; }
+				++edgeNum; 
+			}
+
+			if ( i < numVerts - 1 ) { metisG->xadj[i+1] = edgeNum; }
+		}
+	    // # of edges
+
+		// Apparently, this is necessary
+		metisG->xadj[metisG->nvtxs] = edgeNum;
+
+		metisG->nedges = edgeNum;
+		return true;
+	} // end of boostToMetis
+
+	} // end of graph
+} // end of utils
+
+#endif // GRAPH_UTILS_HPP
\ No newline at end of file
diff --git a/include/obsolete/linq.h b/include/obsolete/linq.h
new file mode 100755
index 0000000..719bdd0
--- /dev/null
+++ b/include/obsolete/linq.h
@@ -0,0 +1,473 @@
+/*=============================================================================
+    Copyright (c) 2012 Paul Fultz II
+    linq.h
+    Distributed under the Boost Software License, Version 1.0. (See accompanying
+    file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+==============================================================================*/
+
+#ifndef INCLUDE_GUARD_LINQ_H
+#define INCLUDE_GUARD_LINQ_H
+
+#include <utility>
+#include <boost/optional.hpp>
+#include <boost/preprocessor.hpp>
+#include <boost/preprocessor/facilities/is_empty.hpp>
+#include <boost/range.hpp>
+#include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+
+//
+//
+// Some preprocessor utilites
+//
+//
+
+//
+// LINQ_RETURNS for auto return type deduction.
+//
+#define LINQ_RETURNS(...) -> decltype(__VA_ARGS__) { return (__VA_ARGS__); }
+
+//
+// LINQ_IS_PAREN is used to detect if the first token is a parenthesis.
+// It expands to 1 if it is, otherwise it expands to 0.
+//
+#define LINQ_IS_PAREN(x) LINQ_IS_PAREN_CHECK(LINQ_IS_PAREN_PROBE x)
+#define LINQ_IS_PAREN_CHECK(...) LINQ_IS_PAREN_CHECK_N(__VA_ARGS__,0)
+#define LINQ_IS_PAREN_PROBE(...) ~, 1,
+#ifndef _MSC_VER
+#define LINQ_IS_PAREN_CHECK_N(x, n, ...) n
+#else
+// MSVC workarounds
+#define LINQ_IS_PAREN_CHECK_RES(x) x
+#define LINQ_IS_PAREN_CHECK_II(x, n, ...) n
+#define LINQ_IS_PAREN_CHECK_I(x) LINQ_IS_PAREN_CHECK_RES(LINQ_IS_PAREN_CHECK_II x)
+#define LINQ_IS_PAREN_CHECK_N(...) LINQ_IS_PAREN_CHECK_I((__VA_ARGS__))
+#endif
+
+//
+// LINQ_IS_EMPTY will expands to 1 if the parameter is empty, otherwise
+// it expands to 0. This will work even if the parameter given is a set
+// of parenthesis.
+//
+#define LINQ_IS_EMPTY(x) BOOST_PP_CAT(LINQ_IS_EMPTY_, LINQ_IS_PAREN(x))(x)
+#define LINQ_IS_EMPTY_0(x) BOOST_PP_IS_EMPTY(x)
+#define LINQ_IS_EMPTY_1(x) 0
+
+//
+// LINQ_HEAD retrieves the first element of a sequence.
+// Example:
+//
+// LINQ_HEAD((1)(2)(3)) // Expands to (1)
+//
+#define LINQ_HEAD(x) LINQ_PICK_HEAD(LINQ_MARK x)
+#define LINQ_MARK(...) (__VA_ARGS__),
+#define LINQ_PICK_HEAD(...) LINQ_PICK_HEAD_I(__VA_ARGS__,)
+#ifndef _MSC_VER
+#define LINQ_PICK_HEAD_I(x, ...) x
+#else
+// MSVC workarounds
+#define LINQ_PICK_HEAD_III(x, ...) x
+#define LINQ_PICK_HEAD_II(x) LINQ_PICK_HEAD_III x
+#define LINQ_PICK_HEAD_I(...) LINQ_PICK_HEAD_II((__VA_ARGS__))
+#endif
+
+
+//
+// LINQ_TAIL retrieves the tail of a sequence.
+// Example:
+//
+// LINQ_TAIL((1)(2)(3)) // Expands to (2)(3)
+//
+#define LINQ_TAIL(x) LINQ_EAT x
+// Various utilities
+#define LINQ_EAT(...)
+#define LINQ_REM(...) __VA_ARGS__
+#define LINQ_EXPAND(...) __VA_ARGS__
+
+//
+// LINQ_BACK gets the last element of a sequence
+//
+#define LINQ_BACK(seq) BOOST_PP_SEQ_ELEM(BOOST_PP_DEC(BOOST_PP_SEQ_SIZE(seq)), seq)
+
+//
+// LINQ_KEYWORD transforms the keyword. Keywords are generally defined
+// like this:
+//
+// //This defines a `my_keyword` macro
+// //NOTE: The space between the keyword and parenthesis is necessary
+// #define LINQ_KEYWORD_my_keyword (MY_KEYWORD_MACRO)
+//
+// Here is an example:
+//
+// LINQ_KEYWORD(my_keyword foo) // Expands to (MY_KEYWORD_MACRO) foo
+//
+#define LINQ_KEYWORD(x) BOOST_PP_CAT(LINQ_KEYWORD_, x)
+
+//
+// LINQ_IS_KEYWORD will expand to 1 if the first token is a valid keyword
+//
+#define LINQ_IS_KEYWORD(x) LINQ_IS_PAREN(LINQ_KEYWORD(x))
+
+//
+// LINQ_PLACE retrieves whats placed in parenthesis. In essence it does
+// this:
+//
+// LINQ_PLACE((1) foo) //Expands to 1
+//
+// But when its used in the contexts of keywords, it will retrieve whats
+// been defined inside the parenthesis of a keyword. Heres an example:
+//
+// //This defines a `my_keyword` macro
+// //NOTE: The space between the keyword and parenthesis is necessary
+// #define LINQ_KEYWORD_my_keyword (MY_KEYWORD_MACRO)
+// LINQ_PLACE(LINQ_KEYWORD(my_keyword foo)) // Expands to MY_KEYWORD_MACRO
+//
+#define LINQ_PLACE(x) LINQ_EXPAND(LINQ_REM LINQ_PICK_HEAD(LINQ_MARK x))
+
+//
+// LINQ_TO_SEQ converts the keywords into a preprocessor sequence
+//
+#define LINQ_TO_SEQ(x) LINQ_TO_SEQ_WHILE_M \
+( \
+BOOST_PP_WHILE(LINQ_TO_SEQ_WHILE_P, LINQ_TO_SEQ_WHILE_O, (,x)) \
+)
+
+#define LINQ_TO_SEQ_WHILE_P(r, state) LINQ_TO_SEQ_P state
+#define LINQ_TO_SEQ_WHILE_O(r, state) LINQ_TO_SEQ_O state
+#define LINQ_TO_SEQ_WHILE_M(state) LINQ_TO_SEQ_M state
+
+#define LINQ_TO_SEQ_P(prev, tail) BOOST_PP_NOT(LINQ_IS_EMPTY(tail))
+#define LINQ_TO_SEQ_O(prev, tail) \
+BOOST_PP_IF(LINQ_IS_PAREN(tail), \
+    LINQ_TO_SEQ_PAREN, \
+    LINQ_TO_SEQ_KEYWORD \
+    )(prev, tail)
+#define LINQ_TO_SEQ_PAREN(prev, tail) \
+(prev (LINQ_HEAD(tail)), LINQ_TAIL(tail))
+
+#define LINQ_TO_SEQ_KEYWORD(prev, tail) \
+LINQ_TO_SEQ_REPLACE(prev, LINQ_KEYWORD(tail))
+
+#define LINQ_TO_SEQ_REPLACE(prev, tail) \
+(prev LINQ_HEAD(tail), LINQ_TAIL(tail))
+
+#define LINQ_TO_SEQ_M(prev, tail) prev
+
+
+//
+// LINQ_SEQ_TO_STRING convert a sequence back to a string of tokens
+//
+#define LINQ_SEQ_TO_STRING(seq) BOOST_PP_SEQ_FOR_EACH(LINQ_SEQ_TO_STRING_EACH, ~, seq)
+#define LINQ_SEQ_TO_STRING_EACH(r, data, x) x
+
+//
+// LINQ_SEQ_SPLIT
+//
+#define LINQ_SEQ_SPLIT(seq, pred, data) LINQ_SEQ_SPLIT_FOLD_LEFT_M(BOOST_PP_SEQ_FOLD_LEFT(LINQ_SEQ_SPLIT_FOLD_LEFT_O, (pred, data,,), seq))
+#define LINQ_SEQ_SPLIT_FOLD_LEFT_O(s, state, x) LINQ_SEQ_SPLIT_FOLD_LEFT_INVOKE((s, x, LINQ_REM state))
+#ifndef _MSC_VER
+#define LINQ_SEQ_SPLIT_FOLD_LEFT_INVOKE(x) LINQ_SEQ_SPLIT_OP x
+#else
+#define LINQ_SEQ_SPLIT_FOLD_LEFT_INVOKE(x) LINQ_SEQ_SPLIT_FOLD_LEFT_INVOKE_I x
+#define LINQ_SEQ_SPLIT_FOLD_LEFT_INVOKE_I(...) LINQ_SEQ_SPLIT_FOLD_LEFT_INVOKE_II((__VA_ARGS__))
+#define LINQ_SEQ_SPLIT_FOLD_LEFT_INVOKE_II(x) LINQ_SEQ_SPLIT_OP x
+#endif
+#define LINQ_SEQ_SPLIT_OP(s, x, pred, data, seq, elem) BOOST_PP_IF(pred(s, data, x), LINQ_SEQ_SPLIT_OP_TRUE, LINQ_SEQ_SPLIT_OP_FALSE)(x, pred, data, seq, elem)
+#define LINQ_SEQ_SPLIT_OP_TRUE(x, pred, data, seq, elem) BOOST_PP_IIF(LINQ_IS_PAREN(elem), \
+                                                                    (pred, data, seq(elem),),\
+                                                                    (pred, data, seq,) )
+#define LINQ_SEQ_SPLIT_OP_FALSE(x, pred, data, seq, elem) (pred, data, seq, elem (x))
+#ifndef _MSC_VER
+#define LINQ_SEQ_SPLIT_FOLD_LEFT_M(x) LINQ_SEQ_SPLIT_M x
+#else
+#define LINQ_SEQ_SPLIT_FOLD_LEFT_M_X(x) x
+#define LINQ_SEQ_SPLIT_FOLD_LEFT_M_N(x) LINQ_SEQ_SPLIT_FOLD_LEFT_M_X(LINQ_SEQ_SPLIT_FOLD_LEFT_M_X(LINQ_SEQ_SPLIT_FOLD_LEFT_M_X(LINQ_SEQ_SPLIT_FOLD_LEFT_M_X(x))))
+#define LINQ_SEQ_SPLIT_FOLD_LEFT_M(x) LINQ_SEQ_SPLIT_FOLD_LEFT_M_N(LINQ_SEQ_SPLIT_M x)
+#endif
+#define LINQ_SEQ_SPLIT_M(pred, data, seq, elem) seq BOOST_PP_IIF(LINQ_IS_PAREN(elem), (elem),)
+
+//
+// LINQ_SEQ_NEST
+//
+#define LINQ_SEQ_NEST(seq) BOOST_PP_SEQ_FOLD_LEFT(LINQ_SEQ_NEST_OP, LINQ_BACK(seq) , BOOST_PP_SEQ_POP_BACK(seq))
+#define LINQ_SEQ_NEST_OP(s, state, x) x(state)
+
+//
+// LINQ_SEQ_NEST_REVERSE
+//
+#define LINQ_SEQ_NEST_REVERSE(seq) BOOST_PP_SEQ_FOLD_RIGHT(LINQ_SEQ_NEST_OP, LINQ_BACK(seq) , BOOST_PP_SEQ_POP_BACK(seq))
+
+// TODO: LINQ_SEQ_TRANSFORM_W
+
+
+namespace linq {
+
+// MSVC 2010 doesn't provide declval
+// We also return T&& instead std::add_rvalue_reference<T>
+// because MSVC has a buggy implementation of it.
+// So, this function will work in all cases except when T
+// is void(which should rarely happen).
+template <typename T>
+T&& declval(); // no definition required
+
+// Lambdas aren't very nice, so we use this wrapper to make them play nicer. This
+// will make the function_object default constructible, even if it doesn't have a
+// default constructor. This helpful since these function objects are being used
+// inside of iterators.
+template<class Fun>
+struct function_object
+{
+    boost::optional<Fun> f;
+
+    function_object()
+    {}
+    function_object(Fun f): f(f)
+    {}
+
+    function_object(const function_object & rhs) : f(rhs.f)
+    {}
+
+    // Assignment operator is just a copy construction, which does not provide
+    // the strong exception guarentee.
+    function_object& operator=(const function_object& rhs)
+    {
+        if (this != &rhs)
+        {
+            this->~function_object();
+            new (this) function_object(rhs);
+        }
+        return *this;
+    }
+
+    template<class F>
+    struct result
+    {};
+
+    template<class F, class T>
+    struct result<F(T)>
+    {
+        typedef decltype(linq::declval<Fun>()(linq::declval<T>())) type;
+    };
+
+    template<class T>
+    auto operator()(T && x) const LINQ_RETURNS((*f)(std::forward<T>(x)))
+
+    template<class T>
+    auto operator()(T && x) LINQ_RETURNS((*f)(std::forward<T>(x)))
+};
+
+template<class F>
+function_object<F> make_function_object(F f)
+{
+    return function_object<F>(f);
+}
+
+
+// bind_iterator
+template<class OuterIterator, class Selector, class SelectorRange = typename std::result_of<Selector(typename boost::iterator_reference<OuterIterator>::type)>::type>
+struct bind_iterator
+: boost::iterator_facade
+<
+    bind_iterator<OuterIterator, Selector, SelectorRange>,
+    typename boost::range_value<SelectorRange >::type,
+    boost::forward_traversal_tag,
+    typename boost::range_reference<SelectorRange >::type
+>
+{
+    typedef typename boost::range_iterator<SelectorRange >::type InnerIteraror;
+
+    Selector selector;
+    OuterIterator iterator;
+    InnerIteraror inner_first;
+    InnerIteraror inner_last;
+    OuterIterator last;
+
+    bind_iterator(Selector selector, OuterIterator iterator, OuterIterator last) : selector(selector), iterator(iterator), last(last)
+    {
+        this->select();
+    }
+
+    void select()
+    {
+        for(;iterator!=last;iterator++)
+        {
+            if (inner_first==inner_last)
+            {
+                auto&& r = selector(*iterator);
+                inner_first = boost::begin(r);
+                inner_last = boost::end(r);
+            }
+            else inner_first++;
+            for(;inner_first!=inner_last;inner_first++)
+                return;
+        }
+    }
+
+    void increment()
+    {
+        this->select();
+    }
+
+    bool equal(const bind_iterator& other) const
+    {
+        return this->iterator == other.iterator;
+    }
+
+    typename boost::range_reference<SelectorRange >::type dereference() const
+    {
+        return *inner_first;
+    }
+
+};
+
+template<class Iterator, class Selector>
+bind_iterator<Iterator, Selector> make_bind_iterator(Selector selector, Iterator iterator, Iterator last)
+{
+    return bind_iterator<Iterator, Selector>(selector, iterator, last);
+}
+
+template<class Range, class Selector>
+auto bind_range(Range && r, Selector s) LINQ_RETURNS
+(
+    boost::make_iterator_range
+    (
+        make_bind_iterator(s, boost::begin(r), boost::end(r)),
+        make_bind_iterator(s, boost::end(r), boost::end(r))
+    )
+)
+
+// Bound range adaptor
+namespace detail {
+template<class Selector>
+struct bound_t
+{
+    Selector s;
+    bound_t(Selector s) : s(s)
+    {
+    }
+
+    template<class Range>
+    friend auto operator|(Range && r, const bound_t self) -> decltype(linq::bind_range(std::forward<Range>(r), linq::declval<Selector>()))
+    {
+        return linq::bind_range(std::forward<Range>(r), self.s);
+    }
+
+};
+}
+
+template<class Selector>
+detail::bound_t<Selector> bound(Selector s)
+{
+    return detail::bound_t<Selector>(s);
+}
+
+
+
+namespace detail {
+
+// These are used to avoid using parenthessis with lambdas, since it
+// could perhaps result in a pathological case for the preprocessor.
+struct where_t
+{
+    template<class Pred>
+    auto operator+(Pred p) LINQ_RETURNS(boost::adaptors::filtered(linq::make_function_object(p)))
+};
+
+struct select_t
+{
+    template<class Tran>
+    auto operator+(Tran t) LINQ_RETURNS(boost::adaptors::transformed(linq::make_function_object(t)))
+};
+
+struct select_many_t
+{
+    template<class Sel>
+    auto operator+(Sel s) LINQ_RETURNS(linq::bound(make_function_object(s)))
+};
+}
+
+detail::select_many_t select_many = {};
+detail::select_t select = {};
+detail::where_t where = {};
+
+}
+
+// Helps in defining lambdas in two steps. It also dedcues the type held
+// by the container.
+#define LINQ_LAMBDA_BLOCK(...) { return (__VA_ARGS__); }
+#define LINQ_LAMBDA_HEADER(var, col) [&](decltype( *(boost::begin(col)) ) var)
+
+// These macros help in defining the clauses. So instead of writing this:
+//
+// auto r = numbers | boost::adaptors::filtered([](int i) { return i > 2});
+//
+// This can be written:
+//
+// auto r = numbers | LINQ_WHERE(i, numbers)(i > 2);
+//
+#define LINQ_WHERE(var, col) linq::where + LINQ_LAMBDA_HEADER(var, col) LINQ_LAMBDA_BLOCK
+#define LINQ_SELECT(var, col) linq::select + LINQ_LAMBDA_HEADER(var, col) LINQ_LAMBDA_BLOCK
+#define LINQ_SELECT_MANY(var, col) linq::select_many + LINQ_LAMBDA_HEADER(var, col) LINQ_LAMBDA_BLOCK
+
+//
+// Keywords used by linq
+//
+#define LINQ_KEYWORD_from ()
+#define LINQ_KEYWORD_select (LINQ_SELECT)
+#define LINQ_KEYWORD_where (LINQ_WHERE)
+
+//
+// Process the sequence
+//
+
+// Expands the parameter, for another preprocessor scan
+#define LINQ_X(...) __VA_ARGS__
+
+#define LINQ_PROCESS_PAREN(data, x) x
+#ifndef _MSC_VER
+#define LINQ_PROCESS_KEYWORD(data, x) | x data
+#else
+// MSVC Workarounds
+#define LINQ_PROCESS_KEYWORD_RES(x) x
+#define LINQ_PROCESS_KEYWORD(data, x) | LINQ_PROCESS_KEYWORD_RES(x data)
+#endif
+
+#define LINQ_COL(var, col) col
+
+// Process the select, where clauses
+#define LINQ_SELECT_WHERE(seq) LINQ_SELECT_WHERE_TRANSFORM(BOOST_PP_SEQ_ELEM(0, seq) ,BOOST_PP_SEQ_REST_N(1, seq))
+#define LINQ_SELECT_WHERE_TRANSFORM(data, seq) LINQ_COL data LINQ_SEQ_TO_STRING(BOOST_PP_SEQ_TRANSFORM(LINQ_SELECT_WHERE_O, data, seq))
+#define LINQ_SELECT_WHERE_O(s, data, x) BOOST_PP_IF(LINQ_IS_PAREN(x), LINQ_PROCESS_PAREN, LINQ_PROCESS_KEYWORD)(data, x)
+
+// Process from clauses
+// ()((x, col))()((y, x.col))(LINQ_SELECT)((x))
+// SPLIT
+// ( ((x, col)) ) ( ((y, x.col))(LINQ_SELECT)((x)) )
+// TRANSFORM
+// (SELECT_MANY(x, col))((y, x.col)(LINQ_SELECT)(x))
+// NEST
+#define LINQ_FROM(seq) LINQ_FROM_CHECK(LINQ_SEQ_SPLIT(seq, LINQ_FROM_P, data))
+#define LINQ_FROM_CHECK(seq) BOOST_PP_IF(BOOST_PP_DEC(BOOST_PP_SEQ_SIZE(seq)), LINQ_FROM_MULTI, LINQ_FROM_SINGLE)(seq)
+#define LINQ_FROM_MULTI(seq) LINQ_SEQ_NEST(LINQ_FROM_TRANSFORM(seq))
+#define LINQ_FROM_SINGLE(seq) LINQ_SELECT_WHERE(BOOST_PP_SEQ_HEAD(seq))
+
+
+#define LINQ_FROM_TRANSFORM(seq) BOOST_PP_SEQ_TRANSFORM(LINQ_FROM_OP, data, BOOST_PP_SEQ_POP_BACK(seq)) (LINQ_SELECT_WHERE(LINQ_BACK(seq)))
+
+#define LINQ_FROM_P(s, data, x) LINQ_IS_EMPTY(x)
+#ifndef _MSC_VER
+#define LINQ_FROM_OP(s, data, x) LINQ_PROCESS_FROM LINQ_REM x
+#else
+// MSVC Workarounds
+#define LINQ_FROM_OP(s, data, x) LINQ_FROM_OP_INVOKE(LINQ_REM x)
+#define LINQ_FROM_OP_INVOKE_X(x) x
+#define LINQ_FROM_OP_INVOKE(x) LINQ_FROM_OP_INVOKE_X(LINQ_PROCESS_FROM x)
+#endif
+#define LINQ_PROCESS_FROM(var, col) col | LINQ_SELECT_MANY(var, col)
+
+// Transforms the sequence
+#define LINQ_TRANSFORM(seq) LINQ_X(LINQ_FROM(seq))
+// And finally the LINQ macro
+#define LINQ(x) LINQ_TRANSFORM(LINQ_TO_SEQ(x))
+
+#endif
\ No newline at end of file
diff --git a/include/obsolete/set_multicover_solver.hpp b/include/obsolete/set_multicover_solver.hpp
new file mode 100755
index 0000000..94ba3b6
--- /dev/null
+++ b/include/obsolete/set_multicover_solver.hpp
@@ -0,0 +1,492 @@
+#ifndef SET_MULTICOVER_SOLVER_HPP
+#define SET_MULTICOVER_SOLVER_HPP
+
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+#include <unordered_set>
+
+/** Boost Includes */
+#include <boost/program_options.hpp>
+#include <boost/progress.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <boost/heap/fibonacci_heap.hpp>
+
+#include <jellyfish/sequence_parser.hpp>
+#include <jellyfish/parse_read.hpp>
+#include <jellyfish/mer_counting.hpp>
+#include <jellyfish/misc.hpp>
+#include <jellyfish/compacted_hash.hpp>
+
+template <typename ReadHashT, typename TranscriptHashT>
+class SetMulticoverSolver {
+
+    private:
+        /**
+        * Typedefs
+        */
+        // A KmerMapT is a map from a kmer (encoded as an integer) to the set
+        // of transcripts where it occurs
+        typedef uint32_t TranscriptIDT;
+        typedef uint64_t KmerIDT;
+        typedef std::unordered_map< KmerIDT, std::unordered_set<TranscriptIDT> > KmerMapT;
+        typedef std::tuple<TranscriptIDT, std::vector<KmerIDT>> TranscriptKmerSet;
+        typedef std::string* StringPtrT;
+        typedef uint64_t TranscriptScoreT;
+        typedef jellyfish::invertible_hash::array<uint64_t, atomic::gcc, allocators::mmap> HashArrayT;
+        // Necessary forward declaration
+        struct TranscriptDataT;
+        typedef std::tuple<TranscriptScoreT,TranscriptIDT> HeapPair;
+        typedef typename boost::heap::fibonacci_heap<HeapPair>::handle_type HandleT;
+
+        size_t _merLen;
+        ReadHashT& _readHash;
+        TranscriptHashT& _transcriptHash;
+        std::unordered_map<KmerIDT, size_t> _merHash;
+
+        struct TranscriptDataT {
+            uint32_t id;
+            uint32_t numTimesUsed;
+            HandleT heapHandle;
+            StringPtrT header;
+            std::unordered_set<uint64_t> binMers;
+            bool active;
+            double cost;
+            double benefit;
+        };
+    
+        // This struct represents a "job" (transcript) that needs to be processed 
+        struct TranscriptJob{
+        	StringPtrT header;
+        	StringPtrT seq;
+        	uint32_t id;
+        };
+
+        struct TranscriptResult {
+        	TranscriptDataT* data;
+        	TranscriptKmerSet* ks;
+        };
+
+        struct BinmerUpdates {
+        	std::vector<KmerIDT> zeroedBinmers;
+        	std::vector<KmerIDT> updatedBinmers;
+        };
+
+        // The number of occurences above whcih a kmer is considered promiscuous
+        size_t _promiscuousKmerCutoff{10};
+
+        // Map each kmer to the set of transcripts it occurs in
+        KmerMapT _transcriptsForKmer;
+
+        // The actual data for each transcript
+        std::vector<TranscriptDataT*> _transcripts;
+
+        // Is a given kmer considered part of the universe?
+        inline bool _considered( const char* mer ) {
+        	// The kmer is only considered if it exists in the transcript set 
+        	// (i.e. it's possible to cover) and it's less prmiscuous than the
+        	// cutoff.
+        	return ( _readHash[mer] > 0 and 
+        		     _transcriptHash[mer] > 0 and 
+        		     _transcriptHash[mer] < _promiscuousKmerCutoff );
+        }
+
+        // Process the transcript
+        BinmerUpdates _processTranscript( uint32_t transcriptID, uint64_t &rem, size_t& numTimesUsed ) {
+        	auto transcript = _transcripts[transcriptID];
+        	std::unordered_map<uint64_t, std::unordered_set<uint64_t>> binMerCounts;
+        	size_t minCount = std::numeric_limits<size_t>::max();
+        	// Both the zeroed kmers and all affected kmers start out empty
+        	BinmerUpdates updates{ {}, {} };
+
+            // loop over all kmers (remember, only considered kmers are stored here)
+            for ( auto binMer : transcript->binMers ) {
+            	// See if this kmer is still active
+            	auto merIt = _merHash.find(binMer);
+            	// If it is, then
+            	if ( merIt != _merHash.end() ) {
+            		auto currentCount = merIt->second;
+            		// Is it the minimum count kmer?
+            		minCount = (currentCount < minCount) ? currentCount : minCount;
+            		// Insert it into the map with it's current count
+            		binMerCounts[currentCount].insert(binMer);
+            	}
+            }
+
+            // The number of kmers we're getting rid of
+            rem -= binMerCounts[minCount].size();
+
+            // For every kmer in this transcript
+            for ( auto countMer : binMerCounts ) {
+            	auto origCount = countMer.first;
+            	// Get rid of minCount copies of it
+            	for ( auto binMer : countMer.second ) {
+            		updates.updatedBinmers.push_back(binMer);
+            		_merHash[binMer] -= minCount;
+            		// If this is a kmer with minimum count, then get rid of it
+            		if ( origCount == minCount ) {
+            			_merHash.erase(binMer);
+            			updates.zeroedBinmers.push_back(binMer);
+            		}
+            	}
+            }
+            
+            numTimesUsed = minCount;
+            return updates;
+        }
+
+        void _createUniverse( const std::string& transcriptFile, uint64_t& numRequiredCover ) {
+
+        	// open up the transcript file for reading
+            const char *fnames[] = { transcriptFile.c_str() };
+            jellyfish::parse_read parser( fnames, fnames + 1, 100);
+            jellyfish::parse_read::thread stream = parser.new_thread();
+
+            // Read pointer
+            jellyfish::read_parser::read_t *read;
+
+            // So we can concisely identify each transcript
+            uint32_t transcriptIndex {0};
+
+            // Holds the tasks to be processed
+            boost::lockfree::fifo< TranscriptJob * > workQueue;
+            // Holds the processed transcript data
+            boost::lockfree::fifo< TranscriptDataT * > processedReadQueue;
+            // Holds results that we will use to update the index map
+            boost::lockfree::fifo< TranscriptKmerSet * > q;
+
+            size_t numActors = 26;
+            boost::threadpool::pool tp(numActors);
+
+            auto readResults = [&q, &tp, this] () -> void {
+                size_t numRes = 0;
+                // While tasks remain to be processed
+                while (tp.active() + tp.pending() > 1) {
+                    TranscriptKmerSet *ks;
+                    // Process existing results
+                    while ( q.dequeue(ks) ) {
+
+                        uint32_t transcriptID = std::get<0>(*ks);
+                        // For each kmer in this transcrpt, add this
+                        // transcript to all kmers it contains in the map
+                        for ( uint64_t kmer : std::get<1>(*ks) ) {
+                            this->_transcriptsForKmer[kmer].insert(transcriptID);
+                        }
+
+                        // no memory leaks!
+                        delete ks;
+                        ++numRes;
+                        std::cerr << "#res = " << numRes << "\n";
+                    }
+                    boost::this_thread::sleep_for(boost::chrono::milliseconds(250));
+                }
+            };
+
+            tp.schedule(readResults);
+
+            bool done {false};
+            std::atomic<size_t> numJobs {0};
+
+            for (size_t i = 0; i < numActors - 1; ++i) {
+
+                auto task = [&workQueue, &q, &done, &processedReadQueue, &numJobs, this]() -> void {
+                    TranscriptJob *tj;
+                    while ( !done || numJobs > 0 ) {
+                        while (workQueue.dequeue(tj) ) {
+                            auto seq = tj->seq;
+                            auto newEnd  = std::remove( seq->begin(), seq->end(), '\n' );
+                            auto readLen = std::distance( seq->begin(), newEnd );
+                            StringPtrT newSeq = new std::string( seq->begin(), seq->begin() + readLen); 
+                            auto transcriptIndex = tj->id;
+
+                            // The set of kmers in this transcript
+                            auto ts = new TranscriptKmerSet {transcriptIndex, {}};
+                            
+                            size_t offset {0};
+                            // We know how many kmers there will be, so allocate the space
+                            size_t numKmers {readLen - this->_merLen+1};
+                            std::get<1>(*ts).reserve(numKmers);
+                            // The binary representation of the kmers in this transcript
+                            std::unordered_set<uint64_t> binMers;
+                            //binMers.reserve(numKmers);
+
+                            // Iterate over the kmers
+                            while ( offset < numKmers )  {
+                                auto mer = newSeq->substr( offset, this->_merLen );
+                                // only count and track kmers which should be considered
+                                if ( this->_considered(mer.c_str()) ) {
+                                    auto binMer = jellyfish::parse_dna::mer_string_to_binary( mer.c_str(), _merLen );
+                                    //binMers.push_back(binMer);
+                                    binMers.insert(binMer);
+                                    std::get<1>(*ts).push_back(binMer);
+                                }
+
+                                ++offset;
+                            }
+
+                            // Create the processed read
+                            auto procRead = new TranscriptDataT {
+                            	transcriptIndex,
+                            	0, // Number of times used in the cover
+                            	HandleT(0), // The priority heap handle, we don't know it yet
+                            	tj->header, // header to idenfity this transcript in the ouptut
+                            	binMers, // the set of kmers
+                            	false, // not initially active
+                            	binMers.size(), // the cost is the number of considered kmers
+                            	0 // we don't know the initial benefit
+                            };
+
+                            // Push it onto the processed read queue
+                            processedReadQueue.enqueue(procRead);
+                            // Trnascript set enqueue
+                            q.enqueue(ts);
+
+                            // Free what we no longer need
+                            delete tj->seq;
+                            delete tj;
+                            --numJobs;
+                        }
+                        // if we're not done, but there is no work, then sleep a bit
+                        boost::this_thread::sleep_for(boost::chrono::milliseconds(250));
+                    }
+
+                };
+
+                tp.schedule(task);
+            }
+
+
+            // Read the input data
+            while ( (read = stream.next_read()) ) {
+
+                StringPtrT header = new std::string( read->header, read->hlen );
+                StringPtrT oseq = new std::string(read->seq_s, std::distance(read->seq_s, read->seq_e) - 1 );
+                // Create a job to process this read and throw it on the work queue
+                auto job = new TranscriptJob { header, oseq, transcriptIndex };
+                workQueue.enqueue(job);
+                ++transcriptIndex;
+                ++numJobs;
+            } // end while
+            done = true;
+
+            // Wait for all reads to be processed
+            tp.wait();
+
+            // Now we know how many transrcipts there will be
+            _transcripts.resize(transcriptIndex, nullptr);
+
+            // Pop each transcript off the queue and put it in the right
+            // index in the transcript array
+            TranscriptDataT* td;
+            while ( processedReadQueue.dequeue(td) ) {
+            	auto id = td->id;
+            	_transcripts[id] = td;
+            }
+
+            // iterate over the jellyfish hash and count the number of occurences of
+            // each kmer that appears in the set of reads
+            auto it = _transcriptHash.iterator_all();
+            while ( it.next() ) {
+            	auto mer = it.get_dna_str();
+            	auto binMer = jellyfish::parse_dna::mer_string_to_binary(mer, _merLen);
+            	auto count = _readHash[binMer];//it.get_val();
+            	// we only care about considered kmers
+            	if ( _considered(mer) ) {
+            		_merHash[binMer] = count;//.map( binMer, count );
+            		numRequiredCover += 1;
+            	}
+            }
+        }
+
+    public:
+    	/**
+    	* Construct the solver with the read and transcript hashes
+    	*/
+    	SetMulticoverSolver( ReadHashT& readHash, TranscriptHashT&  transcriptHash ) : 
+    	_readHash(readHash), _transcriptHash(transcriptHash), _merLen(readHash.get_mer_len()),
+    	_merHash( std::unordered_map<KmerIDT,size_t>() ) {}
+    		/*
+    	_merHash( HashArrayT(transcriptHash.get_size(), // Size of hash. Will be rounded up to a power of 2
+                     2*_merLen,      // Key length in bits (2 * mer length)
+                     30,      // Value length in bits
+                     126,     // Max # of reprobe
+                     jellyfish::quadratic_reprobes // Reprobe policy
+                     ) ) {}
+	    */
+
+        bool operator()(const std::string& transcriptFile, const std::string& outputFile ) {
+
+            typedef size_t TranscriptCountT;
+
+            // Hold the number of kmers that remain to be processed
+            uint64_t numRemainingKmers = 0;
+
+            // Create the universe and initialize all necesssary data
+            // structures for solving the cover problem
+            _createUniverse( transcriptFile, numRemainingKmers );
+
+            boost::heap::fibonacci_heap<HeapPair> pq;
+
+            // Function to compute the cost of a given transcript.  Right now it's
+            // 1 (all transcripts have uniform cost), but this could be changed in
+            // the future.
+            auto cost = []( TranscriptDataT* transcript ) -> double { return 1.0; };
+
+            // The reward of a given transcript.  For each required kmer that a transcript
+            // covers, it's score is higher.
+            auto reward = [this]( TranscriptDataT* transcript ) -> TranscriptScoreT {
+                TranscriptScoreT benefit{0};
+                // Iterate over the considered kmers in the transcript
+                for ( auto binMer : transcript->binMers ) {
+                	auto currentIt = _merHash.find(binMer);
+                	benefit += ( currentIt != _merHash.end() ) ? 1 : 0;//currentIt->second : 0; 
+                }
+                return benefit;
+            };
+
+            //std::atomic<size_t> transcriptsToProcess = _transcripts.size();
+            // Score each transcript and add it to the queue
+            for ( auto tdata : _transcripts ) { 
+            	auto benefit = reward(tdata);
+                //auto c = cost(tdata);
+                tdata->benefit = benefit;
+                if ( tdata->benefit > 0 && tdata->cost > 0 ) {
+                	tdata->heapHandle = pq.push( std::make_tuple( tdata->benefit / tdata->cost , tdata->id) );
+                	tdata->active = true;
+                }
+            }
+
+            //tp.wait();
+            std::cerr << "numRemainingKmers = " << numRemainingKmers << "\n";
+            // While not everything has been covered the required number of times
+            boost::progress_display show_progress( numRemainingKmers );
+            while ( numRemainingKmers > 0 and !pq.empty() ) {
+                // The score of this best transcript
+                double score;
+                // The best transcript's ID
+                TranscriptIDT t;
+
+                // Get the best transcript to use and its score
+                std::tie(score, t) = pq.top();
+                // Update the cover --- add a copy of this transcript 
+                // and update the number of things remaining to be covered
+                auto preSize = numRemainingKmers;
+                // Number of times this transcript is used in this round
+                size_t numCopies{0};
+                // Gets the set of kmers whose remaining count became zeros during
+                // this round
+                auto binmerUpdates = _processTranscript(t, numRemainingKmers, numCopies);
+                /*
+                std::cerr << "best transcript is " << t << "\n";
+                std::cerr << "before, used " << _transcripts[t]->numTimesUsed << " times; numCopies = " << numCopies;
+                */
+                _transcripts[t]->numTimesUsed += numCopies;
+                /*
+                std::cerr << " after, used " << _transcripts[t]->numTimesUsed << " times\n";
+                std::cerr << "got rid of " << binmerUpdates.zeroedBinmers.size() << " kmers\n";
+                std::cerr << "with total weight " << binmerUpdates.updatedBinmers.size() * numCopies << "\n";
+                */
+                if( binmerUpdates.zeroedBinmers.size() == 0 ) { std::cerr << "zeroedBinmers == 0\n"; std::abort(); }
+                show_progress += (preSize - numRemainingKmers);
+
+                // Update all of the transcripts containing a changed kmer (including
+                // the guy at the top of the heap)
+                std::unordered_map<TranscriptDataT*, uint64_t> affectedTranscripts;
+                for ( auto binMer : binmerUpdates.updatedBinmers ) {
+                    // For every transcript containing this kmer, its scorew ill be decreased
+                    // by numCopies kmers
+                    for ( auto transcriptID : _transcriptsForKmer[binMer] ){
+                    	affectedTranscripts[ _transcripts[transcriptID] ] += numCopies;
+                    }
+                }
+                //std::cerr << "\n";
+                // update the affected kmers
+                for ( auto transcriptDecrease : affectedTranscripts ) {
+                	
+                	auto transcript = transcriptDecrease.first;
+                	//std::cerr << "Updating transcript " << transcript->id << "\n";
+                	// Remove these kmers from the transcript
+                	
+                	size_t numErased = 0;
+                	/*
+                	auto br = reward(transcript);
+                	std::cerr << "it contains " << transcriptDecrease.second.size() << " affected kmers; [";
+                	for ( auto binMer : transcriptDecrease.second ) { std::cerr << " " << binMer; }
+                	std::cerr << " ]\n";
+	                */
+
+                	for ( auto binMer : binmerUpdates.zeroedBinmers ) {
+                	    //for ( auto binMer : changedKmers ) { 
+                		/*
+                		auto it = _merHash.find(binMer);
+                		if( transcript->binMers.find(binMer) == transcript->binMers.end() ) {
+                			std::cerr << "Transcript should have contained " << binMer << " but it didn't!\n";
+                			std::abort();
+                		}
+                		std::cerr << "erasing " << binMer << " from " << transcript->id << "\n";
+                		if ( it != _merHash.end() ) {
+                	    */
+              			auto erased = transcript->binMers.erase(binMer); 
+               			numErased += erased;
+                	}
+
+                	if ( transcript->active ) {
+                	transcript->benefit -= numErased;
+                	std::get<0>( *transcript->heapHandle ) = transcript->benefit / transcript->cost;
+
+                	 /*              	
+                	auto decrease = transcriptDecrease.second;
+                	auto benefit = std::get<0>( *transcript->heapHandle );
+                	std::get<0>( *transcript->heapHandle ) -= decrease;
+                	*/
+
+                	/*
+                	if (  std::get<0>( *transcript->heapHandle ) != reward(transcript) ) {
+                		std::cerr << "original = " << benefit << ", decrease = " << decrease << "\n";
+                		std::cerr << "calculated reward explicitlly as " << reward(transcript) <<
+                		"; a decrease of " << benefit-reward(transcript) << 
+                		" but updated it to" << std::get<0>( *transcript->heapHandle ) << "\n";
+                		std::abort();
+                	}
+                	*/
+                	if ( transcript->benefit == 0 ) {
+                		if ( transcript->active ) {
+                			pq.erase( transcript->heapHandle ); 
+                		}
+                		transcript->active = false;
+                	} else {
+                		pq.update( transcript->heapHandle );
+                	}
+                    }
+                	//std::cerr << "zeroed: " << binmerUpdates.zeroedBinmers.size() << ", affected: " << binmerUpdates.updatedBinmers.size() << "\n";
+
+                	    /*
+                        auto benefit = reward(transcript);
+                        auto c = cost(transcript);
+                        // Compute the new score and update it in the heap
+                        // If the reward is zero, then drop this guy
+                        if ( benefit < 1e-5 ) { 
+                        	pq.erase( transcript->heapHandle ); 
+                        } else {
+                   	         // Otherwise, update him
+                        	std::get<0>( *transcript->heapHandle ) = benefit;
+                        	pq.update( transcript->heapHandle );
+                        }
+                        */
+                } // done updating affected transcripts
+
+            } // done solving the cover problem
+            
+            std::cerr << "numRemainingKmers: " << numRemainingKmers << "\n";
+
+            std::ofstream ofile( outputFile );
+            for ( auto ts : _transcripts ) {
+            	ofile << *(ts->header) << '\t' << ts->numTimesUsed << "\n";
+            }
+            ofile.close();
+        }
+
+};
+
+#endif // MULTI_SETCOVER_SOLVER_HPP
\ No newline at end of file
diff --git a/include/obsolete/transcript_segmenter.hpp b/include/obsolete/transcript_segmenter.hpp
new file mode 100755
index 0000000..0f68a7f
--- /dev/null
+++ b/include/obsolete/transcript_segmenter.hpp
@@ -0,0 +1,1132 @@
+#ifndef TRANSCRIPTSEGMENTER_HPP
+#define TRANSCRIPTSEGMENTER_HPP
+
+#include <cassert>
+#include <cstdint>
+#include <fstream>
+#include <vector>
+#include <unordered_map>
+#include <utility>
+#include <algorithm>
+#include <sstream>
+#include <limits>
+#include <exception>
+#include <ext/vstring.h>
+#include <ext/vstring_fwd.h>
+
+#include <boost/foreach.hpp>
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/pending/disjoint_sets.hpp>
+#include <boost/graph/incremental_components.hpp>
+#include <boost/graph/subgraph.hpp>
+#include <boost/progress.hpp>
+#include <boost/range/counting_range.hpp>
+#include <boost/range/irange.hpp>
+
+#include <Eigen/Core>
+//#include <Eigen/Sparse>
+
+#include "g2log.h"
+
+#include <jellyfish/sequence_parser.hpp>
+#include <jellyfish/parse_read.hpp>
+#include <jellyfish/mer_counting.hpp>
+#include <jellyfish/misc.hpp>
+#include <jellyfish/compacted_hash.hpp>
+
+#include "tbb/concurrent_unordered_set.h"
+#include "tbb/concurrent_unordered_map.h"
+#include "tbb/parallel_for.h"
+#include "tbb/task_scheduler_init.h"
+
+#include "boost/lockfree/fifo.hpp"
+#include "threadpool.hpp"
+#include "graph_utils.hpp"
+#include "transcript_gene_map.hpp"
+#include "poisson_solver.hpp"
+
+//#include "tsnnls/tsnnls.h"
+//#include "shotgun/common.h"
+//extern "C"
+//{
+#include "nnls.h"
+//}
+
+#include "matrix_tools.hpp"
+
+template <typename HashT>
+class TranscriptSegmenter {
+
+private:
+    // typedefs to make things less noisy later on
+    //typedef __gnu_cxx::__vstring string;
+    typedef std::string string;
+    typedef string *StringPtrT;
+    typedef std::mutex mutex;
+    typedef tbb::concurrent_unordered_map< uint64_t, tbb::concurrent_unordered_set<uint32_t> > KmerMapT;
+
+    // The final result type of our analysis; mapping each transcript to an abundance
+    struct ResultT {
+        StringPtrT name;
+        double abundance;
+    };
+
+    // This struct represents a "job" (transcript) that needs to be processed
+    struct TranscriptJob {
+        StringPtrT header;
+        StringPtrT seq;
+        uint32_t index;
+    };
+
+
+    size_t errCtr {0};
+
+    // If a connected subgraph contains more than this many transcripts, then
+    // it will be partitioned into smaller subgraphs
+    const size_t tooManyVertices {
+        1500
+    };
+
+    // Filename where we'll read the transcripts from
+    std::string _transcriptFile;
+
+    // Map from each kmer to the transcripts it occurs in
+    KmerMapT _indexMap;
+    KmerMapT _kmerGeneMap;
+
+    // Jellyfish hash of hte reads
+    HashT _readHash;
+    //HashT _transcriptHash;
+
+    TranscriptGeneMap &_transcriptGeneMap;
+
+    /**
+    * Private member functions
+    **/
+    bool _considered( const char *mer ) {
+        return true;
+    }
+    /**
+    * Given a transcript graph G, and the vector of transcripts, partition G into
+    */
+    template <typename GraphT>
+    void _partitionIntoSubgraphs( GraphT &G,
+                                  std::vector< TranscriptJob * > &transcripts,
+                                  std::unordered_map<size_t, GraphT> &subgraphMap) {
+
+        LOG(DEBUG) << "in _partitionIntoSubgraphs\n";
+
+        // Create a metis graph from our boost graph
+        graphdef mG;
+        int wgtflag {1}; // we have edge weights
+
+        LOG(DEBUG) << "calling boostToMetis\n";
+
+        utils::graph::boostToMetis( G, &mG );
+
+        LOG(DEBUG) << "done boost to metis\n";
+        LOG(DEBUG) << "num verts = " << mG.nvtxs << ", num edges = " << mG.nedges << "\n";
+
+        // Hold the partitions
+        idxtype *part = new idxtype[mG.nvtxs];
+
+        LOG(DEBUG) << "alloctated new partition vector\n";
+
+        // Initialize the clustering options
+        Options opt;
+        initOptions(&opt);
+        opt.matchType = MATCH_SHEMN;
+
+        LOG(DEBUG) << "calling mlmcl\n";
+        LOG(DEBUG) << "num verts = " << mG.nvtxs << ", num edges = " << mG.nedges << "\n";
+        LOG(DEBUG) << "num edges 2 (should = num edges) = " << mG.xadj[mG.nvtxs] << "\n";
+
+        // Call the clustering procedure
+        mlmcl(&mG.nvtxs, mG.xadj, mG.adjncy, mG.vwgt,
+              mG.adjwgt, &wgtflag, part, opt );
+
+        LOG(DEBUG) << "done mlmcl\n";
+
+        size_t numClust {0};
+        // Create the subgraphs
+        for ( size_t i = 0; i < mG.nvtxs; ++i ) {
+            auto cluster = part[i];
+            if ( subgraphMap.find(cluster) == subgraphMap.end() ) {
+                subgraphMap[cluster] = G.create_subgraph();
+            }
+        }
+        // For each vertex, add it to the appropriate subgraph
+        for ( size_t i = 0; i < mG.nvtxs; ++i ) {
+            auto cluster = part[i];
+            boost::add_vertex(i, subgraphMap[cluster]);
+        }
+
+    }
+
+    /**
+    * Given the matrix A (m x n) and the right-hand-side vector b (m x 1) solve
+    * for the solution vector x ( n x 1 ) satisfying
+    * min_{x} || Ax - b ||
+    * subject to x[i] >= 0 for all i
+    */
+    template< typename MatT, typename VecT >
+    std::vector<double> _solveNonNegativeLeastSquares( MatT &A, VecT &b ) {
+        std::cerr << "in least squares problem\n";
+
+        Eigen::MatrixXd ADense(A);
+
+
+        int    mda = A.rows();
+        int    m = A.rows(), n = A.cols();
+        std::vector<double> x( A.cols(), 1.0 );
+        double rnorm;
+        std::vector<double> w(n, 0.0);
+        std::vector<double> zz(m, 0.0);
+        std::vector<int> indx(n, 0);
+        int mode;
+
+        nnls( ADense.data(), mda, m, n, &b[0], &x[0], &rnorm, &w[0], &zz[0], &indx[0], &mode);
+        //auto adata = ADense.data();
+        //nnls( &adata, m, n, &b[0], &x[0], &rnorm, &w[0], &zz[0], &indx[0]);
+        std::vector<double> myX( A.cols(), 0.0 );
+
+        for ( size_t i = 0; i < myX.size(); ++i ) {
+            myX[i] = x[i];
+            std::cerr << "x[" << i << "]  = " << x[i] << "\n";
+        }
+
+        return myX;
+
+        /*
+        // Create a new TAUCS matrix structure
+        taucs_ccs_matrix* mat;
+        mat = new taucs_ccs_matrix;//(taucs_ccs_matrix*)malloc(sizeof(taucs_ccs_matrix));
+
+        mat->n = A.cols();
+        mat->m = A.rows();
+        mat->flags = TAUCS_DOUBLE;
+
+        std::cerr << "inited TAUCS matrix\n";
+
+        // Compress this matrix so we can steal / share the data
+        A.makeCompressed();
+
+        std::cerr << "compressed matrix\n";
+
+        mat->colptr = A.outerIndexPtr();
+        mat->rowind = A.innerIndexPtr();
+        mat->values.d = A.valuePtr();
+
+        double residualNorm{0.0};
+
+        std::cerr << "called NNLS solver\n";
+        auto x = t_snnls_fallback( mat, &b[0], &residualNorm, 0.0, 0);
+        char *errString;
+        tsnnls_error(&errString);
+        std::cerr << "NNLS solver returned\n";
+        std::cerr << "ERROR STRING " << errString << "\n";
+
+        if( x == NULL ) {
+          std::cerr << "AHHH, x is STILL NULL\n";
+
+          std::stringstream ss;
+          ss << "errorA_" << errCtr << ".mtx";
+          std::ofstream Afile(ss.str());
+
+          Afile << "SPARSE\n";
+          Afile << A.rows() << '\t' << A.cols() << '\n' << A.nonZeros() << '\n';
+          for (int k=0; k < A.outerSize(); ++k) {
+            for (typename MatT::InnerIterator it(A,k); it; ++it) {
+              Afile << it.row()+1 << '\t' << it.col()+1 << '\t' << it.value() << '\n';
+            }
+          }
+          Afile.close();
+
+          ss.clear();
+          ss << "errorb_"<< errCtr << ".mat";
+          std::ofstream bfile("errorb.mtx");
+          bfile << "# error right hand side\n";
+          bfile << "# rows: " << b.size() << "\n";
+          bfile << "# columns: 1\n";
+          for (size_t i = 0; i < b.size(); ++i) {
+              bfile << b[i] << '\n';
+          }
+          bfile.close();
+          ++errCtr;
+
+          delete mat;
+          return std::vector<double>();
+          //std::abort();
+        } else {
+
+          std::vector<double> myX( A.cols(), 0.0 );
+
+          for( size_t i = 0; i < myX.size(); ++i ) { myX[i] = x[i]; std::cerr << "x[" << i << "]  = " << x[i] << "\n"; }
+
+          free(x);
+          delete mat;
+
+          return myX;
+        }
+        */
+
+        /*
+        // Try shotgun LASSO solver
+        shotgun_data prob;
+        std::cerr << "trying to reserve " << A.nonZeros() << " nonzeros in prob rows/cols\n";
+        prob.A_cols.resize( A.nonZeros(), sparse_array() );
+        prob.A_rows.resize( A.nonZeros(), sparse_array() );
+
+        std::cerr << "done\n";
+
+        auto nnz = A.nonZeros();
+        size_t ctr{0};
+        for (int k=0; k < A.outerSize(); ++k) {
+          std::cerr << "column " << k << " of " << A.outerSize() << "\n";
+          for (typename MatT::InnerIterator it(A,k); it; ++it) {
+            auto val = it.value();
+            auto i = it.row();   // row index
+            auto j = it.col();   // col index (here it is equal to k)
+            //std::cerr << "adding " << i << ", " << j << " : " << val << "\n";
+            try {
+            prob.A_cols[j].add(i, val);
+            prob.A_rows[i].add(j, val);
+            ++ctr;
+            if ( ctr > nnz ) { std::cerr << "only reserved space for " << nnz <<
+              " elements, but you're pushing on the " << ctr << "th element\n";
+            }
+            } catch ( std::exception& e ) {
+              std::cerr << "died trying to add " << i << ", " << j << " : " << val << "\n";
+              std::cerr << e.what();
+            }
+          }
+        }
+
+        prob.nx = A.cols();
+        prob.ny = A.rows();
+
+        prob.y.reserve(b.size());
+        for ( auto e : b ) { prob.y.push_back(e); };
+        double lambda = 0.0;
+        int K = 10;
+        int maxiter = 5000;
+        int verbose = 10;
+        double threshold = 1e-8;
+        assert( prob.ny == prob.y.size() );
+        std::cerr << "before solve lasso\n";
+        solveLasso(&prob, lambda, K, threshold, maxiter, verbose);
+        std::cerr << "after solve lasso\n";
+
+        std::vector<double> myX(prob.x);
+        return myX;
+        */
+
+    }
+
+
+
+    template <typename GraphT, typename TranscriptHashT>
+    std::vector<ResultT> _estimateViaNNLS( GraphT &G, TranscriptHashT &transcriptHash, std::vector< TranscriptJob * > &transcriptJobs ) {
+
+        typedef Eigen::SparseMatrix<double> SpMat; // declares a column-major sparse matrix type of double
+        typedef Eigen::Triplet<double> T;
+
+        struct TranscriptColumn {
+            size_t localID;
+            StringPtrT name;
+            std::vector<uint64_t> kmers;
+        };
+
+        struct LocalTranscriptJob {
+            size_t localID;
+            TranscriptJob *globalJob;
+        };
+
+        auto numJobs = boost::num_vertices(G);
+        size_t numActors = (26 > numJobs + 1) ? numJobs + 1 : 26; //min(26, boost::num_vertices(G)+1);
+        boost::threadpool::pool tp(numActors);
+
+        // Holds the transcripts to be processed
+        boost::lockfree::fifo< LocalTranscriptJob * > workQueue;
+
+        // Holds the results of processed transcripts
+        boost::lockfree::fifo< TranscriptColumn * > resultQueue;
+
+        // The matrix that will hold the left-hand side of our problem
+        SpMat *A = nullptr;
+        // The vector that will hold the right-hand side of our problem
+        std::vector<double> b;
+
+        auto merLen = _readHash.get_mer_len();
+
+        // Collect the processed transcripts and build a matrix out of them
+        auto collectResults = [&resultQueue, &A, &b, &tp, &transcriptHash, this, merLen, numJobs]() -> void {
+            // Char array to hold a kmer string
+            char merStr[merLen];
+            // Triplets we'll use to make our matrix
+            std::vector<T> triplets;
+            // Map from a kmer (encoded in binary) to its row ID
+            auto kmerSet = std::unordered_map<uint64_t, uint32_t>();
+
+            // The number of rows and columns in our matrix
+            size_t numRows{0};
+            size_t numCols{0};
+            size_t resultsProcessed{0};
+
+            // While tasks remain to be processed
+            TranscriptColumn *tc;
+            while ( resultsProcessed < numJobs ) {
+                // Pull a result off the queue
+                while ( resultQueue.dequeue(tc) ) {
+                    std::cerr << "unqueueing result " << tc->name << "\n";
+                    // Get the column id
+                    auto columnID = tc->localID;
+                    // Update the max column id
+                    numCols = ( numCols > (columnID + 1)) ? numCols : columnID + 1 ;
+
+                    // Map from a row ID to it's value (likely just a "1")
+                    std::unordered_map<uint32_t, float> rowIDs;
+                    // The number of non-zeros in this column
+                    rowIDs.reserve(tc->kmers.size());
+
+                    // For each kmer, map it to a local id
+                    for ( auto & kmer : tc->kmers ) {
+                        auto weight = ( 1.0 / (0.1 + transcriptHash[merStr]));
+                        // If this is a new kmer, then give it the next available id
+                        if ( kmerSet.find(kmer) == kmerSet.end() ) {
+                            // Update the number of rows
+                            kmerSet[kmer] = numRows;
+                            ++numRows;
+
+                            // Put the count of this kmer in the right-hand side
+                            jellyfish::parse_dna::mer_binary_to_string( kmer, merLen, &merStr[0] );
+                            b.push_back( this->_readHash[merStr] * weight );
+                        };
+                        // Assign the local id to this column
+                        rowIDs[ kmerSet[kmer] ] += weight;
+                    }
+
+                    // Push this column's values onto our triplets array
+                    for (auto & kv : rowIDs ) {
+                        triplets.push_back( {kv.first, columnID, kv.second} );
+                    }
+                    ++resultsProcessed;
+                }
+
+                // if we're not done, but there is no work, then sleep a bit
+                boost::this_thread::sleep_for(boost::chrono::milliseconds(250));
+            }
+
+            // Build a proper matrix from our triplets
+            /*size_t maxRow{0};
+            int minRow = 2000000;
+            for ( auto& t : triplets ) { std::cerr << "pushing " << t.row() << ", " << t.col() << ", " << t.value() << "\n";
+            maxRow = max(maxRow,t.row());
+            minRow = min(minRow,t.row());
+            }
+            */
+            std::cerr << "numCols = " << numCols << "\n";
+            std::cerr << "kmerSet.size = " << kmerSet.size() << ", numRows = " << numRows << "\n";
+            /*std::cerr << "maxRow = " << maxRow << "\n";
+            std::cerr << "minRow = " << minRow << "\n";*/
+            if ( numRows > 0 && numCols > 0 ) {
+                A = new SpMat(numRows, numCols);
+                A->setFromTriplets(triplets.begin(), triplets.end());
+            }
+            std::cerr << "done building matrix\n";
+        };
+
+        tp.schedule(collectResults);
+
+        // Read Hash
+        bool done {false};
+
+        std::atomic<size_t> jobsRemaining {numJobs};
+        // Launch the worker jobs
+        for ( size_t i = 0; i < numActors - 1; ++i ) {
+            //
+            auto columnBuilder = [&workQueue, &resultQueue, &done, &jobsRemaining, &transcriptHash, merLen, this]() -> void {
+                LocalTranscriptJob *ltj;
+                while ( jobsRemaining > 0 ) {
+                    while (workQueue.dequeue(ltj) ) {
+
+                        TranscriptJob *tj = ltj->globalJob;
+                        StringPtrT seq = tj->seq;
+                        auto readLen = seq->length();
+                        auto transcriptIndex = tj->index;
+                        auto localIndex = ltj->localID;
+
+                        size_t offset {0};
+                        auto transcriptColumn = new TranscriptColumn {
+                            localIndex,
+                            tj->header,
+                            std::vector<uint64_t>()
+                        };
+
+                        if ( transcriptColumn->name->length() != tj->header->length() ) {
+                            std::cerr << "transcriptColumn name is " << *transcriptColumn->name << ", but "  <<
+                                      "tj->header is " << *tj->header << "\n";
+                        }
+
+                        std::cerr << "trying to allocate kmers vector of size " << readLen - merLen + 1 << "\n";
+                        transcriptColumn->kmers.reserve( readLen - merLen + 1 );
+
+                        // Iterate over the kmers
+                        while ( offset < readLen - merLen + 1 )  {
+                            auto mer = seq->substr( offset, merLen );
+                            if ( this->_considered(mer.c_str()) ) {// transcriptHash[mer.c_str()] < 10 ) {
+                                auto binMer = jellyfish::parse_dna::mer_string_to_binary( mer.c_str(), merLen );
+                                transcriptColumn->kmers.push_back(binMer);
+                            }
+                            ++offset;
+                        }
+
+                        // Enqueue the column
+                        resultQueue.enqueue(transcriptColumn);
+                        std::cerr << "processed job " << *transcriptColumn->name << " enqueueing result\n";
+                        --jobsRemaining;
+                        std::cerr << "there are " << jobsRemaining << " jobs remaining\n";
+                        // Delete the original transcript job
+                        // delete tj;
+                        // Delete the local job as well
+                        // delete ltj;
+
+                    }
+                    // if we're not done, but there is no work, then sleep a bit
+                    boost::this_thread::sleep_for(boost::chrono::milliseconds(250));
+                }
+
+            };
+
+            tp.schedule(columnBuilder);
+        } // end launch of workers
+
+        std::vector<StringPtrT> columnNames( boost::num_vertices(G), nullptr );
+        // for each vertex in the graph
+        auto se = boost::vertices(G);
+        for ( auto &v = se.first; v != se.second; ++v ) {
+            // Get the global id of the vertex
+            auto globalIdx = G.local_to_global( *v );
+            // Create a local job from the global job
+            auto localJob = new LocalTranscriptJob { *v, transcriptJobs[globalIdx] };
+            // Add this transcripts name to the index
+            columnNames[*v] = transcriptJobs[globalIdx]->header;//std::string(header.begin(), he);
+            std::cerr << "pusing job " << *(transcriptJobs[globalIdx]->header) << ", for vertex " << *v << "\n";
+            // build column for this vertex
+            workQueue.enqueue(localJob);
+        } // end launch of jobs
+        done =  true;
+        tp.wait();
+
+        std::cerr << "done waiting\n";
+        std::vector<ResultT> results;
+
+        if ( A != nullptr ) {
+            auto dupCols = matrix_tools::markDependentColumns(*A);
+
+            if (dupCols.size() > 0) {
+                std::cerr << "Matrix had " << dupCols.size() << " duplicate columns\n";
+                std::vector<T> triplets;
+                std::vector<StringPtrT> newColumnNames;
+
+                size_t dupPtr = 0;
+                for (size_t k = 0; k < A->outerSize(); ++k) {
+                    // If this is a colum we skip, then continue this loop and
+                    // move to the next column to skip
+                    if ( dupPtr < dupCols.size() && k == dupCols[dupPtr] ) {
+                        ++dupPtr;
+                        continue;
+                    }
+                    // The column index is the column index of A minus the number of
+                    // columns we've skipped so far
+                    size_t colInd = k - dupPtr;
+                    // Add all rows of this column to the new matrix
+                    for (SpMat::InnerIterator it(*A, k); it; ++it) {
+                        triplets.push_back( {it.row(), colInd, it.value()} );
+                    }
+                    newColumnNames.push_back(columnNames[colInd]);
+                }
+                SpMat *B = new SpMat(A->rows(), A->cols() - dupCols.size());
+                B->setFromTriplets(triplets.begin(), triplets.end());
+                std::swap(A, B);
+                std::swap(columnNames, newColumnNames);
+                delete B;
+            }
+
+
+
+
+            // At this point we've built the matrix, so solve the problem
+            auto x = _solveNonNegativeLeastSquares(*A, b);
+            results.reserve(x.size());
+            for ( size_t i = 0; i < x.size(); ++i ) {
+                results.push_back( {columnNames[i], x[i]} );
+            }
+            delete A;
+        }
+
+        return results;
+    }
+
+    template <typename GraphT, typename TranscriptHashT>
+    std::vector<ResultT> _computeAbundance( GraphT &G, TranscriptHashT &transcriptHash, std::vector< TranscriptJob * > transcriptJobs ) {
+        auto numVerts = boost::num_vertices(G);
+        // If the graph is non-trivial
+        //if ( numVerts > 1 ) {
+
+        // If the graph is "too big", then we cluster it
+        if ( numVerts > tooManyVertices ) {
+            std::unordered_map<size_t, GraphT> subgraphMap;
+            _partitionIntoSubgraphs(G, transcriptJobs, subgraphMap);
+            // Now G has some set of "children" subgraphs
+            auto se = G.children();
+
+            // Iterate over each child subgraph and build it's linear system
+            std::vector<ResultT> res;
+            for ( auto & kv : subgraphMap ) {
+                // recursively decompose
+                auto sgres = _computeAbundance(kv.second, transcriptHash, transcriptJobs);//_estimateViaNNLS(kv.second, transcriptJobs);
+                res.insert(res.end(), sgres.begin(), sgres.end());
+            }
+            return res;
+
+        } else {
+            std::cerr << "Component has only " << boost::num_vertices(G) << " vertices; solving NNLS directly\n";
+            // Otherwise, solve the nnls problem directly
+            auto res = _estimateViaNNLS(G, transcriptHash, transcriptJobs);
+            return res;
+        }
+        /*
+        } else {
+        // The graph has only one vertex; its kmer set doesn't overlap with
+        // any others and so its abundance can be uniquely estimated
+        auto singletonVertex = boost::vertices(G).first;
+        auto globalIdx = G.local_to_global( *singletonVertex );
+        //auto res = _estimateAbundanceTrivial( globalIdx );
+        return std::vector<;
+        }
+        */
+    }
+
+    template <typename hash_t>
+    std::vector<double> _estimateIsoformAbundanceByCount( const size_t geneID, 
+      std::vector<std::vector<uint64_t>> &transcriptKmers,
+      std::vector<double>& transcriptLengths,
+      size_t mappedReads,
+      hash_t& _transcriptHash
+      ) {
+
+        auto transcripts = _transcriptGeneMap.transcriptsForGene( geneID );
+        size_t numTranscripts = transcripts.size();
+        std::vector<double> exps(numTranscripts, 0.0);
+
+        for ( auto ti : boost::irange(0ul, numTranscripts) ){
+            auto globalTranscriptID = transcripts[ti];
+
+            for ( auto kmer : transcriptKmers[ globalTranscriptID ] ) {
+                exps[ti] += _readHash[kmer] / (1.0 + _transcriptHash[kmer]);
+            }
+        }
+
+        return exps;
+    }
+
+    std::vector<double> _estimateIsoformAbundance( const size_t geneID, 
+      std::vector<std::vector<uint64_t>> &transcriptKmers,
+      std::vector<double>& transcriptLengths,
+      size_t mappedReads,
+      tbb::concurrent_unordered_map<uint64_t, uint32_t>& genePromiscuousKmers
+      ) {
+
+        typedef std::vector<std::vector<double>> IsoOptMatrix;
+        typedef std::vector<double> IsoOptVector;
+
+        auto transcripts = _transcriptGeneMap.transcriptsForGene( geneID );
+        size_t numTranscripts = transcripts.size();
+
+        std::unordered_map<uint64_t, size_t> kmerIDs;
+        std::unordered_set<uint64_t> geneKmers;
+        // reindex the kmers for this gene
+        for ( size_t i = 0; i < transcripts.size(); ++i ) {
+            for ( auto kmer : transcriptKmers[ transcripts[i] ] ) {                
+                auto kmerIt = geneKmers.find(kmer);
+                if ( kmerIt == geneKmers.end() ) {
+                    kmerIDs[kmer] = geneKmers.size();
+                    geneKmers.insert(kmer);
+                }
+            }            
+        }
+
+        size_t numKmers = geneKmers.size();
+        if ( numKmers == 0 ) {
+            return IsoOptVector();
+        }
+
+        // Create a numTranscripts x numKmers matrix to hold the "rates"
+        IsoOptVector x(numTranscripts, 0.0);
+        IsoOptMatrix A( numTranscripts, IsoOptVector(numKmers, 0.0) );
+        IsoOptVector counts(numKmers, 0.0);
+        double numReads = 0.0;
+
+        for ( size_t i = 0; i < transcripts.size(); ++i ) {
+            for ( auto kmer : transcriptKmers[ transcripts[i] ] ) {
+                size_t j = kmerIDs[kmer];
+                A[i][j] = 1.0;
+                if ( counts[j] == 0 ) {
+
+                    auto promIt = genePromiscuousKmers.find(kmer);
+                    double normFact = 1.0;
+                    if ( promIt != genePromiscuousKmers.end() ) {
+                        normFact /= 4.0*promIt->second;
+                    }
+
+                    counts[j] = _readHash[kmer] * normFact;
+                    numReads += counts[j];
+                }
+            }
+        }
+
+        
+        std::unique_ptr<IsoOptMatrix> collapsedA;
+        std::unique_ptr<IsoOptVector> collapsedCounts;
+        matrix_tools::collapseIntoCategories(A, counts, collapsedA, collapsedCounts);
+        
+        for (size_t i = 0; i < collapsedA->size(); ++i) {
+            auto tlen = transcriptLengths[ transcripts[i] ];
+          for (size_t j = 0; j < (*collapsedA)[0].size(); ++j) {
+            (*collapsedA)[i][j] = (*collapsedA)[i][j] / ( tlen * mappedReads / 1000000000.0 );
+            //(*collapsedA)[i][j] *= numReads;
+          }
+        }
+
+        //transcriptLengths[transcripts[i]] / 1000.0 * mappedReads / 1000000.0;
+
+        solve_likelihood( *collapsedA, *collapsedCounts, x, false );
+
+        //solve_likelihood( A, counts, x, false );
+        //for ( auto& e : x ) { e *= numReads; }
+        return x;
+    }
+
+
+public:
+    TranscriptSegmenter( const std::string &transcriptFile, HashT &readHash, TranscriptGeneMap &t2g ) :
+        _transcriptFile (transcriptFile), _indexMap( KmerMapT() ), _readHash(readHash), _transcriptGeneMap(t2g) {}
+
+    template <typename TranscriptHashT>
+    bool findOverlappingSegments( int merLen, TranscriptHashT &transcriptHash ) {
+        // We'll need the reverse map (gene -> [transcripts])
+        _transcriptGeneMap.needReverse();
+
+        // Holds the set of kmers present in each transcript
+        typedef std::tuple<uint32_t, std::vector<uint64_t>> TranscriptKmerSet;
+
+        // Parser for the reads
+        const char *fnames[] = { _transcriptFile.c_str() };
+        jellyfish::parse_read parser( fnames, fnames + 1, 100);
+        jellyfish::parse_read::thread stream = parser.new_thread();
+
+        // Read pointer
+        jellyfish::read_parser::read_t *read;
+
+        // So we can concisely identify each transcript
+        uint32_t transcriptIndex {0};
+
+        // Holds the tasks to be processed
+        boost::lockfree::fifo< TranscriptJob * > workQueue;
+
+        // Holds the processed transcripts (removed newlines and index of each)
+        boost::lockfree::fifo< TranscriptJob * > processedReadQueue;
+
+        // Holds results that we will use to update the index map
+        //boost::lockfree::fifo< TranscriptKmerSet * > q;
+
+        std::atomic<size_t> numJobsProcessed {0};
+        std::atomic<size_t> numRes {0};
+        size_t numActors = 26 + 1;
+        boost::threadpool::pool tp(numActors);
+
+        std::vector<double> transcriptLengths( _transcriptGeneMap.numTranscripts(), 0.0 );
+
+        auto printProgressBar = [&numRes, this] () {
+            auto numJobs = this->_transcriptGeneMap.numTranscripts();
+            boost::progress_display show_progress( numJobs );
+            size_t lastCount = numRes;
+            while ( lastCount < numJobs ) {
+                auto diff = numRes - lastCount;
+                if ( diff > 0 ) {
+                    show_progress += diff;
+                    lastCount += diff;
+                }
+                boost::this_thread::sleep_for(boost::chrono::milliseconds(250));
+            }
+        };
+        std::cerr << "Processing transcripts\n";
+        tp.schedule(printProgressBar);
+
+
+        /**
+        * Read results from the result queue, and build the index map
+        */
+        /*
+        auto readResults = [&q, &tp, &numRes, this] () -> void {
+            // While tasks remain to be processed
+            while (tp.active() + tp.pending() > 1) {
+                TranscriptKmerSet *ks;
+                // Process existing results
+                while ( q.dequeue(ks) ) {
+
+                    uint32_t transcriptID = std::get<0>(*ks);
+                    // For each kmer in this transcrpt, add this
+                    // transcript to all kmers it contains in the map
+                    for ( uint64_t & kmer : std::get<1>(*ks) ) {
+                        //this->_indexMap[kmer].insert(transcriptID);
+                        this->_kmerGeneMap[kmer].insert( this->_transcriptGeneMap.gene(transcriptID) );
+                    }
+
+                    // no memory leaks!
+                    //delete ks;
+                    ++numRes;
+                }
+                boost::this_thread::sleep_for(boost::chrono::milliseconds(250));
+            }
+        };
+
+        tp.schedule(readResults);
+        */
+
+        // Holds, for each transcript, the kmers present in it
+        std::vector< std::vector<uint64_t> > transcriptKmers(_transcriptGeneMap.numTranscripts(), std::vector<uint64_t>());
+
+        /**
+        * Spawn off the appropriate number of worker threads.  Each will process reads from the
+        * transcript file.
+        */
+        for (size_t i = 0; i < numActors; ++i) {
+
+            auto task = [&workQueue, &transcriptHash, &processedReadQueue, &numJobsProcessed,
+            &transcriptKmers, &numRes, &transcriptLengths, merLen, this]() -> void {
+                TranscriptJob *tj;
+                size_t numTranscripts = this->_transcriptGeneMap.numTranscripts();
+
+                while ( numJobsProcessed < numTranscripts ) {
+                    while (workQueue.dequeue(tj) ) {
+                        // Get the sequence
+                        auto seq = tj->seq;
+                        // Strip the newlines from it
+                        auto newEnd  = std::remove( seq->begin(), seq->end(), '\n' );
+                        auto readLen = std::distance( seq->begin(), newEnd );
+                        // Copy the newline-stripped read into newSeq
+                        StringPtrT newSeq = new string( seq->begin(), seq->begin() + readLen);
+                        // We no longer need the original sequence
+                        delete seq;
+
+                        // Lookup the ID of this transcript in our transcript -> gene map
+                        auto transcriptIndex = this->_transcriptGeneMap.findTranscriptID( *(tj->header) );
+                        assert( transcriptIndex != this->_transcriptGeneMap.INVALID );
+
+                        /*
+                        // Create the processed read
+                        auto procRead = new TranscriptJob{
+                          tj->header,
+                          newSeq,
+                          transcriptIndex
+                        };
+                        // Push it onto the processed read queue
+                        processedReadQueue.enqueue(procRead);
+                        */
+
+                        // The set of kmers in this transcript
+                        //auto ts = new TranscriptKmerSet{transcriptIndex, {}};
+
+                        // We know how many kmers there will be, so allocate the space
+                        //std::get<1>(*ts).reserve(readLen-merLen+1);
+                        transcriptKmers[transcriptIndex].reserve(readLen - merLen + 1);
+                        size_t offset {0};
+
+                        // Iterate over the kmers and add them to the kmer list for this transcript
+                        while ( offset < readLen - merLen + 1 )  {
+                            auto mer = newSeq->substr( offset, merLen );
+                            auto binMer = jellyfish::parse_dna::mer_string_to_binary( mer.c_str(), merLen );
+                            if ( this->_considered(mer.c_str()) ) {                                
+                                this->_kmerGeneMap[binMer].insert( this->_transcriptGeneMap.gene(transcriptIndex) );
+                                transcriptKmers[transcriptIndex].push_back(binMer);//std::get<1>(*ts).push_back(binMer);
+                            }
+                            ++offset;
+                        }
+                        transcriptLengths[transcriptIndex] = readLen - merLen + 1;
+
+                        ++numJobsProcessed;
+                        ++numRes;
+                        delete tj;
+                        delete newSeq;
+                        //q.enqueue(ts);
+                        //delete tj;
+                    }
+                    // if we're not done, but there is no work, then sleep a bit
+                    boost::this_thread::sleep_for(boost::chrono::milliseconds(250));
+                }
+
+            };
+
+            tp.schedule(task);
+        }
+
+
+        // Read the input data from the trnascript file
+        while ( (read = stream.next_read()) ) {
+            StringPtrT header = new string( read->header, read->hlen );
+            StringPtrT oseq = new string(read->seq_s, std::distance(read->seq_s, read->seq_e) - 1 );
+            auto job = new TranscriptJob { header, oseq, transcriptIndex };
+            workQueue.enqueue(job);
+        }// end while
+
+        tp.wait();
+        std::cerr << "done waiting\n";
+
+        /*
+        // Now that we have all of the processed reads, along with their indices
+        // in the processed read queue; put them in a vector
+        std::vector< TranscriptJob* > transcripts( _transcriptGeneMap.numTranscripts() , nullptr);
+        // Dump the queue into the vector, where each element is in its appropriate
+        // position / index
+        TranscriptJob* tj;
+        while ( processedReadQueue.dequeue(tj) ) { transcripts[tj->index] = tj; }
+        std::cerr << "done processing read queue\n";
+        */
+
+        tbb::task_scheduler_init tbb_init;
+
+        size_t numKmers = 0;
+        std::atomic<size_t> mappedReads{0};
+        
+        tbb::concurrent_unordered_map<uint64_t, uint32_t> genePromiscuousKmers;
+        tbb::parallel_for( _kmerGeneMap.range(),
+        [&genePromiscuousKmers, &mappedReads, this]( KmerMapT::range_type & range ) {
+            for ( auto kg = range.begin(); kg != range.end(); ++kg ) {
+                auto kmer = kg->first;
+                auto &geneSet = kg->second;
+                if ( geneSet.size() > 1 ) {
+                    genePromiscuousKmers[kmer] = geneSet.size();//.insert(kmer);
+                }
+                mappedReads += this->_readHash[kmer];
+            }
+        }
+        );
+        std::cerr << "I counted " << mappedReads << " mapped reads\n";
+        std::cerr << " out of " << _kmerGeneMap.size() << " kmers, " << genePromiscuousKmers.size() << " were gene non-unique\n";
+        std::cerr << 100.0 * (_kmerGeneMap.size() - genePromiscuousKmers.size()) / static_cast<double>(_kmerGeneMap.size()) <<
+                  "% of the kmers are gene unique\n";
+        
+        std::vector<double> abundances( _transcriptGeneMap.numTranscripts(), 0.0 );
+
+        std::atomic<size_t> numGenesProcessed{0};
+
+        auto abundanceEstimateProgress = [&numGenesProcessed, this] () {
+            auto numJobs = this->_transcriptGeneMap.numGenes();
+            boost::progress_display show_progress( numJobs );
+            size_t lastCount = numGenesProcessed;
+            while ( lastCount < numJobs ) {
+                auto diff = numGenesProcessed - lastCount;
+                if ( diff > 0 ) {
+                    show_progress += diff;
+                    lastCount += diff;
+                }
+                boost::this_thread::sleep_for(boost::chrono::milliseconds(250));
+            }
+        };
+        std::cerr << "Processing abundances\n";
+        tp.schedule(abundanceEstimateProgress);
+
+        // process each gene
+        tbb::parallel_for( size_t {0}, _transcriptGeneMap.numGenes(), size_t {1},
+        [&transcriptKmers, &transcriptLengths, &abundances, &numGenesProcessed, 
+            &mappedReads, &transcriptHash, &genePromiscuousKmers, this]( const size_t & geneID ) {
+            auto abundance = this->_estimateIsoformAbundance(geneID, transcriptKmers, transcriptLengths, mappedReads,
+                genePromiscuousKmers);
+            //auto abundance = this->_estimateIsoformAbundanceByCount(geneID, transcriptKmers, transcriptLengths, mappedReads, transcriptHash);
+
+            auto transcripts = this->_transcriptGeneMap.transcriptsForGene(geneID);
+            for ( size_t i = 0; i < abundance.size(); ++i ) {
+                //_transcriptGeneMap.transcriptName( transcripts[i] );
+                abundances[ transcripts[i] ] = abundance[i];
+            }
+            ++numGenesProcessed;
+        }
+        );
+        //for ( size_t geneID = 0; geneID < _transcriptGeneMap.numGenes(); ++geneID ) {
+        //auto abundance = _estimateIsoformAbundance(geneID, transcriptKmers);
+        //}
+
+        std::ofstream afile("abundances.txt");
+        for ( size_t ti = 0; ti < _transcriptGeneMap.numTranscripts(); ++ti ) {
+            /*double effectiveLength = 0.0;
+            for ( auto kmer : transcriptKmers[ti] ) {
+                effectiveLength += (1.0 / (1.0 + transcriptHash[kmer]));
+            }*/
+            afile << _transcriptGeneMap.transcriptName(ti) << '\t' << transcriptLengths[ti] << '\t' << abundances[ti] << '\n';
+        }
+        afile.close();
+
+        std::ofstream gfile("abundances.gene.txt");
+        for ( size_t gi = 0; gi < _transcriptGeneMap.numGenes(); ++gi ) {
+          auto transcripts = _transcriptGeneMap.transcriptsForGene(gi);
+          double exp = 0.0;
+          for ( auto t : transcripts ) { exp += abundances[t]; }
+
+          gfile << _transcriptGeneMap.nameFromGeneID(gi) << '\t' << exp << '\n';
+        }
+        gfile.close();
+        std::exit(1);
+
+        // Our graph will have weighted edges
+        struct EdgePropT {
+            float weight;
+        };
+        typedef boost::property< boost::edge_index_t, size_t, EdgePropT > edge_property;
+
+        // We'll be taking subgraphs, so make sure we implement this concept
+        typedef typename boost::subgraph <
+        boost::adjacency_list < boost::hash_setS,
+              boost::vecS,
+              boost::undirectedS,
+              boost::no_property,
+              edge_property
+              > > GraphT;
+        typedef typename boost::graph_traits<GraphT>::vertex_descriptor Vertex;
+        typedef typename boost::graph_traits<GraphT>::vertices_size_type VertexIndex;
+
+        // Create our transcript overlap graph
+        auto numTranscripts = _transcriptGeneMap.numTranscripts();
+        auto numGenes = _transcriptGeneMap.numGenes();
+        GraphT G( numGenes );
+        std::vector<VertexIndex> rank( numGenes );
+        std::vector<Vertex> parent( numGenes );
+
+        typedef VertexIndex *Rank;
+        typedef Vertex *Parent;
+
+        boost::disjoint_sets<Rank, Parent> ds(&rank[0], &parent[0]);
+        boost::initialize_incremental_components(G, ds);
+        boost::incremental_components(G, ds);
+
+        std::cerr << "Building transcript overlap graph\n";
+        boost::progress_display showGraphProgress(_kmerGeneMap.size());
+        for ( auto & kv : _kmerGeneMap ) {
+            ++showGraphProgress;
+
+            using std::tie;
+            typename GraphT::edge_descriptor e;
+            bool exists {false};
+
+            auto kmer = kv.first;
+            for ( auto u = kv.second.begin(); u != kv.second.end(); ++u ) {//& u : kv.second ) {
+                auto v = u; ++v;
+                for ( ; v != kv.second.end(); ++v ) {//& u : kv.second ) {
+                    //for ( auto& v : kv.second ) {
+                    auto vertU = *u;
+                    auto vertV = *v;
+                    if ( vertU > vertV ) {
+                        std::swap(vertU, vertV);
+                    }
+
+                    tie(e, exists) = boost::edge( vertU, vertV, G );
+                    if ( !exists ) {
+                        auto e = boost::add_edge(vertU, vertV, G);
+                        G[e.first].weight = 1.0f;
+                        ds.union_set(vertU, vertV);
+                    } else {
+                        G[ e ].weight += 1.0;
+                    }
+
+                }
+            }
+
+        }
+
+        typedef boost::component_index<VertexIndex> Components;
+
+        // NOTE: Because we're using vecS for the graph type, we're
+        // effectively using identity_property_map for a vertex index map.
+        // If we were to use listS instead, the index map would need to be
+        // explicitly passed to the component_index constructor.
+        Components components(parent.begin(), parent.end());
+
+        /**
+        * Create a graph (appropriately indexed) for each
+        * connected component of our original graph
+        */
+        // Vector to hold the new indices of each vertex
+        // for vertex i in the original graph, it's new index is relabel[i]
+        std::vector<size_t> relabel( boost::num_vertices(G), 0 );
+        std::vector<size_t> relabelInv( boost::num_vertices(G), 0 );
+        std::vector<size_t> offsets;
+        size_t vnum {0};
+
+        std::cerr << "building the connected component subgraphs \n";
+        std::vector< GraphT > componentGraphs;
+        size_t compID {0};
+        //std::ofstream ofile("nnls_abundance.txt");
+        // Iterate through the component indices
+        BOOST_FOREACH(VertexIndex current_index, components) {
+
+            // The offset index of this subgraph
+            offsets.push_back(vnum);
+            // The graph for this component
+            auto compG = G.create_subgraph();//componentGraphs.back();
+
+            // For each vertex in this component, add it to the graph
+            // and calculate its label
+
+            auto compIter = components[current_index];
+            for ( auto &vert = compIter.first; vert != compIter.second; ++vert ) {
+                boost::add_vertex(*vert, compG);
+                relabel[*vert] = vnum;
+                relabelInv[vnum] = *vert;
+                ++vnum;
+            }
+
+            std::cerr << "subgraph " << compID << "; order = " << boost::num_vertices(compG) << "; size = " <<
+                      boost::num_edges(compG) << "\n";
+
+            // process the graph
+            /*
+            auto res = _computeAbundance( compG, transcriptHash, transcripts );
+            for ( auto& r : res ) {
+              ofile << *(r.name) << '\t' << r.abundance << '\n';
+            }
+            */
+            //componentGraphs.push_back(compG);
+            ++compID;
+        }
+
+
+
+        //ofile.close();
+
+
+        std::ofstream ofile("sharedkmers.gra");
+
+        // Write number of verts, edges and format (edge weights)
+        ofile << boost::num_vertices(G) << " " << boost::num_edges(G)  << " 1\n";
+
+        // We have to iterate over the vertices in numerical order
+        // for this format
+        for ( size_t i = 0; i < boost::num_vertices(G); ++i ) {
+            // Iterate over all edges adjacent to vertex i
+            auto se = boost::out_edges(i, G);
+            for ( auto &e = se.first; e != se.second; ++e ) {
+                // Output the pair (target, weight)
+                ofile << boost::target(*e, G) << " " << G[*e].weight << " ";
+            }
+            ofile << "\n";
+        }
+        ofile.close();
+
+        return true;
+    } // end findOverlappingSegments
+
+};
+
+
+#endif //TRANSCRIPTSEGMENTER_HPP
diff --git a/include/posix.h b/include/posix.h
new file mode 100644
index 0000000..70a0db1
--- /dev/null
+++ b/include/posix.h
@@ -0,0 +1,344 @@
+/*
+ A C++ interface to POSIX functions.
+
+ Copyright (c) 2014 - 2015, Victor Zverovich
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+    list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FMT_POSIX_H_
+#define FMT_POSIX_H_
+
+#ifdef __MINGW32__
+// Workaround MinGW bug https://sourceforge.net/p/mingw/bugs/2024/.
+# undef __STRICT_ANSI__
+#endif
+
+#include <errno.h>
+#include <fcntl.h>  // for O_RDONLY
+#include <stdio.h>
+
+#include <cstddef>
+
+#include "format.h"
+
+#ifdef FMT_INCLUDE_POSIX_TEST
+# include "test/posix-test.h"
+#endif
+
+#ifndef FMT_POSIX
+# if defined(_WIN32) && !defined(__MINGW32__)
+// Fix warnings about deprecated symbols.
+#  define FMT_POSIX(call) _##call
+# else
+#  define FMT_POSIX(call) call
+# endif
+#endif
+
+// Calls to system functions are wrapped in FMT_SYSTEM for testability.
+#ifdef FMT_SYSTEM
+# define FMT_POSIX_CALL(call) FMT_SYSTEM(call)
+#else
+# define FMT_SYSTEM(call) call
+# ifdef _WIN32
+// Fix warnings about deprecated symbols.
+#  define FMT_POSIX_CALL(call) ::_##call
+# else
+#  define FMT_POSIX_CALL(call) ::call
+# endif
+#endif
+
+#if FMT_GCC_VERSION >= 407
+# define FMT_UNUSED __attribute__((unused))
+#else
+# define FMT_UNUSED
+#endif
+
+#if FMT_USE_STATIC_ASSERT || FMT_HAS_CPP_ATTRIBUTE(cxx_static_assert) || \
+  (FMT_GCC_VERSION >= 403 && FMT_HAS_GXX_CXX11) || _MSC_VER >= 1600
+# define FMT_STATIC_ASSERT(cond, message) static_assert(cond, message)
+#else
+# define FMT_CONCAT_(a, b) FMT_CONCAT(a, b)
+# define FMT_STATIC_ASSERT(cond, message) \
+  typedef int FMT_CONCAT_(Assert, __LINE__)[(cond) ? 1 : -1] FMT_UNUSED
+#endif
+
+// Retries the expression while it evaluates to error_result and errno
+// equals to EINTR.
+#ifndef _WIN32
+# define FMT_RETRY_VAL(result, expression, error_result) \
+  do { \
+    result = (expression); \
+  } while (result == error_result && errno == EINTR)
+#else
+# define FMT_RETRY_VAL(result, expression, error_result) result = (expression)
+#endif
+
+#define FMT_RETRY(result, expression) FMT_RETRY_VAL(result, expression, -1)
+
+namespace fmt {
+
+// An error code.
+class ErrorCode {
+ private:
+  int value_;
+
+ public:
+  explicit ErrorCode(int value = 0) FMT_NOEXCEPT : value_(value) {}
+
+  int get() const FMT_NOEXCEPT { return value_; }
+};
+
+// A buffered file.
+class BufferedFile {
+ private:
+  FILE *file_;
+
+  friend class File;
+
+  explicit BufferedFile(FILE *f) : file_(f) {}
+
+ public:
+  // Constructs a BufferedFile object which doesn't represent any file.
+  BufferedFile() FMT_NOEXCEPT : file_(0) {}
+
+  // Destroys the object closing the file it represents if any.
+  ~BufferedFile() FMT_NOEXCEPT;
+
+#if !FMT_USE_RVALUE_REFERENCES
+  // Emulate a move constructor and a move assignment operator if rvalue
+  // references are not supported.
+
+ private:
+  // A proxy object to emulate a move constructor.
+  // It is private to make it impossible call operator Proxy directly.
+  struct Proxy {
+    FILE *file;
+  };
+
+public:
+  // A "move constructor" for moving from a temporary.
+  BufferedFile(Proxy p) FMT_NOEXCEPT : file_(p.file) {}
+
+  // A "move constructor" for for moving from an lvalue.
+  BufferedFile(BufferedFile &f) FMT_NOEXCEPT : file_(f.file_) {
+    f.file_ = 0;
+  }
+
+  // A "move assignment operator" for moving from a temporary.
+  BufferedFile &operator=(Proxy p) {
+    close();
+    file_ = p.file;
+    return *this;
+  }
+
+  // A "move assignment operator" for moving from an lvalue.
+  BufferedFile &operator=(BufferedFile &other) {
+    close();
+    file_ = other.file_;
+    other.file_ = 0;
+    return *this;
+  }
+
+  // Returns a proxy object for moving from a temporary:
+  //   BufferedFile file = BufferedFile(...);
+  operator Proxy() FMT_NOEXCEPT {
+    Proxy p = {file_};
+    file_ = 0;
+    return p;
+  }
+
+#else
+ private:
+  FMT_DISALLOW_COPY_AND_ASSIGN(BufferedFile);
+
+ public:
+  BufferedFile(BufferedFile &&other) FMT_NOEXCEPT : file_(other.file_) {
+    other.file_ = 0;
+  }
+
+  BufferedFile& operator=(BufferedFile &&other) {
+    close();
+    file_ = other.file_;
+    other.file_ = 0;
+    return *this;
+  }
+#endif
+
+  // Opens a file.
+  BufferedFile(fmt::StringRef filename, fmt::StringRef mode);
+
+  // Closes the file.
+  void close();
+
+  // Returns the pointer to a FILE object representing this file.
+  FILE *get() const FMT_NOEXCEPT { return file_; }
+
+  // We place parentheses around fileno to workaround a bug in some versions
+  // of MinGW that define fileno as a macro.
+  int (fileno)() const;
+
+  void print(fmt::StringRef format_str, const ArgList &args) {
+    fmt::print(file_, format_str, args);
+  }
+  FMT_VARIADIC(void, print, fmt::StringRef)
+};
+
+// A file. Closed file is represented by a File object with descriptor -1.
+// Methods that are not declared with FMT_NOEXCEPT may throw
+// fmt::SystemError in case of failure. Note that some errors such as
+// closing the file multiple times will cause a crash on Windows rather
+// than an exception. You can get standard behavior by overriding the
+// invalid parameter handler with _set_invalid_parameter_handler.
+class File {
+ private:
+  int fd_;  // File descriptor.
+
+  // Constructs a File object with a given descriptor.
+  explicit File(int fd) : fd_(fd) {}
+
+ public:
+  // Possible values for the oflag argument to the constructor.
+  enum {
+    RDONLY = FMT_POSIX(O_RDONLY), // Open for reading only.
+    WRONLY = FMT_POSIX(O_WRONLY), // Open for writing only.
+    RDWR   = FMT_POSIX(O_RDWR)    // Open for reading and writing.
+  };
+
+  // Constructs a File object which doesn't represent any file.
+  File() FMT_NOEXCEPT : fd_(-1) {}
+
+  // Opens a file and constructs a File object representing this file.
+  File(fmt::StringRef path, int oflag);
+
+#if !FMT_USE_RVALUE_REFERENCES
+  // Emulate a move constructor and a move assignment operator if rvalue
+  // references are not supported.
+
+ private:
+  // A proxy object to emulate a move constructor.
+  // It is private to make it impossible call operator Proxy directly.
+  struct Proxy {
+    int fd;
+  };
+
+ public:
+  // A "move constructor" for moving from a temporary.
+  File(Proxy p) FMT_NOEXCEPT : fd_(p.fd) {}
+
+  // A "move constructor" for for moving from an lvalue.
+  File(File &other) FMT_NOEXCEPT : fd_(other.fd_) {
+    other.fd_ = -1;
+  }
+
+  // A "move assignment operator" for moving from a temporary.
+  File &operator=(Proxy p) {
+    close();
+    fd_ = p.fd;
+    return *this;
+  }
+
+  // A "move assignment operator" for moving from an lvalue.
+  File &operator=(File &other) {
+    close();
+    fd_ = other.fd_;
+    other.fd_ = -1;
+    return *this;
+  }
+
+  // Returns a proxy object for moving from a temporary:
+  //   File file = File(...);
+  operator Proxy() FMT_NOEXCEPT {
+    Proxy p = {fd_};
+    fd_ = -1;
+    return p;
+  }
+
+#else
+ private:
+  FMT_DISALLOW_COPY_AND_ASSIGN(File);
+
+ public:
+  File(File &&other) FMT_NOEXCEPT : fd_(other.fd_) {
+    other.fd_ = -1;
+  }
+
+  File& operator=(File &&other) {
+    close();
+    fd_ = other.fd_;
+    other.fd_ = -1;
+    return *this;
+  }
+#endif
+
+  // Destroys the object closing the file it represents if any.
+  ~File() FMT_NOEXCEPT;
+
+  // Returns the file descriptor.
+  int descriptor() const FMT_NOEXCEPT { return fd_; }
+
+  // Closes the file.
+  void close();
+
+  // Returns the file size.
+  fmt::LongLong size() const;
+
+  // Attempts to read count bytes from the file into the specified buffer.
+  std::size_t read(void *buffer, std::size_t count);
+
+  // Attempts to write count bytes from the specified buffer to the file.
+  std::size_t write(const void *buffer, std::size_t count);
+
+  // Duplicates a file descriptor with the dup function and returns
+  // the duplicate as a file object.
+  static File dup(int fd);
+
+  // Makes fd be the copy of this file descriptor, closing fd first if
+  // necessary.
+  void dup2(int fd);
+
+  // Makes fd be the copy of this file descriptor, closing fd first if
+  // necessary.
+  void dup2(int fd, ErrorCode &ec) FMT_NOEXCEPT;
+
+  // Creates a pipe setting up read_end and write_end file objects for reading
+  // and writing respectively.
+  static void pipe(File &read_end, File &write_end);
+
+  // Creates a BufferedFile object associated with this file and detaches
+  // this File object from the file.
+  BufferedFile fdopen(const char *mode);
+};
+
+// Returns the memory page size.
+long getpagesize();
+}  // namespace fmt
+
+#if !FMT_USE_RVALUE_REFERENCES
+namespace std {
+// For compatibility with C++98.
+inline fmt::BufferedFile &move(fmt::BufferedFile &f) { return f; }
+inline fmt::File &move(fmt::File &f) { return f; }
+}
+#endif
+
+#endif  // FMT_POSIX_H_
diff --git a/include/readerwriterqueue.h b/include/readerwriterqueue.h
new file mode 100644
index 0000000..1451e05
--- /dev/null
+++ b/include/readerwriterqueue.h
@@ -0,0 +1,630 @@
+// ©2013-2015 Cameron Desrochers.
+// Distributed under the simplified BSD license (see the license file that
+// should have come with this header).
+
+#pragma once
+
+#include "atomicops.h"
+#include <type_traits>
+#include <utility>
+#include <cassert>
+#include <stdexcept>
+#include <cstdint>
+#include <cstdlib>		// For malloc/free & size_t
+
+
+// A lock-free queue for a single-consumer, single-producer architecture.
+// The queue is also wait-free in the common path (except if more memory
+// needs to be allocated, in which case malloc is called).
+// Allocates memory sparingly (O(lg(n) times, amortized), and only once if
+// the original maximum size estimate is never exceeded.
+// Tested on x86/x64 processors, but semantics should be correct for all
+// architectures (given the right implementations in atomicops.h), provided
+// that aligned integer and pointer accesses are naturally atomic.
+// Note that there should only be one consumer thread and producer thread;
+// Switching roles of the threads, or using multiple consecutive threads for
+// one role, is not safe unless properly synchronized.
+// Using the queue exclusively from one thread is fine, though a bit silly.
+
+#define CACHE_LINE_SIZE 64
+
+#ifdef AE_VCPP
+#pragma warning(push)
+#pragma warning(disable: 4324)	// structure was padded due to __declspec(align())
+#pragma warning(disable: 4820)	// padding was added
+#pragma warning(disable: 4127)	// conditional expression is constant
+#endif
+
+namespace moodycamel {
+
+template<typename T, size_t MAX_BLOCK_SIZE = 512>
+class ReaderWriterQueue
+{
+	// Design: Based on a queue-of-queues. The low-level queues are just
+	// circular buffers with front and tail indices indicating where the
+	// next element to dequeue is and where the next element can be enqueued,
+	// respectively. Each low-level queue is called a "block". Each block
+	// wastes exactly one element's worth of space to keep the design simple
+	// (if front == tail then the queue is empty, and can't be full).
+	// The high-level queue is a circular linked list of blocks; again there
+	// is a front and tail, but this time they are pointers to the blocks.
+	// The front block is where the next element to be dequeued is, provided
+	// the block is not empty. The back block is where elements are to be
+	// enqueued, provided the block is not full.
+	// The producer thread owns all the tail indices/pointers. The consumer
+	// thread owns all the front indices/pointers. Both threads read each
+	// other's variables, but only the owning thread updates them. E.g. After
+	// the consumer reads the producer's tail, the tail may change before the
+	// consumer is done dequeuing an object, but the consumer knows the tail
+	// will never go backwards, only forwards.
+	// If there is no room to enqueue an object, an additional block (of
+	// equal size to the last block) is added. Blocks are never removed.
+
+public:
+	// Constructs a queue that can hold maxSize elements without further
+	// allocations. If more than MAX_BLOCK_SIZE elements are requested,
+	// then several blocks of MAX_BLOCK_SIZE each are reserved (including
+	// at least one extra buffer block).
+	explicit ReaderWriterQueue(size_t maxSize = 15)
+#ifndef NDEBUG
+		: enqueuing(false)
+		,dequeuing(false)
+#endif
+	{
+		assert(maxSize > 0);
+		assert(MAX_BLOCK_SIZE == ceilToPow2(MAX_BLOCK_SIZE) && "MAX_BLOCK_SIZE must be a power of 2");
+		assert(MAX_BLOCK_SIZE >= 2 && "MAX_BLOCK_SIZE must be at least 2");
+		
+		Block* firstBlock = nullptr;
+		
+		largestBlockSize = ceilToPow2(maxSize + 1);		// We need a spare slot to fit maxSize elements in the block
+		if (largestBlockSize > MAX_BLOCK_SIZE * 2) {
+			// We need a spare block in case the producer is writing to a different block the consumer is reading from, and
+			// wants to enqueue the maximum number of elements. We also need a spare element in each block to avoid the ambiguity
+			// between front == tail meaning "empty" and "full".
+			// So the effective number of slots that are guaranteed to be usable at any time is the block size - 1 times the
+			// number of blocks - 1. Solving for maxSize and applying a ceiling to the division gives us (after simplifying):
+			size_t initialBlockCount = (maxSize + MAX_BLOCK_SIZE * 2 - 3) / (MAX_BLOCK_SIZE - 1);
+			largestBlockSize = MAX_BLOCK_SIZE;
+			Block* lastBlock = nullptr;
+			for (size_t i = 0; i != initialBlockCount; ++i) {
+				auto block = make_block(largestBlockSize);
+				if (block == nullptr) {
+					throw std::bad_alloc();
+				}
+				if (firstBlock == nullptr) {
+					firstBlock = block;
+				}
+				else {
+					lastBlock->next = block;
+				}
+				lastBlock = block;
+				block->next = firstBlock;
+			}
+		}
+		else {
+			firstBlock = make_block(largestBlockSize);
+			if (firstBlock == nullptr) {
+				throw std::bad_alloc();
+			}
+			firstBlock->next = firstBlock;
+		}
+		frontBlock = firstBlock;
+		tailBlock = firstBlock;
+		
+		// Make sure the reader/writer threads will have the initialized memory setup above:
+		fence(memory_order_sync);
+	}
+
+	// Note: The queue should not be accessed concurrently while it's
+	// being deleted. It's up to the user to synchronize this.
+	~ReaderWriterQueue()
+	{
+		// Make sure we get the latest version of all variables from other CPUs:
+		fence(memory_order_sync);
+
+		// Destroy any remaining objects in queue and free memory
+		Block* frontBlock_ = frontBlock;
+		Block* block = frontBlock_;
+		do {
+			Block* nextBlock = block->next;
+			size_t blockFront = block->front;
+			size_t blockTail = block->tail;
+
+			for (size_t i = blockFront; i != blockTail; i = (i + 1) & block->sizeMask) {
+				auto element = reinterpret_cast<T*>(block->data + i * sizeof(T));
+				element->~T();
+				(void)element;
+			}
+			
+			auto rawBlock = block->rawThis;
+			block->~Block();
+			std::free(rawBlock);
+			block = nextBlock;
+		} while (block != frontBlock_);
+	}
+
+
+	// Enqueues a copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T const& element)
+	{
+		return inner_enqueue<CannotAlloc>(element);
+	}
+
+	// Enqueues a moved copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T&& element)
+	{
+		return inner_enqueue<CannotAlloc>(std::forward<T>(element));
+	}
+
+
+	// Enqueues a copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T const& element)
+	{
+		return inner_enqueue<CanAlloc>(element);
+	}
+
+	// Enqueues a moved copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T&& element)
+	{
+		return inner_enqueue<CanAlloc>(std::forward<T>(element));
+	}
+
+
+	// Attempts to dequeue an element; if the queue is empty,
+	// returns false instead. If the queue has at least one element,
+	// moves front to result using operator=, then returns true.
+	template<typename U>
+	bool try_dequeue(U& result)
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+
+		// High-level pseudocode:
+		// Remember where the tail block is
+		// If the front block has an element in it, dequeue it
+		// Else
+		//     If front block was the tail block when we entered the function, return false
+		//     Else advance to next block and dequeue the item there
+
+		// Note that we have to use the value of the tail block from before we check if the front
+		// block is full or not, in case the front block is empty and then, before we check if the
+		// tail block is at the front block or not, the producer fills up the front block *and
+		// moves on*, which would make us skip a filled block. Seems unlikely, but was consistently
+		// reproducible in practice.
+		// In order to avoid overhead in the common case, though, we do a double-checked pattern
+		// where we have the fast path if the front block is not empty, then read the tail block,
+		// then re-read the front block and check if it's not empty again, then check if the tail
+		// block has advanced.
+		
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+			
+		non_empty_front_block:
+			// Front block not empty, dequeue from here
+			auto element = reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
+			result = std::move(*element);
+			element->~T();
+
+			blockFront = (blockFront + 1) & frontBlock_->sizeMask;
+
+			fence(memory_order_release);
+			frontBlock_->front = blockFront;
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {
+				// Oh look, the front block isn't empty after all
+				goto non_empty_front_block;
+			}
+			
+			// Front block is empty but there's another block ahead, advance to it
+			Block* nextBlock = frontBlock_->next;
+			// Don't need an acquire fence here since next can only ever be set on the tailBlock,
+			// and we're not the tailBlock, and we did an acquire earlier after reading tailBlock which
+			// ensures next is up-to-date on this CPU in case we recently were at tailBlock.
+
+			size_t nextBlockFront = nextBlock->front.load();
+			size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load();
+			fence(memory_order_acquire);
+
+			// Since the tailBlock is only ever advanced after being written to,
+			// we know there's for sure an element to dequeue on it
+			assert(nextBlockFront != nextBlockTail);
+			AE_UNUSED(nextBlockTail);
+
+			// We're done with this block, let the producer use it if it needs
+			fence(memory_order_release);		// Expose possibly pending changes to frontBlock->front from last dequeue
+			frontBlock = frontBlock_ = nextBlock;
+
+			compiler_fence(memory_order_release);	// Not strictly needed
+
+			auto element = reinterpret_cast<T*>(frontBlock_->data + nextBlockFront * sizeof(T));
+			
+			result = std::move(*element);
+			element->~T();
+
+			nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask;
+			
+			fence(memory_order_release);
+			frontBlock_->front = nextBlockFront;
+		}
+		else {
+			// No elements in current block and no other block to advance to
+			return false;
+		}
+
+		return true;
+	}
+
+
+	// Returns a pointer to the front element in the queue (the one that
+	// would be removed next by a call to `try_dequeue` or `pop`). If the
+	// queue appears empty at the time the method is called, nullptr is
+	// returned instead.
+	// Must be called only from the consumer thread.
+	T* peek()
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+		// See try_dequeue() for reasoning
+
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+		non_empty_front_block:
+			return reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {
+				goto non_empty_front_block;
+			}
+			
+			Block* nextBlock = frontBlock_->next;
+			
+			size_t nextBlockFront = nextBlock->front.load();
+			fence(memory_order_acquire);
+
+			assert(nextBlockFront != nextBlock->tail.load());
+			return reinterpret_cast<T*>(nextBlock->data + nextBlockFront * sizeof(T));
+		}
+		
+		return nullptr;
+	}
+	
+	// Removes the front element from the queue, if any, without returning it.
+	// Returns true on success, or false if the queue appeared empty at the time
+	// `pop` was called.
+	bool pop()
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+		// See try_dequeue() for reasoning
+		
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+			
+		non_empty_front_block:
+			auto element = reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
+			element->~T();
+
+			blockFront = (blockFront + 1) & frontBlock_->sizeMask;
+
+			fence(memory_order_release);
+			frontBlock_->front = blockFront;
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {
+				goto non_empty_front_block;
+			}
+			
+			// Front block is empty but there's another block ahead, advance to it
+			Block* nextBlock = frontBlock_->next;
+			
+			size_t nextBlockFront = nextBlock->front.load();
+			size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load();
+			fence(memory_order_acquire);
+
+			assert(nextBlockFront != nextBlockTail);
+			AE_UNUSED(nextBlockTail);
+
+			fence(memory_order_release);
+			frontBlock = frontBlock_ = nextBlock;
+
+			compiler_fence(memory_order_release);
+
+			auto element = reinterpret_cast<T*>(frontBlock_->data + nextBlockFront * sizeof(T));
+			element->~T();
+
+			nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask;
+			
+			fence(memory_order_release);
+			frontBlock_->front = nextBlockFront;
+		}
+		else {
+			// No elements in current block and no other block to advance to
+			return false;
+		}
+
+		return true;
+	}
+	
+	// Returns the approximate number of items currently in the queue.
+	// Safe to call from both the producer and consumer threads.
+	inline size_t size_approx() const
+	{
+		size_t result = 0;
+		Block* frontBlock_ = frontBlock.load();
+		Block* block = frontBlock_;
+		do {
+			fence(memory_order_acquire);
+			size_t blockFront = block->front.load();
+			size_t blockTail = block->tail.load();
+			result += (blockTail - blockFront) & block->sizeMask;
+			block = block->next.load();
+		} while (block != frontBlock_);
+		return result;
+	}
+
+
+private:
+	enum AllocationMode { CanAlloc, CannotAlloc };
+
+	template<AllocationMode canAlloc, typename U>
+	bool inner_enqueue(U&& element)
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->enqueuing);
+#endif
+
+		// High-level pseudocode (assuming we're allowed to alloc a new block):
+		// If room in tail block, add to tail
+		// Else check next block
+		//     If next block is not the head block, enqueue on next block
+		//     Else create a new block and enqueue there
+		//     Advance tail to the block we just enqueued to
+
+		Block* tailBlock_ = tailBlock.load();
+		size_t blockFront = tailBlock_->localFront;
+		size_t blockTail = tailBlock_->tail.load();
+
+		size_t nextBlockTail = (blockTail + 1) & tailBlock_->sizeMask;
+		if (nextBlockTail != blockFront || nextBlockTail != (tailBlock_->localFront = tailBlock_->front.load())) {
+			fence(memory_order_acquire);
+			// This block has room for at least one more element
+			char* location = tailBlock_->data + blockTail * sizeof(T);
+			new (location) T(std::forward<U>(element));
+
+			fence(memory_order_release);
+			tailBlock_->tail = nextBlockTail;
+		}
+		else {
+			fence(memory_order_acquire);
+			if (tailBlock_->next.load() != frontBlock) {
+				// Note that the reason we can't advance to the frontBlock and start adding new entries there
+				// is because if we did, then dequeue would stay in that block, eventually reading the new values,
+				// instead of advancing to the next full block (whose values were enqueued first and so should be
+				// consumed first).
+				
+				fence(memory_order_acquire);		// Ensure we get latest writes if we got the latest frontBlock
+
+				// tailBlock is full, but there's a free block ahead, use it
+				Block* tailBlockNext = tailBlock_->next.load();
+				size_t nextBlockFront = tailBlockNext->localFront = tailBlockNext->front.load();
+				nextBlockTail = tailBlockNext->tail.load();
+				fence(memory_order_acquire);
+
+				// This block must be empty since it's not the head block and we
+				// go through the blocks in a circle
+				assert(nextBlockFront == nextBlockTail);
+				tailBlockNext->localFront = nextBlockFront;
+
+				char* location = tailBlockNext->data + nextBlockTail * sizeof(T);
+				new (location) T(std::forward<U>(element));
+
+				tailBlockNext->tail = (nextBlockTail + 1) & tailBlockNext->sizeMask;
+
+				fence(memory_order_release);
+				tailBlock = tailBlockNext;
+			}
+			else if (canAlloc == CanAlloc) {
+				// tailBlock is full and there's no free block ahead; create a new block
+				auto newBlockSize = largestBlockSize >= MAX_BLOCK_SIZE ? largestBlockSize : largestBlockSize * 2;
+				auto newBlock = make_block(newBlockSize);
+				if (newBlock == nullptr) {
+					// Could not allocate a block!
+					return false;
+				}
+				largestBlockSize = newBlockSize;
+
+				new (newBlock->data) T(std::forward<U>(element));
+
+				assert(newBlock->front == 0);
+				newBlock->tail = newBlock->localTail = 1;
+
+				newBlock->next = tailBlock_->next.load();
+				tailBlock_->next = newBlock;
+
+				// Might be possible for the dequeue thread to see the new tailBlock->next
+				// *without* seeing the new tailBlock value, but this is OK since it can't
+				// advance to the next block until tailBlock is set anyway (because the only
+				// case where it could try to read the next is if it's already at the tailBlock,
+				// and it won't advance past tailBlock in any circumstance).
+				
+				fence(memory_order_release);
+				tailBlock = newBlock;
+			}
+			else if (canAlloc == CannotAlloc) {
+				// Would have had to allocate a new block to enqueue, but not allowed
+				return false;
+			}
+			else {
+				assert(false && "Should be unreachable code");
+				return false;
+			}
+		}
+
+		return true;
+	}
+
+
+	// Disable copying
+	ReaderWriterQueue(ReaderWriterQueue const&) {  }
+
+	// Disable assignment
+	ReaderWriterQueue& operator=(ReaderWriterQueue const&) {  }
+
+
+
+	AE_FORCEINLINE static size_t ceilToPow2(size_t x)
+	{
+		// From http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		--x;
+		x |= x >> 1;
+		x |= x >> 2;
+		x |= x >> 4;
+		for (size_t i = 1; i < sizeof(size_t); i <<= 1) {
+			x |= x >> (i << 3);
+		}
+		++x;
+		return x;
+	}
+	
+	template<typename U>
+	static AE_FORCEINLINE char* align_for(char* ptr)
+	{
+		const std::size_t alignment = std::alignment_of<U>::value;
+		return ptr + (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) % alignment;
+	}
+private:
+#ifndef NDEBUG
+	struct ReentrantGuard
+	{
+		ReentrantGuard(bool& _inSection)
+			: inSection(_inSection)
+		{
+			assert(!inSection);
+			if (inSection) {
+				throw std::runtime_error("ReaderWriterQueue does not support enqueuing or dequeuing elements from other elements' ctors and dtors");
+			}
+
+			inSection = true;
+		}
+
+		~ReentrantGuard() { inSection = false; }
+
+	private:
+		ReentrantGuard& operator=(ReentrantGuard const&);
+
+	private:
+		bool& inSection;
+	};
+#endif
+
+	struct Block
+	{
+		// Avoid false-sharing by putting highly contended variables on their own cache lines
+		weak_atomic<size_t> front;	// (Atomic) Elements are read from here
+		size_t localTail;			// An uncontended shadow copy of tail, owned by the consumer
+		
+		char cachelineFiller0[CACHE_LINE_SIZE - sizeof(weak_atomic<size_t>) - sizeof(size_t)];
+		weak_atomic<size_t> tail;	// (Atomic) Elements are enqueued here
+		size_t localFront;
+		
+		char cachelineFiller1[CACHE_LINE_SIZE - sizeof(weak_atomic<size_t>) - sizeof(size_t)];	// next isn't very contended, but we don't want it on the same cache line as tail (which is)
+		weak_atomic<Block*> next;	// (Atomic)
+		
+		char* data;		// Contents (on heap) are aligned to T's alignment
+
+		const size_t sizeMask;
+
+
+		// size must be a power of two (and greater than 0)
+		Block(size_t const& _size, char* _rawThis, char* _data)
+			: front(0), localTail(0), tail(0), localFront(0), next(nullptr), data(_data), sizeMask(_size - 1), rawThis(_rawThis)
+		{
+		}
+
+	private:
+		// C4512 - Assignment operator could not be generated
+		Block& operator=(Block const&);
+
+	public:
+		char* rawThis;
+	};
+	
+	
+	static Block* make_block(size_t capacity)
+	{
+		// Allocate enough memory for the block itself, as well as all the elements it will contain
+		auto size = sizeof(Block) + std::alignment_of<Block>::value - 1;
+		size += sizeof(T) * capacity + std::alignment_of<T>::value - 1;
+		auto newBlockRaw = static_cast<char*>(std::malloc(size));
+		if (newBlockRaw == nullptr) {
+			return nullptr;
+		}
+		
+		auto newBlockAligned = align_for<Block>(newBlockRaw);
+		auto newBlockData = align_for<T>(newBlockAligned + sizeof(Block));
+		return new (newBlockAligned) Block(capacity, newBlockRaw, newBlockData);
+	}
+
+private:
+	weak_atomic<Block*> frontBlock;		// (Atomic) Elements are enqueued to this block
+	
+	char cachelineFiller[CACHE_LINE_SIZE - sizeof(weak_atomic<Block*>)];
+	weak_atomic<Block*> tailBlock;		// (Atomic) Elements are dequeued from this block
+
+	size_t largestBlockSize;
+
+#ifndef NDEBUG
+	bool enqueuing;
+	bool dequeuing;
+#endif
+};
+
+}    // end namespace moodycamel
+
+#ifdef AE_VCPP
+#pragma warning(pop)
+#endif
diff --git a/include/safe_btree.h b/include/safe_btree.h
new file mode 100755
index 0000000..2d85c70
--- /dev/null
+++ b/include/safe_btree.h
@@ -0,0 +1,395 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// A safe_btree<> wraps around a btree<> and removes the caveat that insertion
+// and deletion invalidate iterators. A safe_btree<> maintains a generation
+// number that is incremented on every mutation. A safe_btree<>::iterator keeps
+// a pointer to the safe_btree<> it came from, the generation of the tree when
+// it was last validated and the key the underlying btree<>::iterator points
+// to. If an iterator is accessed and its generation differs from the tree
+// generation it is revalidated.
+//
+// References and pointers returned by safe_btree iterators are not safe.
+//
+// See the incorrect usage examples mentioned in safe_btree_set.h and
+// safe_btree_map.h.
+
+#ifndef UTIL_BTREE_SAFE_BTREE_H__
+#define UTIL_BTREE_SAFE_BTREE_H__
+
+#include <stddef.h>
+#include <iosfwd>
+#include <utility>
+
+#include "btree.h"
+
+namespace btree {
+
+template <typename Tree, typename Iterator>
+class safe_btree_iterator {
+ public:
+  typedef typename Iterator::key_type key_type;
+  typedef typename Iterator::value_type value_type;
+  typedef typename Iterator::size_type size_type;
+  typedef typename Iterator::difference_type difference_type;
+  typedef typename Iterator::pointer pointer;
+  typedef typename Iterator::reference reference;
+  typedef typename Iterator::const_pointer const_pointer;
+  typedef typename Iterator::const_reference const_reference;
+  typedef typename Iterator::iterator_category iterator_category;
+  typedef typename Tree::iterator iterator;
+  typedef typename Tree::const_iterator const_iterator;
+  typedef safe_btree_iterator<Tree, Iterator> self_type;
+
+  void update() const {
+    if (iter_ != tree_->internal_btree()->end()) {
+      // A positive generation indicates a valid key.
+      generation_ = tree_->generation();
+      key_ = iter_.key();
+    } else {
+      // Use a negative generation to indicate iter_ points to end().
+      generation_ = -tree_->generation();
+    }
+  }
+
+ public:
+  safe_btree_iterator()
+      : generation_(0),
+        key_(),
+        iter_(),
+        tree_(NULL) {
+  }
+  safe_btree_iterator(const iterator &x)
+      : generation_(x.generation()),
+        key_(x.key()),
+        iter_(x.iter()),
+        tree_(x.tree()) {
+  }
+  safe_btree_iterator(Tree *tree, const Iterator &iter)
+      : generation_(),
+        key_(),
+        iter_(iter),
+        tree_(tree) {
+    update();
+  }
+
+  Tree* tree() const { return tree_; }
+  int64_t generation() const { return generation_; }
+
+  Iterator* mutable_iter() const {
+    if (generation_ != tree_->generation()) {
+      if (generation_ > 0) {
+        // This does the wrong thing for a multi{set,map}. If my iter was
+        // pointing to the 2nd of 2 values with the same key, then this will
+        // reset it to point to the first. This is why we don't provide a
+        // safe_btree_multi{set,map}.
+        iter_ = tree_->internal_btree()->lower_bound(key_);
+        update();
+      } else if (-generation_ != tree_->generation()) {
+        iter_ = tree_->internal_btree()->end();
+        generation_ = -tree_->generation();
+      }
+    }
+    return &iter_;
+  }
+  const Iterator& iter() const {
+    return *mutable_iter();
+  }
+
+  // Equality/inequality operators.
+  bool operator==(const const_iterator &x) const {
+    return iter() == x.iter();
+  }
+  bool operator!=(const const_iterator &x) const {
+    return iter() != x.iter();
+  }
+
+  // Accessors for the key/value the iterator is pointing at.
+  const key_type& key() const {
+    return key_;
+  }
+  // This reference value is potentially invalidated by any non-const
+  // method on the tree; it is NOT safe.
+  reference operator*() const {
+    assert(generation_ > 0);
+    return iter().operator*();
+  }
+  // This pointer value is potentially invalidated by any non-const
+  // method on the tree; it is NOT safe.
+  pointer operator->() const {
+    assert(generation_ > 0);
+    return iter().operator->();
+  }
+
+  // Increment/decrement operators.
+  self_type& operator++() {
+    ++(*mutable_iter());
+    update();
+    return *this;
+  }
+  self_type& operator--() {
+    --(*mutable_iter());
+    update();
+    return *this;
+  }
+  self_type operator++(int) {
+    self_type tmp = *this;
+    ++*this;
+    return tmp;
+  }
+  self_type operator--(int) {
+    self_type tmp = *this;
+    --*this;
+    return tmp;
+  }
+
+ private:
+  // The generation of the tree when "iter" was updated.
+  mutable int64_t generation_;
+  // The key the iterator points to.
+  mutable key_type key_;
+  // The underlying iterator.
+  mutable Iterator iter_;
+  // The tree the iterator is associated with.
+  Tree *tree_;
+};
+
+template <typename Params>
+class safe_btree {
+  typedef safe_btree<Params> self_type;
+
+  typedef btree<Params> btree_type;
+  typedef typename btree_type::iterator tree_iterator;
+  typedef typename btree_type::const_iterator tree_const_iterator;
+
+ public:
+  typedef typename btree_type::params_type params_type;
+  typedef typename btree_type::key_type key_type;
+  typedef typename btree_type::data_type data_type;
+  typedef typename btree_type::mapped_type mapped_type;
+  typedef typename btree_type::value_type value_type;
+  typedef typename btree_type::key_compare key_compare;
+  typedef typename btree_type::allocator_type allocator_type;
+  typedef typename btree_type::pointer pointer;
+  typedef typename btree_type::const_pointer const_pointer;
+  typedef typename btree_type::reference reference;
+  typedef typename btree_type::const_reference const_reference;
+  typedef typename btree_type::size_type size_type;
+  typedef typename btree_type::difference_type difference_type;
+  typedef safe_btree_iterator<self_type, tree_iterator> iterator;
+  typedef safe_btree_iterator<
+    const self_type, tree_const_iterator> const_iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+
+ public:
+  // Default constructor.
+  safe_btree(const key_compare &comp, const allocator_type &alloc)
+      : tree_(comp, alloc),
+        generation_(1) {
+  }
+
+  // Copy constructor.
+  safe_btree(const self_type &x)
+      : tree_(x.tree_),
+        generation_(1) {
+  }
+
+  iterator begin() {
+    return iterator(this, tree_.begin());
+  }
+  const_iterator begin() const {
+    return const_iterator(this, tree_.begin());
+  }
+  iterator end() {
+    return iterator(this, tree_.end());
+  }
+  const_iterator end() const {
+    return const_iterator(this, tree_.end());
+  }
+  reverse_iterator rbegin() {
+    return reverse_iterator(end());
+  }
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
+  }
+  reverse_iterator rend() {
+    return reverse_iterator(begin());
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
+  }
+
+  // Lookup routines.
+  iterator lower_bound(const key_type &key) {
+    return iterator(this, tree_.lower_bound(key));
+  }
+  const_iterator lower_bound(const key_type &key) const {
+    return const_iterator(this, tree_.lower_bound(key));
+  }
+  iterator upper_bound(const key_type &key) {
+    return iterator(this, tree_.upper_bound(key));
+  }
+  const_iterator upper_bound(const key_type &key) const {
+    return const_iterator(this, tree_.upper_bound(key));
+  }
+  std::pair<iterator, iterator> equal_range(const key_type &key) {
+    std::pair<tree_iterator, tree_iterator> p = tree_.equal_range(key);
+    return std::make_pair(iterator(this, p.first),
+                     iterator(this, p.second));
+  }
+  std::pair<const_iterator, const_iterator> equal_range(const key_type &key) const {
+    std::pair<tree_const_iterator, tree_const_iterator> p = tree_.equal_range(key);
+    return std::make_pair(const_iterator(this, p.first),
+                     const_iterator(this, p.second));
+  }
+  iterator find_unique(const key_type &key) {
+    return iterator(this, tree_.find_unique(key));
+  }
+  const_iterator find_unique(const key_type &key) const {
+    return const_iterator(this, tree_.find_unique(key));
+  }
+  iterator find_multi(const key_type &key) {
+    return iterator(this, tree_.find_multi(key));
+  }
+  const_iterator find_multi(const key_type &key) const {
+    return const_iterator(this, tree_.find_multi(key));
+  }
+  size_type count_unique(const key_type &key) const {
+    return tree_.count_unique(key);
+  }
+  size_type count_multi(const key_type &key) const {
+    return tree_.count_multi(key);
+  }
+
+  // Insertion routines.
+  template <typename ValuePointer>
+  std::pair<iterator, bool> insert_unique(const key_type &key, ValuePointer value) {
+    std::pair<tree_iterator, bool> p = tree_.insert_unique(key, value);
+    generation_ += p.second;
+    return std::make_pair(iterator(this, p.first), p.second);
+  }
+  std::pair<iterator, bool> insert_unique(const value_type &v) {
+    std::pair<tree_iterator, bool> p = tree_.insert_unique(v);
+    generation_ += p.second;
+    return std::make_pair(iterator(this, p.first), p.second);
+  }
+  iterator insert_unique(iterator position, const value_type &v) {
+    tree_iterator tree_pos = position.iter();
+    ++generation_;
+    return iterator(this, tree_.insert_unique(tree_pos, v));
+  }
+  template <typename InputIterator>
+  void insert_unique(InputIterator b, InputIterator e) {
+    for (; b != e; ++b) {
+      insert_unique(*b);
+    }
+  }
+  iterator insert_multi(const value_type &v) {
+    ++generation_;
+    return iterator(this, tree_.insert_multi(v));
+  }
+  iterator insert_multi(iterator position, const value_type &v) {
+    tree_iterator tree_pos = position.iter();
+    ++generation_;
+    return iterator(this, tree_.insert_multi(tree_pos, v));
+  }
+  template <typename InputIterator>
+  void insert_multi(InputIterator b, InputIterator e) {
+    for (; b != e; ++b) {
+      insert_multi(*b);
+    }
+  }
+  self_type& operator=(const self_type &x) {
+    if (&x == this) {
+      // Don't copy onto ourselves.
+      return *this;
+    }
+    ++generation_;
+    tree_ = x.tree_;
+    return *this;
+  }
+
+  // Deletion routines.
+  void erase(const iterator &begin, const iterator &end) {
+    tree_.erase(begin.iter(), end.iter());
+    ++generation_;
+  }
+  // Erase the specified iterator from the btree. The iterator must be valid
+  // (i.e. not equal to end()).  Return an iterator pointing to the node after
+  // the one that was erased (or end() if none exists).
+  iterator erase(iterator iter) {
+    tree_iterator res = tree_.erase(iter.iter());
+    ++generation_;
+    return iterator(this, res);
+  }
+  int erase_unique(const key_type &key) {
+    int res = tree_.erase_unique(key);
+    generation_ += res;
+    return res;
+  }
+  int erase_multi(const key_type &key) {
+    int res = tree_.erase_multi(key);
+    generation_ += res;
+    return res;
+  }
+
+  // Access to the underlying btree.
+  btree_type* internal_btree() { return &tree_; }
+  const btree_type* internal_btree() const { return &tree_; }
+
+  // Utility routines.
+  void clear() {
+    ++generation_;
+    tree_.clear();
+  }
+  void swap(self_type &x) {
+    ++generation_;
+    ++x.generation_;
+    tree_.swap(x.tree_);
+  }
+  void dump(std::ostream &os) const {
+    tree_.dump(os);
+  }
+  void verify() const {
+    tree_.verify();
+  }
+  int64_t generation() const {
+    return generation_;
+  }
+  key_compare key_comp() const { return tree_.key_comp(); }
+
+  // Size routines.
+  size_type size() const { return tree_.size(); }
+  size_type max_size() const { return tree_.max_size(); }
+  bool empty() const { return tree_.empty(); }
+  size_type height() const { return tree_.height(); }
+  size_type internal_nodes() const { return tree_.internal_nodes(); }
+  size_type leaf_nodes() const { return tree_.leaf_nodes(); }
+  size_type nodes() const { return tree_.nodes(); }
+  size_type bytes_used() const { return tree_.bytes_used(); }
+  static double average_bytes_per_value() {
+    return btree_type::average_bytes_per_value();
+  }
+  double fullness() const { return tree_.fullness(); }
+  double overhead() const { return tree_.overhead(); }
+
+ private:
+  btree_type tree_;
+  int64_t generation_;
+};
+
+}  // namespace btree
+
+#endif  // UTIL_BTREE_SAFE_BTREE_H__
diff --git a/include/safe_btree_map.h b/include/safe_btree_map.h
new file mode 100755
index 0000000..a0668f1
--- /dev/null
+++ b/include/safe_btree_map.h
@@ -0,0 +1,89 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// The safe_btree_map<> is like btree_map<> except that it removes the caveat
+// about insertion and deletion invalidating existing iterators at a small cost
+// in making iterators larger and slower.
+//
+// Revalidation occurs whenever an iterator is accessed.  References
+// and pointers returned by safe_btree_map<> iterators are not stable,
+// they are potentially invalidated by any non-const method on the map.
+//
+// BEGIN INCORRECT EXAMPLE
+//   for (auto i = safe_map->begin(); i != safe_map->end(); ++i) {
+//     const T *value = &i->second;  // DO NOT DO THIS
+//     [code that modifies safe_map and uses value];
+//   }
+// END INCORRECT EXAMPLE
+#ifndef UTIL_BTREE_SAFE_BTREE_MAP_H__
+#define UTIL_BTREE_SAFE_BTREE_MAP_H__
+
+#include <functional>
+#include <memory>
+#include <utility>
+
+#include "btree_container.h"
+#include "btree_map.h"
+#include "safe_btree.h"
+
+namespace btree {
+
+// The safe_btree_map class is needed mainly for its constructors.
+template <typename Key, typename Value,
+          typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<std::pair<const Key, Value> >,
+          int TargetNodeSize = 256>
+class safe_btree_map : public btree_map_container<
+  safe_btree<btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize> > > {
+
+  typedef safe_btree_map<Key, Value, Compare, Alloc, TargetNodeSize> self_type;
+  typedef btree_map_params<
+    Key, Value, Compare, Alloc, TargetNodeSize> params_type;
+  typedef safe_btree<params_type> btree_type;
+  typedef btree_map_container<btree_type> super_type;
+
+ public:
+  typedef typename btree_type::key_compare key_compare;
+  typedef typename btree_type::allocator_type allocator_type;
+
+ public:
+  // Default constructor.
+  safe_btree_map(const key_compare &comp = key_compare(),
+                 const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  safe_btree_map(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor.
+  template <class InputIterator>
+  safe_btree_map(InputIterator b, InputIterator e,
+                 const key_compare &comp = key_compare(),
+                 const allocator_type &alloc = allocator_type())
+      : super_type(b, e, comp, alloc) {
+  }
+};
+
+template <typename K, typename V, typename C, typename A, int N>
+inline void swap(safe_btree_map<K, V, C, A, N> &x,
+                 safe_btree_map<K, V, C, A, N> &y) {
+  x.swap(y);
+}
+
+} // namespace btree
+
+#endif  // UTIL_BTREE_SAFE_BTREE_MAP_H__
diff --git a/include/safe_btree_set.h b/include/safe_btree_set.h
new file mode 100755
index 0000000..a6cd541
--- /dev/null
+++ b/include/safe_btree_set.h
@@ -0,0 +1,88 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// The safe_btree_set<> is like btree_set<> except that it removes the caveat
+// about insertion and deletion invalidating existing iterators at a small cost
+// in making iterators larger and slower.
+//
+// Revalidation occurs whenever an iterator is accessed.  References
+// and pointers returned by safe_btree_map<> iterators are not stable,
+// they are potentially invalidated by any non-const method on the set.
+//
+// BEGIN INCORRECT EXAMPLE
+//   for (auto i = safe_set->begin(); i != safe_set->end(); ++i) {
+//     const T &value = *i;  // DO NOT DO THIS
+//     [code that modifies safe_set and uses value];
+//   }
+// END INCORRECT EXAMPLE
+
+#ifndef UTIL_BTREE_SAFE_BTREE_SET_H__
+#define UTIL_BTREE_SAFE_BTREE_SET_H__
+
+#include <functional>
+#include <memory>
+
+#include "btree_container.h"
+#include "btree_set.h"
+#include "safe_btree.h"
+
+namespace btree {
+
+// The safe_btree_set class is needed mainly for its constructors.
+template <typename Key,
+          typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<Key>,
+          int TargetNodeSize = 256>
+class safe_btree_set : public btree_unique_container<
+  safe_btree<btree_set_params<Key, Compare, Alloc, TargetNodeSize> > > {
+
+  typedef safe_btree_set<Key, Compare, Alloc, TargetNodeSize> self_type;
+  typedef btree_set_params<Key, Compare, Alloc, TargetNodeSize> params_type;
+  typedef safe_btree<params_type> btree_type;
+  typedef btree_unique_container<btree_type> super_type;
+
+ public:
+  typedef typename btree_type::key_compare key_compare;
+  typedef typename btree_type::allocator_type allocator_type;
+
+ public:
+  // Default constructor.
+  safe_btree_set(const key_compare &comp = key_compare(),
+                 const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  safe_btree_set(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor.
+  template <class InputIterator>
+  safe_btree_set(InputIterator b, InputIterator e,
+                 const key_compare &comp = key_compare(),
+                 const allocator_type &alloc = allocator_type())
+      : super_type(b, e, comp, alloc) {
+  }
+};
+
+template <typename K, typename C, typename A, int N>
+inline void swap(safe_btree_set<K, C, A, N> &x,
+                 safe_btree_set<K, C, A, N> &y) {
+  x.swap(y);
+}
+
+} // namespace btree
+
+#endif  // UTIL_BTREE_SAFE_BTREE_SET_H__
diff --git a/include/tensemble/BaseForest.h b/include/tensemble/BaseForest.h
new file mode 100644
index 0000000..3a235ec
--- /dev/null
+++ b/include/tensemble/BaseForest.h
@@ -0,0 +1,513 @@
+/* * * * *
+ *  BaseForest.h
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_Forest_h
+#define libTM_Forest_h
+#include "TreeRegressor.h"
+#include "TreeClassifier.h"
+#include "algorithm"
+#include "EvaluateMetric.h"
+#include "stdlib.h"
+#define FOREST_TREE_MAX_DEPTH 20
+
+boost::mutex RF_mutex;
+boost::mutex random_number_mutex;
+uint RF_n_trees_done;
+
+class BaseForest;
+
+
+class BaseForest {
+    //Base class for random forests
+    
+public:
+    BaseForest();
+    
+    BaseForest(int split_criterion,uint n_trees,uint n_features,uint max_depth,uint min_sample_leaf, REAL max_features_ratio,uint find_split_algorithm,bool bootstrap,bool oob,bool compute_importance,uint random_seed,uint n_jobs,bool verbose);
+    
+    virtual ~BaseForest();
+    
+    virtual int build(REAL** X,REAL* y,uint n_samples)=0;
+    
+    virtual void predict(REAL** X,REAL* pred,uint nSamples,uint nFeatures)=0;
+    
+    REAL* GetImportances();
+    
+    virtual int save_model(const char* filename)=0;
+    
+    bool load_model(const char* filename);
+protected:
+    void save_model(FILE* fp=NULL);
+    bool check_parameters();
+#ifdef DEBUG
+    void print_parameters();
+#endif
+    
+public:
+    Tree** tree;
+    uint n_jobs;
+    uint n_trees;
+    uint n_features;
+    uint n_classes;
+    uint max_depth;
+    uint min_sample_leaf;
+    uint max_features;
+    uint find_split_algorithm;
+    REAL oob_scores;
+    REAL* importances;
+    int bootstrap;
+    int oob;
+    int compute_importance;
+    uint random_seed;
+    int verbose;
+
+    int split_criterion;
+    int *original_classes;// converted classes=>original_classes
+};
+BaseForest::BaseForest(){
+    this->tree=NULL;
+    this->importances=NULL;
+    this->original_classes=NULL;
+}
+BaseForest::BaseForest(int split_criterion,uint n_trees,uint n_features,uint max_depth,uint min_sample_leaf, REAL max_features_ratio,uint find_split_algorithm,bool bootstrap,bool oob,bool compute_importance,uint random_seed,uint n_jobs,bool verbose){
+    
+    RF_n_trees_done=0;
+    int max_thread=boost::thread::hardware_concurrency();
+    if (n_jobs>n_trees) {
+        n_jobs=n_trees;
+    }
+    if (n_jobs>max_thread) {
+        fprintf(stderr, "WARN: Number of thread is exceed the maximum number of hardware concurrency.Now set number of thread = %d\n",max_thread);
+        n_jobs=max_thread;
+    }
+    this->tree=NULL;
+    this->importances=NULL;
+    this->original_classes=NULL;
+    this->split_criterion=split_criterion;
+    this->tree=new Tree*[n_trees];
+    this->n_jobs=n_jobs;
+    this->n_trees=n_trees;
+    this->n_features=n_features;
+    this->find_split_algorithm=find_split_algorithm;
+    this->max_depth=MIN(FOREST_TREE_MAX_DEPTH, max_depth);
+    this->min_sample_leaf=min_sample_leaf;
+    this->bootstrap=bootstrap;
+    //if compute oob scores,must bootstrap=true
+    if (oob && !bootstrap) {
+        fprintf(stderr, "Out of bag estimation only available if bootstrap=True,auto turn out of bag estimation off.\n");
+        oob=false;
+    }
+    this->oob=oob;
+    this->oob_scores=-1;
+    this->compute_importance=compute_importance;
+    this->random_seed=random_seed;
+    this->verbose=verbose;
+#ifdef DEBUG
+    assert(max_features_ratio<=1.0);
+#endif
+    if (max_features_ratio==MTRY_DEFAULT) {
+        this->max_features=sqrt(n_features);
+    }else {
+        this->max_features=MAX(1,MIN(max_features_ratio,1.0)*n_features);
+    }
+    if (compute_importance) {
+        importances=new REAL[n_features];
+    }
+    //initialize all tree to null
+    for (uint i=0; i<n_trees; i++) {
+        this->tree[i]=NULL;
+    }
+}
+BaseForest::~BaseForest(){
+    if (tree) {
+        for (uint i=0; i<n_trees; i++) {
+            if (tree[i]) {
+                delete tree[i];
+                tree[i]=NULL;
+            }
+        }
+        delete []tree;
+        tree=NULL;
+    }
+    if (importances) {
+        delete [] importances;
+        importances=NULL;
+    }
+    if (original_classes) {
+        delete []original_classes;
+        original_classes=NULL;
+    }
+}
+REAL* BaseForest::GetImportances(){
+    return this->importances;
+}
+
+void BaseForest::save_model(FILE *fp){
+    if (!fp) {
+        fprintf(stderr, "Invalid FILE handler for save GBM model.\n");
+        return;
+    }
+    fprintf(fp, "split_criterion %d\n",split_criterion);
+    fprintf(fp, "n_classes %d\n",n_classes);
+    if (split_criterion==CRITERION_ENTROPY || split_criterion==CRITERION_GINI) {
+#ifdef DEBUG
+        assert(n_classes>=2);
+#endif
+        fprintf(fp, "original_classes");
+        for (uint i=0; i<n_classes; i++) {
+            fprintf(fp, " %d ",original_classes[i]);
+        }
+        fprintf(fp, "\n");
+    }else {
+        assert(n_classes==1);
+    }
+    fprintf(fp, "n_jobs %d\n",n_jobs);
+    fprintf(fp, "n_trees %d\n",n_trees);
+    fprintf(fp, "n_features %d\n",n_features);
+    fprintf(fp, "max_depth %d\n",max_depth);
+    fprintf(fp, "min_sample_leaf %d\n",min_sample_leaf);
+    fprintf(fp, "max_features %d\n",max_features);
+    fprintf(fp, "find_split_algorithm %d\n",find_split_algorithm);
+    fprintf(fp, "bootstrap %d\n",bootstrap);
+    fprintf(fp, "oob %d\n",oob);
+    if (oob) {
+        fprintf(fp, "oob_score %lf\n",oob_scores);
+    }
+    fprintf(fp, "compute_importance %d\n",compute_importance);
+    fprintf(fp, "random_seed %d\n",random_seed);
+    fprintf(fp, "verbose %d\n",verbose);
+    
+    fprintf(fp, "trees\n");
+    for (uint i=0; i<n_trees; i++) {
+        Tree *t=tree[i];
+        fprintf(fp, "nodes %d\n",t->numNodes);
+        for (uint k=ROOT; k<=t->numNodes; k++) {
+            fprintf(fp, "%u %.16lg %u %u %u %lg %lg %d ",\
+                    t->nodes[k]->feature_split,t->nodes[k]->value_split,\
+                    t->nodes[k]->nSamples,t->nodes[k]->left_child,\
+                    t->nodes[k]->right_child,t->nodes[k]->ini_error,\
+                    t->nodes[k]->best_error,t->nodes[k]->leaf);
+            if (t->nodes[k]->leaf) {
+                for (uint j=0; j<n_classes; j++) {
+                    fprintf(fp, "%.16lg ",t->nodes[k]->pred[j]);
+                }
+            }
+        }
+        fprintf(fp, "\n");
+    }
+}
+
+bool BaseForest::load_model(const char *filename){
+    int is_leaf;
+    original_classes=NULL;
+    FILE* fp=fopen(filename,"r");
+    if (!fp) {
+        fprintf(stderr, "Error: Cannot open file %s for load RandomForest model.Please check your model file.\n",filename);
+        return false;
+    }
+    int parameter_count=0;
+    char cmd[100];
+    fscanf(fp, "%*s");
+    while (1) {
+        fscanf(fp, "%100s",cmd);
+        if (strcmp(cmd, "split_criterion")==0) {
+            parameter_count++;
+            fscanf(fp, "%d",&split_criterion);
+        }
+        else if(strcmp(cmd, "n_classes")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&n_classes);
+        }
+        else if(strcmp(cmd, "n_jobs")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&n_jobs);
+        }
+        else if(strcmp(cmd, "n_trees")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&n_trees);
+        }
+        else if(strcmp(cmd, "n_features")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&n_features);
+        }
+        else if(strcmp(cmd, "max_depth")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&max_depth);
+        }
+        else if(strcmp(cmd, "min_sample_leaf")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&min_sample_leaf);
+        }
+        else if(strcmp(cmd, "max_features")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&max_features);
+        }
+        else if(strcmp(cmd, "find_split_algorithm")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&find_split_algorithm);
+        }
+        else if(strcmp(cmd, "bootstrap")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&bootstrap);
+        }
+        else if(strcmp(cmd, "oob")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&oob);
+        }
+        else if(strcmp(cmd, "random_seed")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&random_seed);
+        }
+        else if(strcmp(cmd, "verbose")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&verbose);
+        }
+        else if(strcmp(cmd, "oob_score")==0){
+//            parameter_count++;
+            fscanf(fp, "%lf",&oob_scores);
+        }
+        else if(strcmp(cmd, "compute_importance")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&compute_importance);
+        }
+        else if(strcmp(cmd, "original_classes")==0){
+            parameter_count++;
+            original_classes=new int[n_classes];
+            for (uint i=0; i<n_classes; i++) {
+                fscanf(fp, "%d",&original_classes[i]);
+            }
+        }
+        else if(strcmp(cmd,"trees")==0){
+            break;
+        }
+        else {
+            fprintf(stderr, "RandomForest read model failed,unknowed parameter %s.model file %s is corrupted.\n",cmd,filename);
+            return false;
+        }
+    }
+    if (!check_parameters()) {
+        fprintf(stderr, "RandomForest read model failed,model file %s is corrupted.\n", filename);
+        return false;
+    }
+    //allocate trees
+    tree=new Tree*[n_trees];
+    for (uint i=0; i<n_trees; i++) {
+        tree[i]=NULL;
+    }
+    for (uint i=0; i<n_trees; i++) {
+        fscanf(fp, "%100s",cmd);
+        if (strcmp(cmd, "nodes")!=0) {
+            fprintf(stderr, "RandomForest read model failed,model file %s is corrupted.\n",filename);
+            return false;
+        }
+        Tree *t=new Tree(split_criterion, n_classes, n_features, max_features, min_sample_leaf, max_depth,FIND_BEST,random_seed,1);
+        fscanf(fp, "%d\n",&t->numNodes);
+        t->max_nNodes=t->numNodes+1;
+        t->nodes=new TreeNode*[t->max_nNodes];
+        t->nodes[0]=new TreeNode();//dummy node.
+        for (uint k=ROOT; k<=t->numNodes; k++) {
+            t->nodes[k]=new TreeNode();
+            t->nodes[k]->leaf=false;
+            fscanf(fp, "%u %lf %u %u %u %lf %lf %d ",\
+                   &t->nodes[k]->feature_split,&t->nodes[k]->value_split,\
+                   &t->nodes[k]->nSamples,&t->nodes[k]->left_child,\
+                   &t->nodes[k]->right_child,&t->nodes[k]->ini_error,\
+                   &t->nodes[k]->best_error,&is_leaf);
+            if (is_leaf) {
+                t->nodes[k]->leaf=true;
+                t->nodes[k]->pred=new REAL[n_classes];
+                for (uint j=0; j<n_classes; j++) {
+                    fscanf(fp, "%lf ",&t->nodes[k]->pred[j]);
+                }
+            }
+        }
+        tree[i]=t;
+    }
+    fclose(fp);
+    return true;
+}
+
+void RF_build_trees_range(REAL** X,REAL* y,uint n_samples,BaseForest* forest,pair<uint, uint> n_trees_range,uint* oob_count,REAL* oob_prediction,REAL* importances){
+#ifdef DEBUG
+    if (forest->split_criterion==CRITERION_MSE) {
+        assert(forest->n_classes==1);
+    }
+#endif
+    char* str=new char[100];
+    int split_criterion=forest->split_criterion;
+    Tree** tree=forest->tree;
+    uint n_features=forest->n_features;
+    uint max_features=forest->max_features;
+    uint max_depth=forest->max_depth;
+    uint min_sample_leaf=forest->min_sample_leaf;
+    uint find_split_algorithm=forest->find_split_algorithm;
+    uint random_seed=forest->random_seed;
+    uint n_classes=forest->n_classes;
+    uint n_trees_beg=n_trees_range.first;
+    uint n_trees_end=n_trees_range.second;
+    uint verbose=forest->verbose;
+    bool oob=forest->oob;
+    bool compute_importance=forest->compute_importance;
+    bool bootstrap=forest->bootstrap;
+    
+    uint i,j,k;
+    REAL *oob_prediction_tmp,*importance_tmp;
+    REAL** sub_X;
+    REAL* sub_y;
+    bool* mask;//for oob prediction
+    if (oob) {
+        oob_prediction_tmp=new REAL[n_samples*n_classes];
+        for (i=0; i<n_samples*n_classes; i++) {
+            oob_prediction[i]=0;
+        }
+        for (i=0; i<n_samples; i++){
+            oob_count[i]=0;
+        }
+        mask=new bool[n_samples];
+    }
+    if (compute_importance) {
+        importance_tmp=new REAL[n_features];
+        for (j=0; j<n_features; j++) {
+            importances[j]=0;
+        }
+    }
+    if (bootstrap) {
+        sub_X=new REAL* [n_samples];
+        sub_y=new REAL[n_samples];
+    }
+    for (i=n_trees_beg; i<n_trees_end; i++) {
+        boost::mutex::scoped_lock rand_lock(random_number_mutex);
+        uint job_random_seed=randomMT();
+        rand_lock.unlock();
+        if (oob) {
+            for (j=0; j<n_samples; j++) {
+                mask[j]=false;
+            }
+        }
+        if (bootstrap) {
+            for (j=0; j<n_samples; j++) {
+                uint idx=rand_r(&job_random_seed)%n_samples;
+//                uint idx=randomMT()%n_samples;
+                sub_X[j]=X[idx];
+                sub_y[j]=y[idx];
+                if (oob) {
+                    mask[idx]=true;
+                }
+            }
+        }else {
+            sub_X=X;
+            sub_y=y;
+        }
+        //build tree
+        if (split_criterion==CRITERION_MSE) {
+            tree[i]=new TreeRegressor(n_features,\
+                                      max_features,\
+                                      min_sample_leaf,\
+                                      max_depth,\
+                                      find_split_algorithm,\
+                                      job_random_seed,\
+                                      1);
+        }else{
+            tree[i]=new TreeClassifier(split_criterion,\
+                                       n_classes,\
+                                       n_features,\
+                                       max_features,\
+                                       min_sample_leaf,\
+                                       max_depth,\
+                                       find_split_algorithm,\
+                                       job_random_seed,\
+                                       1);
+        }
+        tree[i]->build(sub_X, sub_y, n_samples);
+        if (oob) {
+            tree[i]->predict(X, oob_prediction_tmp, mask, n_samples, n_features);
+            for (j=0; j<n_samples; j++) {
+                if (mask[j]) {
+                    continue;
+                }
+                oob_count[j]++;
+                for (k=0; k<n_classes; k++) {
+                    oob_prediction[j*n_classes+k]+=oob_prediction_tmp[j*n_classes+k];
+                }
+            }
+        }
+        if (compute_importance) {
+            tree[i]->compute_importance(importance_tmp);
+            for (j=0; j<n_features; j++) {
+                importances[j]+=importance_tmp[j];
+            }
+            
+        }
+        
+        //print info
+        boost::mutex::scoped_lock lock(RF_mutex);
+        RF_n_trees_done++;
+        if (verbose) {
+            if (RF_n_trees_done!=1) {
+                for (j=0; j<50; j++) {
+                    fprintf(stderr, "\b");
+                }
+            }
+            str[0]='\0';
+            sprintf(str, "Random forest progress: %d/%d",RF_n_trees_done,forest->n_trees);
+            int str_len=strlen(str);
+            fprintf(stderr, "%s",str);
+            for (j=str_len; j<50; j++) {
+                fprintf(stderr, " ");
+            }
+        }
+        lock.unlock();
+    }
+    if (oob) {
+        delete []mask;
+        delete []oob_prediction_tmp;
+    }
+    if (compute_importance) {
+        delete []importance_tmp;
+    }
+    if (bootstrap) {
+        delete []sub_X;
+        delete []sub_y;
+    }
+    delete []str;
+}
+bool BaseForest::check_parameters(){
+    bool ret=true;
+    if (n_classes<=0 || n_trees<=0 || n_features<=0 || max_features>n_features || n_jobs<1) {
+        ret=false;
+    }
+    if (min_sample_leaf<=0 ||max_depth<2 || ( find_split_algorithm!=FIND_BEST && find_split_algorithm!=FIND_RANDOM) ) {
+        ret=false;
+    }
+    return ret;
+}
+#ifdef DEBUG
+void BaseForest::print_parameters(){
+    fprintf(stderr, "n_classes %d\n",n_classes);
+    fprintf(stderr, "n_jobs %d\n",n_jobs);
+    fprintf(stderr, "n_trees %d\n",n_trees);
+    fprintf(stderr, "n_features %d\n",n_features);
+    fprintf(stderr, "max_depth %d\n",max_depth);
+    fprintf(stderr, "min_sample_leaf %d\n",min_sample_leaf);
+    fprintf(stderr, "max_features %d\n",max_features);
+    fprintf(stderr, "find_split_algorithm %d\n",find_split_algorithm);
+    fprintf(stderr, "bootstrap %d\n",bootstrap);
+    fprintf(stderr, "oob %d\n",oob);
+    if (oob) {
+        fprintf(stderr, "oob_score %lf\n",oob_scores);
+    }
+    fprintf(stderr, "compute_importance %d\n",compute_importance);
+    fprintf(stderr, "random_seed %d\n",random_seed);
+    fprintf(stderr, "verbose %d\n",verbose);
+}
+#endif
+#endif
diff --git a/include/tensemble/BaseGBM.h b/include/tensemble/BaseGBM.h
new file mode 100644
index 0000000..88f9525
--- /dev/null
+++ b/include/tensemble/BaseGBM.h
@@ -0,0 +1,426 @@
+/* * * * *
+ *  BaseGBM.h
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_BaseGBM_h
+#define libTM_BaseGBM_h
+#include "LossFunction.h"
+#include "Tree.h"
+#include "EvaluateMetric.h"
+#define GBM_TREE_MAX_DEEP 15
+
+/* define loss */
+#define SQUARE_LOSS 0
+#define BINOMIAL_DEVIANCE   1
+#define MULTINOMIAL_DEVIANCE    2
+
+
+class BaseGBM {
+    /* Base gradient boosting machine,
+     * Don't use it directly.
+     */
+    
+public:
+    BaseGBM();
+    
+    BaseGBM(int loss_function,uint n_trees,uint n_features,uint max_depth,uint min_sample_leaf, REAL max_features_ratio,REAL subsample,REAL learning_rate,bool oob,bool compute_importance,uint random_seed,uint n_jobs,int verbose);
+    
+    virtual ~BaseGBM();
+    
+    virtual int build(REAL **X, REAL *y, uint n_samples,
+                      REAL** val_X=NULL,REAL* val_y=NULL,uint n_val_samples=0)=0;    
+    
+    virtual void predict(REAL** X,REAL* pred,uint nSamples,uint nFeatures,uint k_trees)=0;
+    
+    //may useful for validation
+    virtual void predict_verbose(REAL** X,REAL* y,REAL* pred,uint nSamples,uint nFeatures,\
+                                 uint k_trees)=0;
+    
+    virtual int save_model(const char* filename)=0;
+    
+    bool load_model(const char* filename);
+    
+    REAL* GetImportances();
+    
+protected:
+    void save_model(FILE* fp=NULL);
+    bool check_parameters();
+#ifdef DEBUG
+    void print_parameters();
+#endif
+    
+public:
+    int loss_function;
+    LossFunction* loss;
+    uint n_classes;
+    
+    Tree** tree;
+    uint n_jobs;
+    uint n_trees;
+    uint n_features;
+    uint max_depth;
+    uint min_sample_leaf;
+    uint max_features;
+    REAL subsample;
+    REAL learning_rate;
+    REAL* importances;
+    int oob;
+    int compute_importance;
+    uint random_seed;
+    int verbose;
+    
+    /* for regresson prior_pred has size 1,and original_classes=NULL
+     * for two-classes problems,prior_pred is the prior probability of positive classes.
+     */
+    REAL *prior_pred;
+    int *original_classes;// converted classes=>original_classes
+};
+
+BaseGBM::BaseGBM(){
+    tree=NULL;
+    loss=NULL;
+    importances=NULL;
+    prior_pred=NULL;
+    original_classes=NULL;
+}
+
+BaseGBM::BaseGBM(int loss_function,uint n_trees,uint n_features,uint max_depth,uint min_sample_leaf, REAL max_features_ratio,REAL subsample,REAL learning_rate,bool oob,bool compute_importance,uint random_seed,uint n_jobs,int verbose){
+#ifdef DEBUG
+    assert(subsample<=1.0 && subsample>0.0);
+    assert(loss_function==SQUARE_LOSS || loss_function==MULTINOMIAL_DEVIANCE || loss_function==BINOMIAL_DEVIANCE);
+    assert(max_features_ratio<=1.0 && max_features_ratio>0.0);
+#endif
+    if (max_features_ratio==MTRY_DEFAULT) {
+        this->max_features=(uint)sqrt((REAL)n_features);
+    }else {
+        this->max_features=(uint)MAX(1.0,MIN(max_features_ratio,1.0)*n_features);
+    }
+    if (max_depth>GBM_TREE_MAX_DEEP) {
+        max_depth=GBM_TREE_MAX_DEEP;
+    }
+    //subsample rate must be less than or equal to 1.0 
+    if (subsample>1.0 || subsample<=0.0) {
+        subsample=1.0;
+    }
+    int max_thread=boost::thread::hardware_concurrency();
+    if (n_jobs>max_thread) {
+        fprintf(stderr, "WARNNING: Number of thread is exceed the maximum number of hardware concurrency.Now set number of thread = %d\n",max_thread);  
+        n_jobs=max_thread;
+    }
+    //reset number of threads if max_features is too small
+    if (n_jobs>1 && max_features/n_jobs<THREAD_MIN_FEATURES) {
+        fprintf(stderr, "WARNNING: Number of thread is %d,but max_feature=%d.Each thread at least need %d features to find best split for internal nodes in decision tree.Now set number of thread = %d\n",n_jobs,max_features,THREAD_MIN_FEATURES,MAX(1,(uint)max_features/THREAD_MIN_FEATURES));
+        n_jobs=MAX(1,(uint)max_features/THREAD_MIN_FEATURES);
+    }
+    //if compute oob scores,must bootstrap=true
+    if (oob && subsample>=1.0) {
+        fprintf(stderr, "Out of bag estimation only available if subsample<1.0,auto turn out of bag estimation off.\n");
+        oob=false;
+    }
+    this->loss_function=loss_function;
+    this->n_classes=-1;
+    this->tree=NULL;
+    this->importances=NULL;
+    this->loss=NULL;
+    this->prior_pred=NULL;
+    this->original_classes=NULL;
+    this->n_jobs=n_jobs;
+    this->n_trees=n_trees;
+    this->n_features=n_features;
+    this->max_depth=max_depth;
+    this->min_sample_leaf=min_sample_leaf;
+    this->learning_rate=learning_rate;
+    this->subsample=subsample;
+    this->oob=oob;
+    this->compute_importance=compute_importance;
+    this->random_seed=random_seed;
+    this->verbose=verbose;
+    if (compute_importance) {
+        importances=new REAL[n_features];
+    }
+}
+BaseGBM::~BaseGBM(){
+    if (tree) {
+#ifdef DEBUG
+        if (loss_function!=MULTINOMIAL_DEVIANCE) {
+            assert(n_classes==1);
+        }
+#endif
+        for (uint i=0; i<n_trees*n_classes; i++) {
+            delete tree[i];
+            tree[i]=NULL;
+        }
+        delete []tree;
+        tree=NULL;
+    }
+    if (importances) {
+        delete [] importances;
+        importances=NULL;
+    }
+    if (loss) {
+        delete loss;
+        loss=NULL;
+    }
+    if (prior_pred) {
+        delete []prior_pred;
+        prior_pred=NULL;
+    }
+    if (original_classes) {
+        delete []original_classes;
+        original_classes=NULL;
+    }
+}
+
+void BaseGBM::save_model(FILE* fp){
+    /* SAVE GBM MODEL
+     * FORMAT:
+     * loss_function
+     * n_classes
+     * prior_pred
+     * original_classes(optional)
+     * n_jobs
+     * n_trees
+     * n_features
+     * max_depth
+     * min_sample_leaf
+     * max_features
+     * subsample
+     * learning_rate
+     * oob
+     * compute_importance
+     * random_seed
+     * verbose
+     * tree#1
+     * tree#2
+     * ...
+     * tree#n
+     */
+    if (!fp) {
+        fprintf(stderr, "Invalid FILE handler for save GBM model.\n");
+        return;
+    }
+    fprintf(fp, "loss_function %d\n",loss_function);
+    fprintf(fp, "n_classes %d\n",n_classes);
+    fprintf(fp, "prior_pred ");
+    for (uint i=0; i<n_classes; i++) {
+        fprintf(fp, " %lf",prior_pred[i]);
+    }
+    fprintf(fp, "\n");
+    if (loss_function==BINOMIAL_DEVIANCE) {
+        n_classes=2;
+    }
+    if (loss_function==BINOMIAL_DEVIANCE || loss_function==MULTINOMIAL_DEVIANCE) {
+        fprintf(fp, "original_classes ");
+        for (uint i=0; i<n_classes; i++) {
+            fprintf(fp, " %d",original_classes[i]);
+        }
+        fprintf(fp, "\n");
+    }
+    if (loss_function==BINOMIAL_DEVIANCE) {
+        n_classes=1;
+    }
+    fprintf(fp, "n_jobs %d\n",n_jobs);
+    fprintf(fp, "n_trees %d\n",n_trees);
+    fprintf(fp, "n_features %d\n",n_features);
+    fprintf(fp, "max_depth %d\n",max_depth);
+    fprintf(fp, "min_sample_leaf %d\n",min_sample_leaf);
+    fprintf(fp, "max_features %d\n",max_features);
+    fprintf(fp, "subsample %lf\n",subsample);
+    fprintf(fp, "learning_rate %lf\n",learning_rate);
+    fprintf(fp, "oob %d\n",oob);
+    fprintf(fp, "compute_importance %d\n",compute_importance);
+    fprintf(fp, "random_seed %d\n",random_seed);
+    fprintf(fp, "verbose %d\n",verbose);
+    fprintf(fp, "trees\n");
+    for (uint i=0; i<n_trees; i++) {
+        for (uint j=0; j<n_classes; j++) {
+            Tree *t=tree[i*n_classes+j];
+            fprintf(fp, "nodes %d\n",t->numNodes);
+            for (uint k=ROOT; k<=t->numNodes; k++) {
+                fprintf(fp, "%u %.20lg %u %u %u %lg %lg %d ",\
+                        t->nodes[k]->feature_split,t->nodes[k]->value_split,\
+                        t->nodes[k]->nSamples,t->nodes[k]->left_child,\
+                        t->nodes[k]->right_child,t->nodes[k]->ini_error,\
+                        t->nodes[k]->best_error,t->nodes[k]->leaf);
+                if (t->nodes[k]->leaf) {
+                    fprintf(fp, "%.20lg ",t->nodes[k]->pred[0]);
+                }
+            }
+            fprintf(fp, "\n");
+        }
+    }
+}
+bool BaseGBM::load_model(const char *filename){
+    
+    int is_leaf;
+    FILE* fp=fopen(filename,"r");
+    if (!fp) {
+        fscanf(stderr, "Error: Cannot open file %s for load GBM model.Please check your model file.\n",filename);
+        return false;
+    }
+    int parameter_count=0;
+    char cmd[100];
+    fscanf(fp, "%*s");
+    while (1) {
+        fscanf(fp, "%100s",cmd);
+        if (strcmp(cmd, "loss_function")==0) {
+            parameter_count++;
+            fscanf(fp, "%d",&loss_function);
+        }
+        else if(strcmp(cmd, "n_classes")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&n_classes);
+        }
+        else if(strcmp(cmd, "n_jobs")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&n_jobs);
+        }
+        else if(strcmp(cmd, "n_trees")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&n_trees);
+        }
+        else if(strcmp(cmd, "n_features")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&n_features);
+        }
+        else if(strcmp(cmd, "max_depth")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&max_depth);
+        }
+        else if(strcmp(cmd, "min_sample_leaf")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&min_sample_leaf);
+        }
+        else if(strcmp(cmd, "max_features")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&max_features);
+        }
+        else if(strcmp(cmd, "learning_rate")==0){
+            parameter_count++;
+            fscanf(fp, "%lf",&learning_rate);
+        }
+        else if(strcmp(cmd, "subsample")==0){
+            parameter_count++;
+            fscanf(fp, "%lf",&subsample);
+        }
+        else if(strcmp(cmd, "oob")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&oob);
+        }
+        else if(strcmp(cmd, "random_seed")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&random_seed);
+        }
+        else if(strcmp(cmd, "verbose")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&verbose);
+        }
+        else if(strcmp(cmd, "compute_importance")==0){
+            parameter_count++;
+            fscanf(fp, "%d",&compute_importance);
+        }
+        else if(strcmp(cmd, "original_classes")==0){
+            parameter_count++;
+            original_classes=new int[MAX(2,n_classes)];
+            for (uint i=0; i<MAX(2,n_classes); i++) {
+                fscanf(fp, "%d",&original_classes[i]);
+            }
+        }
+        else if(strcmp(cmd, "prior_pred")==0){
+            parameter_count++;
+            prior_pred=new REAL[n_classes];
+            for (uint i=0; i<n_classes; i++) {
+                fscanf(fp, "%lf",&prior_pred[i]);
+            }
+        }
+        else if(strcmp(cmd,"trees")==0){
+            break;
+        }
+        else {
+            fprintf(stderr, "Error: GBM read model failed,unknowed parameter %s.model file %s is corrupted.\n",cmd,filename);
+            return false;
+        }
+    }
+    if (!check_parameters()) {
+        fprintf(stderr, "Error: GBM read model failed,model file %s is corrupted.\n", filename);
+        return false;
+    }
+    //allocate trees
+    tree=new Tree*[n_trees*n_classes];
+    for (uint i=0; i<n_trees; i++) {
+        tree[i]=NULL;
+    }
+    for (uint i=0; i<n_trees; i++) {
+        for (uint j=0; j<n_classes; j++) {
+            fscanf(fp, "%100s",cmd);
+            if (strcmp(cmd, "nodes")!=0) {
+                fprintf(stderr, "Error: GBM read model failed,model file %s is corrupted.\n",filename);
+                return false;
+            }
+            Tree *t=new Tree(CRITERION_MSE, 1, n_features, max_features, min_sample_leaf, max_depth,FIND_BEST,random_seed,n_jobs);
+            fscanf(fp, "%d\n",&t->numNodes);
+            t->max_nNodes=t->numNodes+1;
+            t->nodes=new TreeNode*[t->max_nNodes];
+            t->nodes[0]=new TreeNode();//dummy node.
+            for (uint k=ROOT; k<=t->numNodes; k++) {
+                t->nodes[k]=new TreeNode();
+                t->nodes[k]->leaf=false;
+                fscanf(fp, "%u %lf %u %u %u %lf %lf %d ",\
+                        &t->nodes[k]->feature_split,&t->nodes[k]->value_split,\
+                        &t->nodes[k]->nSamples,&t->nodes[k]->left_child,\
+                        &t->nodes[k]->right_child,&t->nodes[k]->ini_error,\
+                        &t->nodes[k]->best_error,&is_leaf);
+                if (is_leaf) {
+                    t->nodes[k]->leaf=true;
+                    t->nodes[k]->pred=new REAL;
+                    fscanf(fp, "%lf ",&t->nodes[k]->pred[0]);
+                }
+            }
+            tree[i*n_classes+j]=t;
+        }
+    }
+    fclose(fp);
+    return true;
+}
+REAL* BaseGBM::GetImportances(){
+    return this->importances;
+}
+bool BaseGBM::check_parameters(){
+    bool ret=true;
+    if (n_classes<=0 || n_trees<=0 || n_features<=0 || max_features>n_features || n_jobs<1) {
+        ret=false;
+    }
+    if (min_sample_leaf<=0 ||max_depth<2 || subsample<=0.0 || subsample>1.0 || learning_rate<=0.0 || learning_rate>=1.0) {
+        ret=false;
+    }
+    if (loss_function!=SQUARE_LOSS && loss_function!=BINOMIAL_DEVIANCE && loss_function!=MULTINOMIAL_DEVIANCE) {
+        ret=false;
+    }
+    return ret;
+}
+#ifdef DEBUG
+void BaseGBM::print_parameters(){
+    fprintf(stderr, "n_classes %d\n",n_classes);
+    fprintf(stderr, "n_jobs %d\n",n_jobs);
+    fprintf(stderr, "n_trees %d\n",n_trees);
+    fprintf(stderr, "n_features %d\n",n_features);
+    fprintf(stderr, "max_depth %d\n",max_depth);
+    fprintf(stderr, "min_sample_leaf %d\n",min_sample_leaf);
+    fprintf(stderr, "max_features %d\n",max_features);
+    fprintf(stderr, "loss_function %d\n",loss_function);
+    fprintf(stderr, "learning_rate %lf\n",learning_rate);
+    fprintf(stderr, "subsample %f\n",subsample);
+    fprintf(stderr, "compute_importance %d\n",compute_importance);
+    fprintf(stderr, "random_seed %d\n",random_seed);
+    fprintf(stderr, "verbose %d\n",verbose);
+}
+#endif
+#endif
diff --git a/include/tensemble/ClassificationCriterion.h b/include/tensemble/ClassificationCriterion.h
new file mode 100644
index 0000000..885e137
--- /dev/null
+++ b/include/tensemble/ClassificationCriterion.h
@@ -0,0 +1,201 @@
+/* * * * *
+ *  ClassificationCriterion.h 
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_ClassificationCriterion_h
+#define libTM_ClassificationCriterion_h
+#include "Criterion.h"
+#include "cmath"
+class ClassificationCriterion:public Criterion{
+    //Abstract criterion for classification.
+public:
+    ClassificationCriterion(uint n_classes);
+    virtual ~ClassificationCriterion();
+    virtual void init(REAL *y,uint* sample_ind,uint s_ind_beg,uint s_ind_end,uint nSamples);
+    virtual void estimate(REAL* pred);
+    virtual void reset();
+    virtual REAL eval()=0;
+    virtual int update (REAL *y,uint* loc,uint a,uint b);
+    virtual int update_next(REAL *y,uint a);
+public:
+    uint *label_count_init;
+    uint *label_count_left;
+    uint *label_count_right;
+};
+ClassificationCriterion::ClassificationCriterion(uint n_classes){
+#ifdef DEBUG
+    assert(n_classes>=2);
+#endif
+    this->n_classes=n_classes;
+    label_count_init=new uint[n_classes];
+    label_count_left=new uint[n_classes];
+    label_count_right=new uint[n_classes];
+}
+ClassificationCriterion::~ClassificationCriterion(){
+    delete []label_count_left;
+    delete []label_count_right;
+    delete []label_count_init;
+}
+void ClassificationCriterion::init(REAL *y,uint* sample_ind,uint s_ind_beg,uint s_ind_end,uint nSamples){
+    /* Initialise the criterion class; assume all samples
+     * are in the right branch.
+     * put all samples y[idx] which idx belong to (sample_ind[s_ind_beg],sample_ind[s_ind_end]) 
+     * to right right branch 
+     * sample_ind stored the real index in y
+     * y[i] must be non-negative and less than n_classes
+     */
+    this->nSamples=nSamples;
+    this->nLeft=0;
+    for (uint i=0; i<n_classes; i++) {
+        label_count_left[i]=0;
+        label_count_right[i]=0;
+        label_count_init[i]=0;
+    }
+#ifdef DEBUG
+    uint count=0;
+#endif
+    for (uint i=s_ind_beg; i<s_ind_end; i++) {
+        uint idx=sample_ind[i];
+#ifdef DEBUG
+            assert(y[i]>=0 && y[i]<n_classes);
+            count++;
+#endif
+        uint category=(uint)y[idx];
+        label_count_init[category]++;
+    }
+#ifdef DEBUG
+    assert(count==nSamples);
+#endif
+    reset();
+}
+void ClassificationCriterion::reset(){
+    nLeft=0;
+    for (uint i=0; i<n_classes; i++) {
+        label_count_left[i]=0;
+        label_count_right[i]=label_count_init[i];
+    }
+}
+void ClassificationCriterion::estimate(REAL* pred){
+    for (uint i=0; i<n_classes; i++) {
+        pred[i]=label_count_init[i];
+    }
+}
+
+//==================FIX ME===========================
+int ClassificationCriterion::update(REAL *y, uint *loc, uint a, uint b){
+    /*Update the criteria for each value in interval [a,b) and [b,end],
+     [a,b) in left side,[b,end] in right side(where a and b
+     are indices in loc).
+     */
+#ifdef DEBUG
+    assert(a>=0);
+    assert(b<=nSamples);
+#endif
+    for (uint i=a; i<b; i++) {
+        uint idx=loc[i];
+        uint category=y[idx];
+        label_count_left[category]++;
+        label_count_right[category]--;
+        nLeft++;
+    }
+    return nLeft;
+
+}
+//===================================================
+
+
+int ClassificationCriterion::update_next(REAL *y, uint a){
+    /* put sample[a] into left side and update the criteria*/
+#ifdef DEBUG
+    assert(a>=0);
+#endif
+    uint category=y[a];
+#ifdef DEBUG
+    assert(category>=0 && category<n_classes);
+#endif
+    label_count_left[category]++;
+    label_count_right[category]--;
+    nLeft++;
+    return nLeft;
+
+}
+
+class Entropy:public ClassificationCriterion {
+    //Cross Entropy splitting criteria.
+    
+public:
+    Entropy(uint n_classes);
+    
+    REAL eval();
+};
+
+Entropy::Entropy(uint n_classes):ClassificationCriterion(n_classes){
+    
+}
+
+REAL Entropy::eval(){
+    REAL l_entropy,r_entropy;
+    l_entropy=r_entropy=0;
+    uint nRight=nSamples-nLeft;
+    for (uint i=0; i<n_classes; i++) {
+        if (nLeft>0 && label_count_left[i]>0) {
+            l_entropy-=(1.0*label_count_left[i]/nLeft)*log((REAL)1.0*label_count_left[i]/nLeft);
+        }
+        if (nRight>0 && label_count_right[i]>0) {
+            r_entropy-=(1.0*label_count_right[i]/nRight)*log((REAL)1.0*label_count_right[i]/nRight);
+        }
+    }
+    l_entropy*=1.0*nLeft/nSamples;
+    r_entropy*=1.0*nRight/nSamples;
+    return l_entropy+r_entropy;
+}
+
+class Gini:public ClassificationCriterion {
+    //Gini spliting criteria
+    
+public:
+    Gini(uint n_classes);
+    REAL eval();
+};
+
+Gini::Gini(uint n_classes):ClassificationCriterion(n_classes){
+    
+}
+REAL Gini::eval(){
+    REAL l_score,r_score;
+    uint nRight=nSamples-nLeft;
+    l_score=(REAL)nLeft;
+    r_score=(REAL)nRight;
+    for (uint i=0; i<n_classes; i++) {
+        //Caution:avoid overflow.
+        if (label_count_left[i]>0) {
+            l_score-=(REAL)label_count_left[i]/nLeft*label_count_left[i];
+        }
+        if (label_count_right[i]>0) {
+            r_score-=(REAL)label_count_right[i]/nRight*label_count_right[i];
+        }
+    }
+#ifdef DEBUG
+    assert(l_score>=0);
+    assert(r_score>=0);
+#endif
+//    if (nLeft!=0) {
+//        l_score/=nLeft;
+//    }
+//    if (nRight!=0) {
+//        r_score/=nRight;
+//    }    
+    l_score/=nSamples;
+    r_score/=nSamples;
+    return l_score+r_score;
+}
+
+#endif
diff --git a/include/tensemble/Criterion.h b/include/tensemble/Criterion.h
new file mode 100644
index 0000000..b1f13d7
--- /dev/null
+++ b/include/tensemble/Criterion.h
@@ -0,0 +1,163 @@
+/* * * * *
+ *  Criterion.h 
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_Criterion_h
+#define libTM_Criterion_h
+#include "TypeDef.h"
+#include "assert.h"
+#define CRITERION_MSE   1
+#define CRITERION_ENTROPY 2
+#define CRITERION_GINI 3
+
+class Criterion {
+    //splitting criteria (regression and classification)
+public:
+//    Criterion(uint n_classes=1){
+//    }
+    virtual ~Criterion(){};
+public:
+    virtual void init(REAL *y,uint* sample_ind,uint s_ind_beg,uint s_ind_end,uint nSamples)=0;
+    virtual void estimate(REAL* pred)=0;
+    virtual void reset()=0;
+    virtual REAL eval()=0;
+    virtual int update (REAL *y,uint* loc,uint a,uint b)=0;
+    virtual int update_next(REAL *y,uint a)=0;
+public:
+    //interval [left:mid] is in the left side,[mid+1:end] in right side
+    uint nSamples;
+    uint nLeft;
+    uint n_classes;
+};
+
+
+class RegressionCriterion:public Criterion {
+    //Abstract criterion for regression
+    
+public:
+    virtual void init(REAL *y,uint* sample_ind,uint s_ind_beg,uint s_ind_end,uint nSamples);
+    virtual void estimate(REAL* pred);
+    virtual void reset();
+    virtual REAL eval()=0;
+    virtual int update (REAL *y,uint* loc,uint a,uint b);
+    virtual int update_next(REAL *y,uint a);
+public:
+    REAL mean_all;
+    REAL mean_left;
+    REAL mean_right;
+    REAL sq_sum_all;
+    REAL sq_sum_left;
+    REAL sq_sum_right;
+    REAL var_left;
+    REAL var_right;
+};
+void RegressionCriterion::init(REAL *y,uint* sample_ind,\
+                               uint s_ind_beg,uint s_ind_end,uint nSamples){
+    /* Initialise the criterion class; assume all samples
+     * are in the right branch.
+     * put all samples y[idx] which idx belong to (sample_ind[s_ind_beg],sample_ind[s_ind_end]) 
+     * to right right branch 
+     * sample_ind stored the real index in y
+     */
+    this->n_classes=1;
+    this->nSamples=nSamples;
+    this->nLeft=0;
+    mean_all=0.0;
+    mean_left=0.0;
+    mean_right=0.0;
+    sq_sum_all=0.0;
+    sq_sum_left=0.0;
+    sq_sum_right=0.0;
+    var_left=0.0;
+    var_right=0.0;
+#ifdef DEBUG
+    uint count=0;
+#endif
+    for (uint i=s_ind_beg; i<s_ind_end; i++) {
+        uint idx=sample_ind[i];
+        mean_all+=y[idx];
+        sq_sum_all+=y[idx]*y[idx];
+#ifdef DEBUG
+        count++;
+#endif
+    }
+    
+#ifdef DEBUG
+    assert(count==nSamples);
+#endif
+    mean_all/=nSamples;
+    reset();
+}
+void RegressionCriterion::reset(){
+    nLeft=0;
+    mean_left=0.0;
+    mean_right=mean_all;
+    sq_sum_left=0.0;
+    sq_sum_right=sq_sum_all;
+    var_left=0.0;
+    var_right=sq_sum_all-nSamples*(mean_right*mean_right);
+}
+void RegressionCriterion::estimate(REAL* pred){
+    //regression only need pred[0]
+    pred[0]=this->mean_all;
+}
+
+//=====================FIX ME==========================================
+int RegressionCriterion::update(REAL* y,uint* loc,uint a,uint b){
+    /*Update the criteria for each value in interval [a,b) and [b,end],
+     [a,b) in left side,[b,end] in right side(where a and b
+     are indices in loc).
+     */
+#ifdef DEBUG
+    assert(a>=0);
+    assert(b<=nSamples);
+#endif
+    for (uint i=a; i<b; i++) {
+        uint idx=loc[i];
+        sq_sum_left+=y[idx]*y[idx];
+        sq_sum_right-=y[idx]*y[idx];
+        mean_left=(nLeft*mean_left+y[idx])/(nLeft+1);
+        mean_right=((nSamples-nLeft)*mean_right-y[idx])/(nSamples-nLeft-1);
+        nLeft++;
+    }
+    return nLeft;
+}
+//======================================================================
+
+int RegressionCriterion::update_next(REAL* y,uint a){
+     /* put sample y[a] into left side and update the criteria*/
+#ifdef DEBUG
+    assert(a>=0);
+#endif
+    sq_sum_left+=y[a]*y[a];
+    sq_sum_right-=y[a]*y[a];
+    mean_left=(nLeft*mean_left+y[a])/(nLeft+1);
+    if (nLeft+1==nSamples) {
+        mean_right=0.0;
+    }else{
+        mean_right=(1.0*(nSamples-nLeft)*mean_right-y[a])/(nSamples-nLeft-1);
+    }
+    nLeft++;
+    var_left=sq_sum_left-nLeft*(mean_left*mean_left);
+    var_right=sq_sum_right-(nSamples-nLeft)*(mean_right*mean_right);
+    return nLeft;
+}
+
+class MSE:public RegressionCriterion {
+    /*Mean squared error impurity criterion.*/
+    
+public:
+    REAL eval();
+};
+REAL MSE::eval(){
+    return this->var_left+this->var_right;
+}
+#endif
diff --git a/include/tensemble/Estimator.h b/include/tensemble/Estimator.h
new file mode 100644
index 0000000..de329f6
--- /dev/null
+++ b/include/tensemble/Estimator.h
@@ -0,0 +1,79 @@
+/* * * * *
+ *  Estimator.h
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_Estimator_h
+#define libTM_Estimator_h
+#include "TypeDef.h"
+#include "assert.h"
+#include <map>
+
+void MeanEstimator(REAL* y,REAL* pred,REAL* prior,uint nSamples){
+    REAL mean=0;
+    for (uint i=0; i<nSamples; i++) {
+        mean+=y[i];
+    }
+    mean/=nSamples;
+//    mean=0;
+    for (uint i=0; i<nSamples; i++) {
+        pred[i]=mean;
+    }
+    *prior=mean;
+}
+
+void LogOddsEstimator(REAL* y,REAL* pred,REAL* prior,uint nSamples){
+    /* for two class problem,y must be 0 or 1 */
+    
+    uint n_pos=0;
+    for (uint i=0; i<nSamples; i++) {
+#ifdef DEBUG
+        assert(y[i]==0 || y[i]==1);
+#endif
+        n_pos+=y[i];
+    }
+    REAL prob=log((REAL)n_pos/(nSamples-n_pos));
+    for (uint i=0; i<nSamples; i++) {
+        pred[i]=prob;
+    }
+    *prior=prob;
+}
+
+void MulticlassPriorEstimator(REAL* y,REAL* pred,REAL* prior,uint nSamples,uint n_classes){
+    /* for multi-class problems,y must be 0,1,...,n_classes-1 
+     * pred is a nSamples*n_classes vector
+     */
+    
+#ifdef DEBUG
+    assert(n_classes>2);
+#endif
+    map<uint,uint> count;
+    uint i,j,offset;
+    for (i=0; i<n_classes; i++) {
+        count[i]=0;
+    }
+    for (i=0; i<nSamples; i++) {
+#ifdef DEBUG
+        assert(y[i]>=0 && y[i]<n_classes);
+#endif
+        count[y[i]]++;
+    }
+    for (j=0; j<n_classes; j++) {
+        offset=j*nSamples;
+        for (i=0; i<nSamples; i++) {
+            pred[offset+i]=count[j]/nSamples;
+        }
+    }
+    //return prior
+    for (j=0; j<n_classes; j++) {
+        prior[j]=count[j]/nSamples;
+    }
+}
+#endif
diff --git a/include/tensemble/EvaluateMetric.h b/include/tensemble/EvaluateMetric.h
new file mode 100644
index 0000000..1b428a8
--- /dev/null
+++ b/include/tensemble/EvaluateMetric.h
@@ -0,0 +1,83 @@
+/* * * * *
+ *  EvaluateMetric.h
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_EvaluateMetric_h
+#define libTM_EvaluateMetric_h
+#include "TypeDef.h"
+
+REAL mse(const REAL* preds, const REAL* label,uint nSamples) {
+    REAL r = 0;
+    int i, N = nSamples;
+    for (i=0; i<N; i++)
+        r += (label[i] - preds[i])*(label[i] - preds[i]); 
+    return 1.0 / N * r;
+}
+
+REAL rmse(const REAL* preds, const REAL* label,uint nSamples) {
+    REAL r = 0;
+    int i, N = nSamples;
+    for (i=0; i<N; i++)
+        r += (label[i] - preds[i])*(label[i] - preds[i]); 
+    return sqrt(1.0 / N * r);
+}
+
+REAL R2(const REAL* preds, const REAL* label,uint nSamples) {
+    REAL r1 = 0,r2=0,r3=0;
+    REAL mean_data=0,mean_preds=0;
+    int i, N = nSamples;
+    for (i=0; i<N; i++){
+        mean_data+=label[i];
+        mean_preds+=preds[i];
+    }
+    mean_data/=N;
+    mean_preds/=N;
+    for( i=0 ; i<N ; i++ )
+    {
+        r1+=(label[i]-mean_data)*(preds[i]-mean_preds);
+        r2+=(label[i]-mean_data)*(label[i]-mean_data);
+        r3+=(preds[i]-mean_preds)*(preds[i]-mean_preds);
+    }
+    if (r2==0.0 || r3==0.0) {
+        if (r1==0.0) {
+            return 1.0;
+        }
+        return 0.0;
+    }
+    return r1*r1/(r2*r3);
+}
+
+REAL BinomialDevianceLoss(REAL *y, REAL *pred,uint nSamples){
+    /* binomial deviance loss
+     * here y belong to [0,1]
+     */    
+    REAL deviance=0;
+    uint count=0;
+    for (uint i=0; i<nSamples; i++) {
+#ifdef DEBUG
+        assert(y[i]==0 || y[i]==1);
+#endif
+        deviance+=y[i]*pred[i]-log(1.0+exp(pred[i]));
+        count++;
+    }
+    return deviance/count;
+}
+
+REAL Accuracy(REAL* y,REAL *pred,uint nSamples){
+    REAL acc=0;
+    for (uint i=0; i<nSamples; i++) {
+        if (y[i]==pred[i]) {
+            acc++;
+        }
+    }
+    return acc/nSamples;
+}
+#endif
diff --git a/include/tensemble/FeatureData.h b/include/tensemble/FeatureData.h
new file mode 100644
index 0000000..c971d3e
--- /dev/null
+++ b/include/tensemble/FeatureData.h
@@ -0,0 +1,227 @@
+/* * * * *
+ *  FeatureData.h 
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_FeatureData_h
+#define libTM_FeatureData_h
+#include <iostream>
+#include <algorithm>
+#include "TypeDef.h"
+using namespace std;
+
+class FeatureData { // represents a training data set distributed among processors feature-wise
+public:
+    // constructor/destructor
+//    FeatureData();
+    FeatureData(uint nSamples, uint nFeaures);
+    ~FeatureData();
+    
+	// reading and initialization
+    void sort();
+	
+	// manage sample_in_node
+	void reset();
+	
+    // queries
+    uint getNumSamples();
+    uint getNumFeatures();
+    int getSampleInNode(uint i);//get node of sample[i]
+    uint_vec getNodeSamples(uint i);//get all sample from node i
+    void setNode(uint i_sample, uint i_node);//set i_sample to i_node
+
+    REAL getFeature(uint f, uint i);
+    REAL getSortedFeature(uint f, uint i);
+    uint getSortedIndex(uint f, uint i);
+    
+    uint whoHasFeature(uint f);
+    bool isLocalFeature(uint f);
+    uint localFeatureIndex(uint gf);
+    uint globalFeatureIndex(uint lf);
+    
+    // prediction
+    void updatePred(uint i, double p);
+    void updateResiduals();
+    
+    
+public:
+	// dataset descriptors
+	uint nSamples; // number of data instances
+    uint nFeatures; // number of features stored on this processor
+    
+	// static attributes
+    REAL** rawfeatures; // feature values ordered by instance
+    REAL** sortedfeatures; // feature values ordered by value
+    uint** sortedindices; // indices of original instances for each sorted feature
+    REAL* label; // target label value of each instance
+    
+	// level-specific attributes
+	int* sample_in_node; // indicated sample in which nodes.
+	
+};
+
+//FeatureData::FeatureData(){
+//    this->nSamples=this->nFeatures=-1;
+//    rawfeatures=NULL;
+//    sortedfeatures=NULL;
+//    sortedindices=NULL;
+//    label=NULL;
+//    sample_in_node=NULL;
+//}
+
+FeatureData::FeatureData(uint nSamples, uint nFeaures) {
+    this->nSamples=nSamples;
+    this->nFeatures = nFeaures;
+
+    // rawfeatures: initialized to minimum value (for missing values)
+    rawfeatures = new REAL*[nSamples];
+	for (int i=0; i<nSamples; i++) {
+        rawfeatures[i] = new REAL[nFeaures];
+		for (int j=0; j<nFeaures; j++)
+            rawfeatures[i][j] = 0.f; //-9999999.f; // TODO replace with better minimum value -- min float?
+	}
+	
+	// sortedfeatures, sortedindices: limited init, completed during sort()
+    sortedfeatures = new REAL*[nFeaures];
+    sortedindices = new uint*[nFeaures];
+    for (int i=0; i<nFeaures; i++) {
+        sortedfeatures[i] = new REAL[nSamples];
+        sortedindices[i] = new uint[nSamples];
+	}
+	
+	// label: limited init, read from file
+    label = new REAL[nSamples];
+	
+	// node: initialized to 0
+    sample_in_node = new int[nSamples];
+    for (int i=0; i<nSamples; i++)
+		sample_in_node[i] = -1;
+
+}
+
+FeatureData::~FeatureData() {
+    // delete all 1-d arrays: qid, label, node, pred, residual, idealdcg
+    if (label) {
+        delete [] label;
+    }
+    if (sample_in_node) {
+        delete [] sample_in_node;
+    }
+    label=NULL;
+    sample_in_node=NULL;
+	
+	// delete all 2-d arrays: rawfeatures, sortedfeatures, sortedindices
+    for (int i=0; i<nSamples; i++) {
+        if(rawfeatures[i])
+            delete [] rawfeatures[i];
+        rawfeatures[i] = NULL;
+    }
+	for (int i=0; i<nFeatures; i++) {
+        if(sortedfeatures[i])
+            delete [] sortedfeatures[i];
+        sortedfeatures[i] = NULL;
+        if(sortedindices[i])
+            delete [] sortedindices[i];
+        sortedindices[i] = NULL;
+    }
+    delete[] rawfeatures;
+    delete[] sortedfeatures;
+    delete[] sortedindices;
+}
+
+
+void FeatureData::reset() {
+    // clear nodes before next tree
+	for (int i=0; i<nSamples; i++) {
+        sample_in_node[i]=-1;
+	}
+}
+
+class FeatureValuePair {
+public:
+    uint index;
+	REAL value;
+};
+
+struct CompareFeatureValuePairs {
+    bool operator() (FeatureValuePair* fv1, FeatureValuePair* fv2) {
+        return (fv1->value < fv2->value);
+    }
+};
+
+void FeatureData::sort() {
+    // initialize FeatureValue array
+    CompareFeatureValuePairs cfvp;
+    FeatureValuePair** pairs = new FeatureValuePair*[nSamples];
+    for (int i=0; i<nSamples; i++)
+        pairs[i] = new FeatureValuePair();
+    
+    // sort each feature
+	for (int f=0; f<nFeatures; f++) {
+	    // load feature into pairs array
+        for (int i=0; i<nSamples; i++) {
+            pairs[i]->index = i;
+            pairs[i]->value = rawfeatures[f][i];
+        }
+	    
+	    // sort pairs array
+		std::sort(pairs, pairs + nSamples, cfvp);
+		
+		// write feature to sortedfeatures, sortedindices
+        for (int i=0; i<nSamples; i++) {
+            sortedfeatures[f][i] = pairs[i]->value;
+            sortedindices[f][i] = pairs[i]->index;
+        }
+	}
+	
+	// delete FeatureValue array
+    for (int i=0; i<nSamples; i++) {
+        delete pairs[i];
+        pairs[i] = NULL;
+    }
+    delete [] pairs;
+}
+
+uint FeatureData::getNumSamples() {
+    return nSamples;
+}
+
+int FeatureData::getSampleInNode(uint i) {
+    return sample_in_node[i];
+}
+
+uint_vec FeatureData::getNodeSamples(uint i){
+    uint_vec ret;
+    for (uint k=0; k<nSamples; k++) {
+        if (sample_in_node[k]==i) {
+            ret.push_back(k);
+        }
+    }
+    return ret;
+}
+
+void FeatureData::setNode(uint i_sample, uint i_node) {
+    sample_in_node[i_sample]=i_node;
+}
+
+
+REAL FeatureData::getFeature(uint f, uint i) {
+    return rawfeatures[f][i];
+}
+
+REAL FeatureData::getSortedFeature(uint f, uint i) {
+    return sortedfeatures[f][i];
+}
+
+uint FeatureData::getSortedIndex(uint f, uint i) {
+    return sortedindices[f][i];
+}
+
+#endif
diff --git a/include/tensemble/GBMClassifier.h b/include/tensemble/GBMClassifier.h
new file mode 100644
index 0000000..a6bcf57
--- /dev/null
+++ b/include/tensemble/GBMClassifier.h
@@ -0,0 +1,400 @@
+/* * * * *
+ *  GBMClassifier.h
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_GBMClassifier_h
+#define libTM_GBMClassifier_h
+#include "BaseGBM.h"
+#include <map>
+
+class GBMClassifier:public BaseGBM {
+    //gradient boosting regressor
+    
+public:
+    GBMClassifier(){};
+    
+    GBMClassifier(int loss_function,uint n_trees,uint n_features,uint max_depth,uint min_sample_leaf, REAL max_features_ratio,REAL subsample,REAL learning_rate,bool oob,bool compute_importance,uint random_seed,uint n_jobs,int verbose);
+    ~GBMClassifier();
+    
+    int build(REAL **X, REAL *original_y, uint n_samples,\
+              REAL** val_X=NULL,REAL* val_y=NULL,uint n_val_samples=0);
+    
+    void predict(REAL** X,REAL* pred,uint nSamples,uint nFeatures,uint k_trees=0);
+    
+    void predict_prob(REAL** X,REAL* pred_prob,uint nSamples,uint nFeatures,uint k_trees=0);
+    
+    void predict_stage(REAL** X,REAL* pred,uint nSamples,uint nFeatures,uint k);
+    
+    void score2label(REAL* score,REAL* label,uint nSamples);
+    
+    void score2prob(REAL* score,REAL* prob,uint nSamples);
+    
+    int save_model(const char* filename);
+    
+    void predict_verbose(REAL** X,REAL* y,REAL* pred,uint nSamples,uint nFeatures,\
+                                 uint k_trees){};
+    
+private:
+    void convert_to_unique(REAL* original_y,REAL* y,uint nSamples);
+    
+};
+
+GBMClassifier::GBMClassifier(int loss_function,uint n_trees,uint n_features,uint max_depth,uint min_sample_leaf, REAL max_features_ratio,REAL subsample,REAL learning_rate,bool oob,bool compute_importance,uint random_seed,uint n_jobs,int verbose)\
+:BaseGBM(loss_function,n_trees,n_features,max_depth,min_sample_leaf,max_features_ratio,subsample,learning_rate,oob,compute_importance,random_seed,n_jobs,verbose){
+#ifdef DEBUG
+    assert(loss_function==BINOMIAL_DEVIANCE || loss_function==MULTINOMIAL_DEVIANCE);
+#endif
+}
+
+GBMClassifier::~GBMClassifier(){
+}
+
+void GBMClassifier::convert_to_unique(REAL *original_y, REAL *y,uint nSamples){
+    std::map<int,uint> key_values;
+    std::map<int,uint>::const_iterator iter;
+    vector<int> classes_map;
+    n_classes=0;
+    for (uint i=0; i<nSamples; i++) {
+        int origin_class=(int)original_y[i];
+        if ((iter=key_values.find(origin_class))==key_values.end()) {
+            key_values.insert(make_pair(origin_class, n_classes));
+            y[i]=n_classes;
+            classes_map.push_back(origin_class);
+            n_classes++;
+        }else {
+            y[i]=iter->second;
+        }
+    }
+    original_classes=new int[n_classes];
+    for (uint i=0; i<n_classes; i++) {
+        original_classes[i]=classes_map[i];
+    }
+}
+
+int GBMClassifier::build(REAL **X, REAL *original_y, uint n_samples,REAL** val_X,REAL* val_y,uint n_val_samples){
+    /* BUILD GRADIENT BOOSTING MODEL FOR CLASSIFICATION
+     * For classification, labels must correspond to classes 0, 1, ..., n_classes_-1
+     * For regression and two-classes classification,just need fit one tree in each GBM round
+     * For multi-classes classification,need fit n_classes trees in each GBM round
+     */
+#ifdef DEBUG
+    assert(loss_function==BINOMIAL_DEVIANCE || loss_function==MULTINOMIAL_DEVIANCE);
+#endif
+    uint i,j,k,idx,n_subsamples,*sample_index,count,offset;
+    REAL** sub_X,*sub_y,*y,*zero_one_y;
+    REAL* y_pred,*y_pred_label,*val_pred,*val_pred_stage,*val_pred_label,*residual,train_score,oob_score;
+    bool *mask;
+    //convert y to unique classes
+    y=new REAL[n_samples];
+    y_pred_label=new REAL[n_samples];
+    convert_to_unique(original_y, y, n_samples);
+    
+    if (n_classes==1) {
+        fprintf(stderr, "Error: The training file only have one label,at least two-labels are required for classification.Please check your file.\n");
+        delete []y;
+        return ENSEMBLE_FAIL;
+    }
+    
+    //initialize loss function
+    if (n_classes==2) {
+        loss_function=BINOMIAL_DEVIANCE;
+        loss=new BinomialDeviance;
+    }else {
+        loss_function=MULTINOMIAL_DEVIANCE;
+        loss=new MultinomialDeviance;
+    }
+    //set n_classes=1 for two-classes problems(just need build one tree each round)
+    if (loss_function==BINOMIAL_DEVIANCE) {
+        n_classes=1;
+    }
+    //allocate memory for prior and trees
+    prior_pred=new REAL[n_classes];
+    this->tree=new Tree*[n_trees*n_classes];
+    
+    n_subsamples=n_samples;
+    mask=new bool[n_samples];
+    y_pred=new REAL[n_samples*n_classes];
+    residual=new REAL[n_samples*n_classes];
+    if (subsample<1.0) {
+        sample_index=new uint[n_samples];
+        n_subsamples=(uint)floor((REAL)n_samples*subsample);
+        sub_X=new REAL*[n_subsamples];
+        sub_y=new REAL[n_subsamples];
+        for (i=0; i<n_samples; i++) {
+            sample_index[i]=i;
+        }
+#ifdef DEBUG
+        assert(n_subsamples<n_samples);
+#endif
+    }
+
+    //initialize prediction and save prior
+    loss->get_init_estimate(y, y_pred, prior_pred, n_samples, this->n_classes);
+    
+    //if have validation set,initialize prediction for validation set
+    if (val_X && val_y) {
+        val_pred=new REAL[n_classes*n_val_samples];
+        val_pred_stage=new REAL[n_classes*n_val_samples];
+        val_pred_label=new REAL[n_val_samples];
+        for (j=0; j<n_classes; j++) {
+            offset=j*n_val_samples;
+            for (i=0; i<n_val_samples; i++) {
+                val_pred[offset+i]=prior_pred[j];
+            }
+        }
+    }
+    if (loss_function==MULTINOMIAL_DEVIANCE) {
+        zero_one_y=new REAL[n_samples];
+    }
+    time_t beg,end;
+    beg=time(NULL);
+    //main iteration
+    for (i=0; i<n_trees; i++) {
+        //sub-sampling
+        memset(mask, false, n_samples*sizeof(bool));
+        if (subsample<1.0) {
+            count=n_samples;
+            for (j=0; j<n_subsamples; j++) {
+//                k=rand()%count;
+                k=randomMT()%count;
+                idx=sample_index[k];
+                sub_X[j]=X[idx];
+                mask[idx]=true;
+                swap(sample_index[k], sample_index[--count]);
+            }
+        }else {
+            sub_X=X;
+            for (j=0; j<n_samples; j++) {
+                mask[j]=true;
+            }
+        }
+        //build k trees each round
+        for(k=0;k<n_classes;k++){
+            REAL* kth_residual=residual+k*n_samples;
+            //convert to zero-one value for Multinomial Deviance
+            if (loss_function==MULTINOMIAL_DEVIANCE) {
+                for (j=0; j<n_samples; j++) {
+                    if (y[j]==k) {
+                        zero_one_y[j]=1;
+                    }else {
+                        zero_one_y[j]=0;
+                    }
+                }
+            }else {
+                zero_one_y=y;
+            }
+            //compute working residual for k-th tree
+            loss->compute_residual(kth_residual, zero_one_y, y_pred, n_samples, n_classes, k);
+            if (subsample<1.0) {
+                count=0;
+                for (j=n_samples-1; j>=n_samples-n_subsamples; j--) {
+                    idx=sample_index[j];
+                    sub_y[count++]=kth_residual[idx];
+                }
+#ifdef DEBUG
+                assert(count==n_subsamples);
+#endif
+            }else {
+                sub_y=kth_residual;
+            }
+            uint tree_random_seed=randomMT();
+            TreeRegressor *t=new TreeRegressor(n_features,\
+                                               max_features,\
+                                               min_sample_leaf,\
+                                               max_depth,\
+                                               FIND_BEST,\
+                                               tree_random_seed,\
+                                               n_jobs);
+            //build tree
+            t->build(sub_X, sub_y, n_subsamples);
+            //update
+            loss->update(t, X, mask, zero_one_y, y_pred+k*n_samples, kth_residual, n_samples, n_features, learning_rate, n_classes,k);
+            //add tree to ensemble
+            tree[i*n_classes+k]=t;
+        }         
+        if (val_X && val_y) {
+            predict_stage(val_X, val_pred_stage, n_val_samples, n_features, i);
+            for (k=0; k<n_classes; k++) {
+                offset=k*n_val_samples;
+                for (j=0; j<n_val_samples; j++) {
+                    val_pred[offset+j]+=this->learning_rate*val_pred_stage[offset+j];
+                }
+            }
+        }
+        if ((verbose && (i+1)%verbose==0) || i+1==n_trees) {
+            //calculate train score and oob score;
+//            train_score=loss->loss(y, y_pred, mask, true, n_samples, n_classes);
+            score2label(y_pred, y_pred_label, n_samples);
+            if (oob && false) {
+                oob_score=loss->loss(y, y_pred, mask, false, n_samples, n_classes);
+                oob_score=sqrt(oob_score);
+            }
+            if (verbose<=0) {
+                fprintf(stderr, "GBM train done.");
+            }else {
+                fprintf(stderr, "build tree %u of %u,",i+1,n_trees);
+            }
+            fprintf(stderr, "train Acc=%lf",Accuracy(original_y, y_pred_label, n_samples));
+            if (oob && false) {
+                fprintf(stderr, ",oob score(deviance)=%lf",oob_score);
+            }
+            if (val_y) {
+                score2label(val_pred, val_pred_label, n_val_samples);
+                fprintf(stderr, ",validation Acc=%f",Accuracy(val_y, val_pred_label, n_val_samples));
+            }
+            fprintf(stderr, ".\n");
+        }
+    }
+    delete []mask;
+    delete []y;
+    delete []y_pred;
+    delete []y_pred_label;
+    delete []residual;
+    if (subsample<1.0) {
+        delete []sample_index;
+        delete []sub_X;
+        delete []sub_y;
+    }
+    if (val_y) {
+        delete []val_pred_stage;
+        delete []val_pred;
+        delete []val_pred_label;
+    }
+    if (loss_function==MULTINOMIAL_DEVIANCE) {
+        delete []zero_one_y;
+    }
+    end=time(NULL);
+    fprintf(stderr, "|Gradient Boosting Classifier training done. | Using time: %.0lf secs|\n",difftime(end, beg));
+    return ENSEMBLE_SUCCESS;
+}
+
+void GBMClassifier::predict_stage(REAL **X, REAL *pred, uint nSamples, uint nFeatures, uint k){
+    /* predict k-th round */
+    for (uint i=0; i<n_classes; i++) {
+        tree[k*n_classes+i]->predict(X, pred+i*nSamples, nSamples, nFeatures);
+    }
+}
+
+void GBMClassifier::score2label(REAL *score, REAL *label,uint nSamples){
+    uint label_idx;
+    REAL max_score;
+    if (loss_function==BINOMIAL_DEVIANCE) {
+        for (uint i=0; i<nSamples; i++) {
+            if (1.0/(1.0+exp(-score[i]))>=0.5) {
+                label[i]=original_classes[1];
+            }else {
+                label[i]=original_classes[0];
+            }
+        }
+    }else{
+        for (uint i=0; i<nSamples; i++) {
+            max_score=-HUGE_VAL;
+            for (uint j=0; j<n_classes; j++) {
+                if (score[j*nSamples+i]>max_score) {
+                    max_score=score[j*nSamples+i];
+                    label_idx=j;
+                }
+            }
+            label[i]=original_classes[label_idx];
+        }
+    }
+}
+
+void GBMClassifier::predict(REAL **X, REAL *pred, uint nSamples, uint nFeatures,uint k_trees){
+    if (k_trees==0 || k_trees>this->n_trees) {
+        k_trees=n_trees;
+    }
+    uint i,j,k,offset;
+    REAL* score=new REAL[nSamples*n_classes];
+    REAL* score_stage=new REAL[nSamples];
+    for (i=0; i<n_classes; i++) {
+        offset=i*nSamples;
+        for (j=0; j<nSamples; j++) {
+            score[offset+j]=prior_pred[i];
+        }
+    }
+    for (i=0; i<k_trees; i++) {
+        for (k=0; k<n_classes; k++) {
+            offset=k*nSamples;
+            tree[i*n_classes+k]->predict(X, score_stage, nSamples, nFeatures);
+            for (j=0; j<nSamples; j++) {
+                score[offset+j]+=learning_rate*score_stage[j];
+            }
+        }
+    }
+    score2label(score, pred, nSamples);
+    delete []score;
+    delete []score_stage;
+}
+void GBMClassifier::score2prob(REAL *score, REAL *prob, uint nSamples){
+    uint i,j;
+    REAL sum_exp;
+    if (loss_function==BINOMIAL_DEVIANCE) {
+        for (i=0; i<nSamples; i++) {
+            prob[i]=1.0/(1.0+exp(-score[i]));
+        }
+    }else{
+        for (i=0; i<nSamples; i++) {
+            sum_exp=0.0;
+            for (j=0; j<n_classes; j++) {
+                sum_exp+=exp(score[j*nSamples+i]);
+            }
+            for (j=0; j<n_classes; j++) {
+                if (sum_exp<=EPS) {
+                    prob[j*nSamples+i]=0.0;
+                }else{
+                    prob[j*nSamples+i]=exp(score[j*nSamples+i])/sum_exp;
+                }
+            }
+        }
+    }
+}
+void GBMClassifier::predict_prob(REAL **X, REAL *pred_prob, uint nSamples, uint nFeatures, uint k_trees){
+    if (k_trees==0 || k_trees>this->n_trees) {
+        k_trees=n_trees;
+    }
+    uint i,j,k,offset;
+    REAL* score=new REAL[nSamples*n_classes];
+    REAL* score_stage=new REAL[nSamples];
+    for (i=0; i<n_classes; i++) {
+        offset=i*nSamples;
+        for (j=0; j<nSamples; j++) {
+            score[offset+j]=prior_pred[i];
+        }
+    }
+    for (i=0; i<k_trees; i++) {
+        for (k=0; k<n_classes; k++) {
+            offset=k*nSamples;
+            tree[i*n_classes+k]->predict(X, score_stage, nSamples, nFeatures);
+            for (j=0; j<nSamples; j++) {
+                score[offset+j]+=learning_rate*score_stage[j];
+            }
+        }
+    }
+    score2prob(score, pred_prob, nSamples);
+    delete []score;
+    delete []score_stage;
+
+}
+
+int GBMClassifier::save_model(const char *filename){
+    FILE* fp=fopen(filename,"w");
+    if (!fp) {
+        fprintf(stderr, "Cannot open file %s for save GBMClassifier model.Please check your file path is correct.\n",filename);
+        return false;
+    }
+    fprintf(fp, "GBMClassifier\n");
+    BaseGBM::save_model(fp);
+    fclose(fp);
+    return true;
+
+}
+#endif
diff --git a/include/tensemble/GBMRegressor.h b/include/tensemble/GBMRegressor.h
new file mode 100644
index 0000000..f64b821
--- /dev/null
+++ b/include/tensemble/GBMRegressor.h
@@ -0,0 +1,242 @@
+/* * * * *
+ *  GBMRegressor.h
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_GBMRegressor_h
+#define libTM_GBMRegressor_h
+#include "BaseGBM.h"
+
+
+class GBMRegressor:public BaseGBM {
+    //gradient boosting regressor
+    
+public:
+    GBMRegressor(){};
+    
+    GBMRegressor(int loss_function,uint n_trees,uint n_features,uint max_depth,uint min_sample_leaf, REAL max_features_ratio,REAL subsample,REAL learning_rate,bool oob,bool compute_importance,uint random_seed,uint n_jobs,int verbose);
+    
+    ~GBMRegressor();
+    
+    int build(REAL **X, REAL *y, uint n_samples,\
+              REAL** val_X=NULL,REAL* val_y=NULL,uint n_val_samples=0);
+    
+    void predict(REAL** X,REAL* pred,uint nSamples,uint nFeatures,uint k_trees=0);
+    
+    //may useful for validation
+    void predict_verbose(REAL** X,REAL* y,REAL* pred,uint nSamples,uint nFeatures,\
+                         uint k);
+    int save_model(const char* filename);
+};
+
+GBMRegressor::GBMRegressor(int loss_function,uint n_trees,uint n_features,uint max_depth,uint min_sample_leaf, REAL max_features_ratio,REAL subsample,REAL learning_rate,bool oob,bool compute_importance,uint random_seed,uint n_jobs,int verbose)\
+:BaseGBM(loss_function,n_trees,n_features,max_depth,min_sample_leaf,max_features_ratio,subsample,learning_rate,oob,compute_importance,random_seed,n_jobs,verbose){
+}
+
+GBMRegressor::~GBMRegressor(){
+    
+}
+
+int GBMRegressor::build(REAL **X, REAL *y, uint n_samples,REAL** val_X,REAL* val_y,uint n_val_samples){
+    /* BUILD GRADIENT BOOSTING MODEL FOR REGRESSION
+     * For classification, labels must correspond to classes 0, 1, ..., n_classes_-1
+     * For regression and two-classes classification,just need fit one tree in each GBM round
+     * For multi-classes classification,need fit n_classes trees in each GBM round
+     */
+    uint i,j,k,idx,n_subsamples,*sample_index,count;
+    REAL** sub_X,*sub_y;
+    REAL* y_pred,*val_pred,*val_pred_stage,*residual,train_score,oob_score,valid_score;
+    bool *mask;
+    val_pred=NULL;
+#ifdef DEBUG
+    assert(n_trees>=1);
+#endif
+    //initialize loss function and set n_classes=1.
+    n_classes=1;
+    if (loss_function==SQUARE_LOSS) {
+        loss=new SquareLoss();
+    }else {
+        fprintf(stderr, "Error: Unknow loss function\n");
+        return ENSEMBLE_FAIL;
+    }
+    //allocate memory for prior and trees
+    prior_pred=new REAL[n_classes];
+    this->tree=new Tree*[n_trees];
+    
+    n_subsamples=n_samples;
+    mask=new bool[n_samples];
+    y_pred=new REAL[n_samples*n_classes];
+    if (subsample<1.0) {
+        sample_index=new uint[n_samples];
+        n_subsamples=(uint)floor((REAL)n_samples*subsample);
+        sub_X=new REAL*[n_subsamples];
+        sub_y=new REAL[n_subsamples];
+        for (i=0; i<n_samples; i++) {
+            sample_index[i]=i;
+        }
+#ifdef DEBUG
+        assert(n_subsamples<n_samples);
+#endif
+    }
+    residual=new REAL[n_samples];
+    //initialize prediction and save prior
+    loss->get_init_estimate(y, y_pred, prior_pred, n_samples, this->n_classes);
+    if (val_X && val_y) {
+        val_pred=new REAL[n_val_samples];
+        val_pred_stage=new REAL[n_val_samples];
+        for (i=0; i<n_val_samples; i++) {
+            val_pred[i]=*prior_pred;
+        }
+    }
+    time_t beg,end;
+    beg=time(NULL);
+    //main iteration
+    for (i=0; i<n_trees; i++) {
+        //compute working residual
+        loss->compute_residual(residual, y, y_pred, n_samples, n_classes, 0);
+        //sub-sampling
+        memset(mask, false, n_samples*sizeof(bool));
+        if (subsample<1.0) {
+            count=n_samples;
+            for (j=0; j<n_subsamples; j++) {
+//                k=rand()%count;
+                k=randomMT()%count;
+                idx=sample_index[k];
+                sub_X[j]=X[idx];
+                sub_y[j]=residual[idx];
+                mask[idx]=true;
+                swap(sample_index[k], sample_index[--count]);
+            }
+        }else {
+            sub_X=X;
+            sub_y=residual;
+            for (j=0; j<n_samples; j++) {
+                mask[j]=true;
+            }
+        }
+        uint tree_random_seed=randomMT();
+        TreeRegressor *t=new TreeRegressor(n_features,\
+                                           max_features,\
+                                           min_sample_leaf,\
+                                           max_depth,\
+                                           FIND_BEST,\
+                                           tree_random_seed,\
+                                           n_jobs);
+        //build tree
+        t->build(sub_X, sub_y, n_subsamples);
+        //update
+        loss->update(t, X, mask, y, y_pred, residual, n_samples, n_features, learning_rate, n_classes,0);
+        //add tree to ensemble
+        tree[i]=t;
+        if (val_X && val_y) {
+            t->predict(val_X, val_pred_stage, n_val_samples, n_features);
+            for (j=0; j<n_val_samples; j++) {
+                val_pred[j]+=learning_rate*val_pred_stage[j];
+            }
+        }
+        if ( (verbose && (i+1)%verbose==0) || i+1==n_trees) {      
+            //calculate train score and oob score;
+            train_score=loss->loss(y, y_pred, mask, true, n_samples, n_classes);
+            train_score=sqrt(train_score);
+            if (oob && false) {
+                oob_score=loss->loss(y, y_pred, mask, false, n_samples, n_classes);
+                oob_score=sqrt(oob_score);
+            }
+            if (verbose<=0) {
+                fprintf(stderr, "GBM train done.");
+            }else {
+                fprintf(stderr, "build tree %u of %u,",i+1,n_trees);
+            }
+            fprintf(stderr, "train RMSE=%f",train_score);
+            if (oob && false) {
+                fprintf(stderr, ",oob score(deviance)=%f",oob_score);
+            }
+            if (val_y) {
+                valid_score=rmse(val_pred, val_y, n_val_samples);
+                fprintf(stderr, ",validation RMSE=%f",valid_score);
+            }
+            fprintf(stderr, ".\n");
+
+        }
+    }
+    delete []mask;
+    delete []y_pred;
+    delete []residual;
+    if (subsample<1.0) {
+        delete []sample_index;
+        delete []sub_X;
+        delete []sub_y;
+    }
+    if (val_y) {
+        delete []val_pred_stage;
+        delete []val_pred;
+    }
+    end=time(NULL);
+    fprintf(stderr, "|Gradient Boosting Regressor training done. | Using time: %.0lf secs|\n",difftime(end, beg));
+    return ENSEMBLE_SUCCESS;
+}
+
+void GBMRegressor::predict(REAL **X, REAL *pred, uint nSamples, uint nFeatures,uint k_trees){
+#ifdef DEBUG
+    assert(k_trees<=n_trees);
+#endif
+    if (k_trees==0) {
+        k_trees=n_trees;
+    }
+    uint i,j;
+    REAL* pred_stage=new REAL[nSamples];
+    for (i=0; i<nSamples; i++) {
+        pred[i]=*prior_pred;
+    }
+    for (i=0; i<k_trees; i++) {
+        tree[i]->predict(X, pred_stage, nSamples, nFeatures);
+        for (j=0; j<nSamples; j++) {
+            pred[j]+=learning_rate*pred_stage[j];
+        }
+    }
+    delete []pred_stage;
+}
+
+void GBMRegressor::predict_verbose(REAL **X, REAL *y, REAL *pred, uint nSamples, uint nFeatures,uint k_trees){
+#ifdef DEBUG
+    assert(k_trees<=n_trees);
+#endif
+    if (k_trees==0) {
+        k_trees=n_trees;
+    }
+    uint i,j;
+    REAL* pred_stage=new REAL[nSamples];
+    for (i=0; i<nSamples; i++) {
+        pred[i]=*prior_pred;
+    }
+    for (i=0; i<k_trees; i++) {
+        tree[i]->predict(X, pred_stage, nSamples, nFeatures);
+        for (j=0; j<nSamples; j++) {
+            pred[j]+=learning_rate*pred_stage[j];
+        }
+        REAL MSE=mse(pred, y, nSamples);
+        REAL Rscore=R2(pred,y,nSamples);
+        fprintf(stderr,"GBMRegressor prediction: tree %u,MSE=%f,R^2=%f\n",i+1,MSE,Rscore);
+    }
+    delete []pred_stage;
+}
+
+int GBMRegressor::save_model(const char *filename){
+    FILE* fp=fopen(filename,"w");
+    if (!fp) {
+        fprintf(stderr, "Error: Cannot open file %s for save GBMRegressor model.Please check your file path is correct.\n",filename);
+        return false;
+    }
+    fprintf(fp, "GBMRegressor\n");
+    BaseGBM::save_model(fp);
+    fclose(fp);
+    return true;
+}
+#endif
diff --git a/include/tensemble/LossFunction.h b/include/tensemble/LossFunction.h
new file mode 100644
index 0000000..9f3a2cb
--- /dev/null
+++ b/include/tensemble/LossFunction.h
@@ -0,0 +1,350 @@
+/* * * * *
+ *  LossFunction.h
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_LossFunction_h
+#define libTM_LossFunction_h
+#include "Estimator.h"
+#include "Tree.h"
+
+class LossFunction {
+    /* ABSTRACT LOSS FUNCTION
+     * parameter n_classes and k in the following function is for classification
+     * get_init_estimate: return the prior estimate(initialize predction)
+     * loss: calculate the loss
+     * compute_residual: return the residual(or pseudoresponse).For multi-class problems,return the
+     * k-th residual(or pseudoresponse)
+     */
+    
+public:
+    REAL alpha;//just for QuantileLossFunction and HuberLossFunction
+public:
+    virtual ~LossFunction(){};
+    
+    virtual void get_init_estimate(REAL* y,REAL* pred,REAL* prior,uint nSamples,uint n_classes=1)=0;
+    
+    virtual REAL loss(REAL* y,REAL* pred,bool* mask,bool invert_mask,uint nSamples,uint n_classes=1)=0;
+    
+    virtual void compute_residual(REAL* residual, REAL* y,REAL* pred,uint nSamples,uint n_classes=1,uint k=0)=0;
+    
+    virtual void update(Tree* tree, REAL** X,bool* mask,REAL* y,REAL* pred,REAL* residual,\
+                        uint nSamples,uint nFeatures,REAL learning_rate,uint n_classes=1,uint k=0);
+    
+    virtual void update_terminal_region(Tree* tree, uint* region, bool* mask,REAL* y, \
+                                        REAL* residual,uint nSamples,uint n_classes)=0;
+};
+
+void LossFunction::update(Tree* tree, REAL** X,bool* mask,REAL* y,REAL* pred,REAL* residual,\
+                          uint nSamples,uint nFeature,REAL learning_rate,uint n_classes,uint k){
+    /* UPDATE FUNCTION FOR GRADIENT BOOSTING
+     * update k-th tree for gradient boosting.
+     * k=0,n_classes=1 for regression and two-classes classification
+     */
+    uint *region=new uint[nSamples];
+    tree->predict_terminal_region(X, region, nSamples, nFeature);
+    update_terminal_region(tree, region, mask, y, residual, nSamples,n_classes);
+    for (uint i=0; i<nSamples; i++) {
+#ifdef DEBUG
+        assert(y[i]==0 || y[i]==1);
+#endif
+        uint node_id=region[i];
+        pred[i]+=learning_rate*tree->nodes[node_id]->pred[0];
+//#ifdef DEBUG
+//        assert(pred[i]>=0);
+//#endif
+    }
+    delete []region;
+}
+
+class SquareLoss:public LossFunction {
+    //Square loss
+    
+public:
+    void get_init_estimate(REAL* y,REAL* pred,REAL* prior,uint nSamples,uint n_classes);
+    
+    REAL loss(REAL* y, REAL* pred,bool* mask,bool invert_mask,uint nSamples,uint n_classes);
+    
+    void compute_residual(REAL* residual, REAL* y,REAL* pred,uint nSamples,uint n_classes,uint k);
+    
+    void update(Tree* tree, REAL** X,bool* mask,REAL* y,REAL* pred,REAL* residual,\
+                        uint nSamples,uint nFeatures,REAL learning_rate,uint n_classes,uint k);
+    
+    void update_terminal_region(Tree* tree, uint* region, bool* mask,REAL* y, \
+                                        REAL* residual,uint nSamples,uint n_classes);
+};
+
+void SquareLoss::get_init_estimate(REAL *y, REAL *pred, REAL* prior, uint nSamples,uint n_classes){
+    return MeanEstimator(y, pred, prior,nSamples);
+}
+
+REAL SquareLoss::loss(REAL *y, REAL *pred,bool* mask,bool invert_mask,uint nSamples,uint n_classes){
+    /* compute the loss
+     * if invert_mask=true,return train score,else return OOB score.
+     */
+    REAL ret=0.0;
+    uint count=0;
+    for (uint i=0; i<nSamples; i++) {
+        if ( (mask[i] && !invert_mask) || (!mask[i] && invert_mask) ) {
+            continue;
+        }
+        ret+=(y[i]-pred[i])*(y[i]-pred[i]);
+        count++;
+    }
+    return ret/count;
+}
+
+void SquareLoss::compute_residual(REAL *residual, REAL *y, REAL *pred, uint nSamples,uint n_classes,uint k){
+    /* compute the working residual, equals negative gradient. 
+     */
+//    uint count=0;
+    for (uint i=0; i<nSamples; i++) {
+//        if (!mask[i]) {
+//            continue;
+//        }
+        residual[i]=y[i]-pred[i];
+    }
+}
+
+void SquareLoss::update(Tree *tree, REAL **X, bool *mask, REAL *y, REAL *pred, REAL *residual, uint nSamples, uint nFeatures, REAL learning_rate,uint n_classes, uint k){
+    /* Least squares does not need to update terminal regions.*/
+#ifdef DEBUG
+    assert(n_classes==1);
+    assert(k==0);
+#endif
+    REAL* r_pred=new REAL[nSamples];
+    tree->predict(X, r_pred, nSamples, nFeatures);
+    for (uint i=0; i<nSamples; i++) {
+        pred[i]+=learning_rate*r_pred[i];
+    }
+    delete []r_pred;
+}
+
+void SquareLoss::update_terminal_region(Tree *tree, uint *region, bool *mask, REAL *y, REAL *residual, uint nSamples,uint n_classes){
+    /* Least squares does not need to update terminal regions.*/
+    
+}
+
+class BinomialDeviance:public LossFunction {
+    //negative binomial log-likelihood loss
+    
+public:
+    void get_init_estimate(REAL* y,REAL* pred,REAL* prior,uint nSamples,uint n_classes);
+    
+    REAL loss(REAL* y, REAL* pred,bool* mask,bool invert_mask,uint nSamples,uint n_classes);
+    
+    void compute_residual(REAL* residual, REAL* y,REAL* pred,uint nSamples,uint n_classes,uint k);
+    
+    void update_terminal_region(Tree* tree, uint* region, bool* mask,REAL* y, \
+                                REAL* residual,uint nSamples,uint n_classes);
+};
+
+void BinomialDeviance::get_init_estimate(REAL *y, REAL *pred, REAL* prior, uint nSamples,uint n_classes){
+    return LogOddsEstimator(y, pred, prior, nSamples);
+}
+
+REAL BinomialDeviance::loss(REAL *y, REAL *pred,bool* mask,bool invert_mask,uint nSamples,uint n_classes){
+    /* compute the loss
+     * if invert_mask=true,return train score,else return OOB score.
+     */
+    REAL deviance=0;
+    uint count=0;
+    for (uint i=0; i<nSamples; i++) {
+        if ( (mask[i] && !invert_mask) || (!mask[i] && invert_mask) ) {
+            continue;
+        }
+#ifdef DEBUG
+        assert(y[i]==0 || y[i]==1);
+#endif
+        deviance-=y[i]*pred[i]-log(1.0+exp(pred[i]));
+        count++;
+    }
+    return deviance/count;
+}
+
+void BinomialDeviance::compute_residual(REAL *residual, REAL *y, REAL *pred, uint nSamples,uint n_classes,uint k){
+    /* compute the working residual, equals negative gradient. 
+     * y has zero-one values.e.g y[i]=1 if sample i belong positive class,otherwise y[i]=0.
+     */
+
+//    uint count=0;
+    for (uint i=0; i<nSamples; i++) {
+//        if (!mask[i]) {
+//            continue;
+//        }
+#ifdef DEBUG
+        assert(y[i]==0 || y[i]==1);
+#endif
+        residual[i]=y[i]-1.0/(1.0+exp(-pred[i]));
+    }
+}
+
+void BinomialDeviance::update_terminal_region(Tree* tree, uint* region, bool* mask, REAL* y,\
+                            REAL* residual,uint nSamples,uint n_classes){
+    /* UPDATE TERMINAL REGION FOR BinomialDeviance
+     * Make a single Newton-Raphson step
+     * sample i is out of bag sample if mask[i]=false
+     */
+    uint i,k,numNodes=tree->numNodes;
+    REAL *numerator,*denominator;
+    numerator=new REAL[numNodes+1];
+    denominator=new REAL[numNodes+1];
+    for (i=ROOT; i<=numNodes; i++) {
+        numerator[i]=denominator[i]=0;
+    }
+//    uint count=0;
+    for (i=0; i<nSamples; i++) {
+        if (!mask[i]) {
+            continue;
+        }
+#ifdef DEBUG
+        assert(y[i]==0 || y[i]==1);
+#endif
+        k=region[i];
+        numerator[k]+=residual[i];
+        denominator[k]+=(y[i]-residual[i])*(1.0-y[i]+residual[i]);
+//        count++;
+    }
+    for (i=ROOT; i<=numNodes; i++) {
+        //update terminal nodes
+        if (tree->nodes[i]->leaf) {
+            //using pred[0] to store the GBM prediction
+            if (denominator[i]==0) {
+                tree->nodes[i]->pred[0]=0;
+            }else{
+                tree->nodes[i]->pred[0]=numerator[i]/denominator[i];
+            }
+        }
+    }
+    delete []numerator;
+    delete []denominator;
+}
+
+class MultinomialDeviance:public LossFunction {
+    //multi-class logistic loss
+    
+public:
+    void get_init_estimate(REAL* y,REAL* pred,REAL* prior,uint nSamples,uint n_classes);
+    
+    REAL loss(REAL* y, REAL* pred,bool* mask,bool invert_mask,uint nSamples,uint n_classes);
+    
+    void compute_residual(REAL* residual, REAL* y,REAL* pred,uint nSamples,uint n_classes,uint k);
+    
+    void update_terminal_region(Tree* tree, uint* region, bool* mask,REAL* y, \
+                                REAL* residual,uint nSamples,uint n_classes);
+};
+
+void MultinomialDeviance::get_init_estimate(REAL *y, REAL *pred,REAL* prior, uint nSamples, uint n_classes){
+    return MulticlassPriorEstimator(y, pred, prior, nSamples, n_classes);
+}
+
+REAL MultinomialDeviance::loss(REAL *y, REAL *pred,bool* mask,bool invert_mask, uint nSamples,uint n_classes){
+    /* multinomial deviance loss
+     * here y belong to [0,1,...,n_classes-1],not zero-one values
+     * if invert_mask=true,return train score,else return OOB score.
+     */
+    
+    uint *zero_one_y=new uint[nSamples*n_classes];
+    uint i,j,k;
+    REAL deviance=0.0,sum_exp;
+    uint count=0;
+    memset(zero_one_y, 0, nSamples*n_classes*sizeof(uint));
+    for (i=0; i<nSamples; i++) {
+#ifdef DEBUG
+        assert(y[i]>=0 && y[i]<n_classes);
+#endif
+        k=y[i];
+        zero_one_y[k*nSamples+i]=1;
+    }
+    
+    for (i=0; i<nSamples; i++) {
+        if ( (mask[i] && !invert_mask) || (!mask[i] && invert_mask) ) {
+            continue;
+        }
+        sum_exp=0.0;
+        for (j=0; j<n_classes; j++) {
+            sum_exp+=exp(pred[j*nSamples+i]);
+        }
+#ifdef DEBUG
+        assert(sum_exp!=0);
+#endif
+        for (j=0; j<n_classes; j++) {
+            deviance-=((REAL)zero_one_y[j*nSamples+i]*pred[j*nSamples+i]-log(sum_exp));
+        }
+        count++;
+    }
+    delete []zero_one_y;
+    return deviance/count;
+}
+
+void MultinomialDeviance::compute_residual(REAL *residual, REAL *y, REAL *pred, uint nSamples,uint n_classes, uint k){
+    /* Compute residual(or pseudoresponse,negative gradient) for the k-th class. 
+     * y has zero-one values.e.g y[i]=1 if sample i belong k-th class,otherwise y[i]=0.
+     * pred has size nSamples*n_classes,that is pred=[pred_1,pred_2,...,pred_n_classes]
+     * pred_k=[f_k(X_1),f_k(X_2),...,f_k(X_nSamples)]
+     */
+    uint i,j;
+    REAL sum_exp,prob_k;
+    for (i=0; i<nSamples; i++) {
+#ifdef DEBUG
+        assert(y[i]==0 || y[i]==1);
+#endif
+        sum_exp=0.0;
+        for (j=0; j<n_classes; j++) {
+            sum_exp+=exp(pred[j*nSamples+i]);
+        }
+        if (sum_exp==0.0) {
+            prob_k=0.0;
+        }else{
+            prob_k=exp(pred[k*nSamples+i])/sum_exp;
+        }
+        residual[i]=y[i]-prob_k;
+    }
+}
+
+void MultinomialDeviance::update_terminal_region(Tree *tree, uint *region, bool *mask, REAL *y, REAL *residual, uint nSamples,uint n_classes){
+    /* UPDATE TERMINAL REGION FOR MultinomialDeviance
+     * Make a single Newton-Raphson step
+     * sample i is out of bag sample if mask[i]=false
+     */
+    uint i,k,numNodes=tree->numNodes;
+    REAL *numerator,*denominator;
+    numerator=new REAL[numNodes+1];
+    denominator=new REAL[numNodes+1];
+    for (i=ROOT; i<=numNodes; i++) {
+        numerator[i]=denominator[i]=0.0;
+    }
+//    uint count=0;
+    for (i=0; i<nSamples; i++) {
+        if (!mask[i]) {
+            continue;
+        }
+#ifdef DEBUG
+        assert(y[i]==0 || y[i]==1);
+#endif
+        k=region[i];
+        numerator[k]+=residual[i];
+        denominator[k]+=(y[i]-residual[i])*(1-y[i]+residual[i]);
+    }
+    for (i=ROOT; i<=numNodes; i++) {
+        //update terminal nodes
+        if (tree->nodes[i]->leaf) {
+            //using pred[0] to store the GBM prediction
+            if (denominator[i]==0) {
+                tree->nodes[i]->pred[0]=0.0;
+            }else{
+                tree->nodes[i]->pred[0]=((REAL)(n_classes-1.0)/n_classes)*numerator[i]/denominator[i];
+            }
+        }
+    }
+    delete []numerator;
+    delete []denominator;
+}
+#endif
diff --git a/include/tensemble/RandomForestClassifier.h b/include/tensemble/RandomForestClassifier.h
new file mode 100644
index 0000000..a5c34a9
--- /dev/null
+++ b/include/tensemble/RandomForestClassifier.h
@@ -0,0 +1,269 @@
+/* * * * *
+ *  RandomForestClassifier.h
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_RandomForestClassifier_h
+#define libTM_RandomForestClassifier_h
+#include "BaseForest.h"
+
+//global function for parallel build random forest classifier
+void RFC_build_trees_range(REAL** X,REAL* y,uint n_samples,BaseForest* forest,std::pair<uint, uint> n_trees_range,uint* oob_count,REAL* oob_prediction,REAL* importances);
+
+class RandomForestClassifier:public BaseForest {
+    //A random forest regressor
+    
+public:
+    RandomForestClassifier():BaseForest(){};
+    
+    RandomForestClassifier(int split_criterion,uint n_trees,uint n_features,uint max_depth,uint min_sample_leaf, REAL max_features_ratio,bool bootstrap,bool oob,bool compute_importance,uint random_seed,uint n_jobs,bool verbose);
+    
+    ~RandomForestClassifier();
+    
+    int build(REAL** X,REAL* original_y,uint n_samples);
+    
+    void predict(REAL** X,REAL* pred,uint nSamples,uint nFeatures);
+    
+    void predict_prob(REAL** X,REAL* pred_prob,uint nSamples,uint nFeatures);
+    
+    void score2label(REAL* score,REAL* pred_label,uint nSamples);
+    
+    int save_model(const char* filename);
+    
+private:
+    void convert_to_unique(REAL* original_y,REAL* y,uint nSamples);
+};
+RandomForestClassifier::RandomForestClassifier(int split_criterion,\
+                                             uint n_trees,\
+                                             uint n_features,\
+                                             uint max_depth,\
+                                             uint min_sample_leaf,\
+                                             REAL max_features_ratio,\
+                                             bool bootstrap,\
+                                             bool oob,\
+                                             bool compute_importance,\
+                                             uint random_seed,\
+                                             uint n_jobs,\
+                                             bool verbose)\
+:BaseForest(split_criterion, n_trees,n_features,max_depth,min_sample_leaf, max_features_ratio,FIND_BEST, bootstrap,oob,compute_importance,random_seed,n_jobs,verbose){    
+    
+}
+RandomForestClassifier::~RandomForestClassifier(){
+    
+}
+
+
+int RandomForestClassifier::build(REAL** X,REAL* original_y,uint n_samples){
+    time_t beg,end;
+    beg=time(NULL);
+    uint i,j,k,**oob_count,n_trees_beg,n_trees_end;
+    REAL** oob_prediction,**importance_jobs,*oob_label_prediction;
+    REAL* y;
+    
+    //convert y to unique classes
+    y=new REAL[n_samples];
+    convert_to_unique(original_y, y, n_samples);
+    if (n_classes==1) {
+        fprintf(stderr, "The training file only have one label,at least two-labels are required for classification.Please check your file.\n");
+        delete []y;
+        return ENSEMBLE_FAIL;
+    }
+    
+    std::pair<uint, uint> n_trees_range;
+    oob_count=new uint* [n_jobs];
+    oob_prediction=new REAL* [n_jobs];
+    importance_jobs=new REAL*[n_jobs];
+    if (oob) {
+        oob_label_prediction=new REAL[n_samples];
+        for (j=0; j<n_jobs; j++) {
+            oob_count[j]=new uint[n_samples];
+            oob_prediction[j]=new REAL[n_samples*n_classes];
+        }
+    }
+    if (compute_importance) {
+        this->importances=new REAL[n_features];
+        for (j=0; j<n_jobs; j++) {
+            importance_jobs[j]=new REAL[n_features];
+        }
+    }
+    boost::thread** thread=new boost::thread*[n_jobs];
+    for (j=0; j<n_jobs; j++) {
+        n_trees_beg=(n_trees/n_jobs)*j;
+        n_trees_end=(n_trees/n_jobs)*(j+1);
+        if (j==n_jobs-1) {
+            //set the rest trees to the last job.
+            n_trees_end=n_trees;
+        }
+        n_trees_range.first=n_trees_beg;
+        n_trees_range.second=n_trees_end;
+        thread[j]=new boost::thread(bind(RF_build_trees_range, X, y, n_samples, this, n_trees_range, oob_count[j], oob_prediction[j], importance_jobs[j]));
+    }
+    for (j=0; j<n_jobs; j++) {
+        thread[j]->join();
+        delete thread[j];
+    }
+    if (verbose) {
+        fprintf(stderr, "\n");
+    }
+    if (oob) {
+        bool warn_flag=false;
+        for (i=0; i<n_samples; i++) {
+            for (j=1; j<n_jobs; j++) {
+                oob_count[0][i]+=oob_count[j][i];
+                for (k=0; k<n_classes; k++) {
+                    oob_prediction[0][i*n_classes+k]+=oob_prediction[j][i*n_classes+k];
+                }
+            }
+            if (oob_count[0][i]==0) {
+                warn_flag==true?warn_flag=true:\
+                (fprintf(stderr, "WARN: Some inputs do not have OOB scores.This probably means too few trees were used to compute any reliable oob estimates.\n"),warn_flag=true);
+                oob_count[0][i]=1;
+            }
+            for (k=0; k<n_classes; k++) {
+                oob_prediction[0][i*n_classes+k]/=oob_count[0][i];
+            }
+        }
+        score2label(oob_prediction[0], oob_label_prediction, n_samples);
+        oob_scores=Accuracy(original_y, oob_label_prediction, n_samples);
+        if (1) {
+            fprintf(stderr, "Out-of-bag score(Accuracy)=%lf\n",oob_scores);
+        }
+        for (i=0; i<n_jobs; i++) {
+            delete []oob_count[i];
+            delete []oob_prediction[i];
+        }
+        delete []oob_count;
+        delete []oob_prediction;
+        delete []oob_label_prediction;
+    }
+    if (compute_importance) {
+        for (i=0; i<n_features; i++) {
+            importances[i]=0;
+            for (j=0; j<n_jobs; j++) {
+                importances[i]+=importance_jobs[j][i];
+            }
+            importances[i]/=n_trees;
+        }
+        for (i=0; i<n_jobs; i++) {
+            delete []importance_jobs[i];
+        }
+        delete []importance_jobs;
+    }
+    delete []y;
+    end=time(NULL);
+    fprintf(stderr, "|Random Forest training done. | Using time: %.0lf secs|\n",difftime(end, beg));
+    return ENSEMBLE_SUCCESS;
+}
+
+void RandomForestClassifier::convert_to_unique(REAL *original_y, REAL *y,uint nSamples){
+    std::map<int,uint> key_values;
+    std::map<int,uint>::const_iterator iter;
+    vector<int> classes_map;
+    n_classes=0;
+    for (uint i=0; i<nSamples; i++) {
+        int origin_class=(int)original_y[i];
+        if ((iter=key_values.find(origin_class))==key_values.end()) {
+            key_values.insert(make_pair(origin_class, n_classes));
+            y[i]=n_classes;
+            classes_map.push_back(origin_class);
+            n_classes++;
+        }else {
+            y[i]=iter->second;
+        }
+    }
+    original_classes=new int[n_classes];
+    for (uint i=0; i<n_classes; i++) {
+        original_classes[i]=classes_map[i];
+    }
+}
+
+void RandomForestClassifier::score2label(REAL *score, REAL *pred_label, uint nSamples){
+    uint idx,i,j;
+    for (i=0; i<nSamples; i++) {
+        REAL prob=-1;
+        for (j=0; j<n_classes; j++) {
+            if (score[i*n_classes+j]>prob) {
+                prob=score[i*n_classes+j];
+                idx=j;
+            }
+        }
+        pred_label[i]=original_classes[idx];
+    }
+}
+
+void RandomForestClassifier::predict(REAL** X,REAL* pred,uint nSamples,uint nFeatures){
+    uint i,j;
+    REAL* single_tree_pred=new REAL[nSamples*n_classes];
+    REAL* score=new REAL[nSamples*n_classes];
+#ifdef DEBUG
+    assert(nFeatures==this->n_features);
+#endif
+    for (i=0; i<nSamples*n_classes; i++) {
+        score[i]=0;
+    }
+    for (i=0; i<n_trees; i++) {
+#ifdef DEBUG
+        assert(tree[i]!=NULL);
+#endif
+        tree[i]->predict(X, single_tree_pred, nSamples, nFeatures);
+        for (j=0; j<nSamples*n_classes; j++) {
+            score[j]+=single_tree_pred[j];
+        }
+    }
+    
+    //average prediction
+    for (i=0; i<nSamples*n_classes; i++) {
+        score[i]/=n_trees;
+    }
+//    predict_prob(X, score, nSamples, nFeatures);
+    score2label(score, pred, nSamples);
+    delete [] score;
+    delete [] single_tree_pred;
+}
+
+void RandomForestClassifier::predict_prob(REAL **X, REAL *pred_prob, uint nSamples, uint nFeatures){
+    uint i,j;
+    REAL* single_tree_pred=new REAL[nSamples*n_classes];
+#ifdef DEBUG
+    assert(nFeatures==this->n_features);
+#endif
+    for (i=0; i<nSamples*n_classes; i++) {
+        pred_prob[i]=0;
+    }
+    for (i=0; i<n_trees; i++) {
+#ifdef DEBUG
+        assert(tree[i]!=NULL);
+#endif
+        tree[i]->predict(X, single_tree_pred, nSamples, nFeatures);
+        for (j=0; j<nSamples*n_classes; j++) {
+            pred_prob[j]+=single_tree_pred[j];
+        }
+    }
+    //average prediction
+    for (i=0; i<nSamples*n_classes; i++) {
+        pred_prob[i]/=n_trees;
+    }
+    delete []single_tree_pred;
+}
+
+
+int RandomForestClassifier::save_model(const char *filename){
+    FILE* fp=fopen(filename,"w");
+    if (!fp) {
+        fprintf(stderr, "Error: Cannot open file %s for save RandomForestClassifier model.Please check your file path is correct.\n",filename);
+        return false;
+    }
+    fprintf(fp, "RandomForestClassifier\n");
+    BaseForest::save_model(fp);
+    fclose(fp);
+    return true;
+}
+
+#endif
diff --git a/include/tensemble/RandomForestRegressor.h b/include/tensemble/RandomForestRegressor.h
new file mode 100644
index 0000000..3e837df
--- /dev/null
+++ b/include/tensemble/RandomForestRegressor.h
@@ -0,0 +1,279 @@
+/* * * * *
+ *  RandomForestRegressor.h
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_RandomForestRegressor_h
+#define libTM_RandomForestRegressor_h
+#include "BaseForest.h"
+
+extern boost::mutex RF_mutex;
+//global function for parallel build random forest regressor
+void RFG_build_trees_range(REAL** X,REAL* y,uint n_samples,BaseForest* forest,uint n_trees_beg,uint n_trees_end,uint* oob_count,REAL* oob_prediction,REAL* importances);
+
+class RandomForestRegressor:public BaseForest {
+    //A random forest regressor
+    
+public:
+    RandomForestRegressor():BaseForest(){};
+    
+    RandomForestRegressor(uint n_trees,uint n_features,uint max_depth,uint min_sample_leaf, REAL max_features_ratio,bool bootstrap,bool oob,bool compute_importance,uint random_seed,uint n_jobs,bool verbose);
+    
+    ~RandomForestRegressor();
+    
+    int build(REAL** X,REAL* y,uint n_samples);
+    
+    void predict(REAL** X,REAL* pred,uint nSamples,uint nFeatures);
+    
+    int save_model(const char* filename);
+    
+};
+RandomForestRegressor::RandomForestRegressor(uint n_trees,\
+                                             uint n_features,\
+                                             uint max_depth,\
+                                             uint min_sample_leaf,\
+                                             REAL max_features_ratio,\
+                                             bool bootstrap,\
+                                             bool oob,\
+                                             bool compute_importance,\
+                                             uint random_seed,\
+                                             uint n_jobs,\
+                                             bool verbose)\
+:BaseForest(CRITERION_MSE,n_trees,n_features,max_depth,min_sample_leaf, max_features_ratio,FIND_BEST, bootstrap,oob,compute_importance,random_seed,n_jobs,verbose){    
+}
+RandomForestRegressor::~RandomForestRegressor(){
+    
+}
+
+int RandomForestRegressor::build(REAL** X,REAL* y,uint n_samples){
+    
+    //set n_classes=1 for regression
+    this->n_classes=1;
+    
+    time_t beg,end;
+    beg=time(NULL);
+    uint i,j,**oob_count,n_trees_beg,n_trees_end;
+    REAL** oob_prediction,**importance_tmp;
+    oob_count=new uint* [n_jobs];
+    oob_prediction=new REAL* [n_jobs];
+    importance_tmp=new REAL*[n_jobs];
+    std::pair<uint,uint> n_trees_range;
+    if (oob) {
+        for (j=0; j<n_jobs; j++) {
+            oob_count[j]=new uint[n_samples];
+            oob_prediction[j]=new REAL[n_samples];
+        }
+    }
+    if (compute_importance) {
+        this->importances=new REAL[n_features];
+        for (j=0; j<n_jobs; j++) {
+            importance_tmp[j]=new REAL[n_features];
+        }
+    }
+    boost::thread** thread=new boost::thread*[n_jobs];
+    for (j=0; j<n_jobs; j++) {
+        n_trees_beg=(n_trees/n_jobs)*j;
+        n_trees_end=(n_trees/n_jobs)*(j+1);
+        if (j==n_jobs-1) {
+            //set the rest trees to the last job.
+            n_trees_end=n_trees;
+        }
+        n_trees_range.first=n_trees_beg;
+        n_trees_range.second=n_trees_end;
+        thread[j]=new boost::thread(bind(RF_build_trees_range, X, y, n_samples, this, n_trees_range, oob_count[j], oob_prediction[j], importance_tmp[j]));
+    }
+    for (j=0; j<n_jobs; j++) {
+        thread[j]->join();
+        delete thread[j];
+    }
+    if (verbose) {
+        fprintf(stderr, "\n");
+    }
+    if (oob) {
+        bool warn_flag=false;
+        for (i=0; i<n_samples; i++) {
+            for (j=1; j<n_jobs; j++) {
+                oob_count[0][i]+=oob_count[j][i];
+                oob_prediction[0][i]+=oob_prediction[j][i];
+            }
+            if (oob_count[0][i]==0) {
+                warn_flag==true?warn_flag=true:\
+                (fprintf(stderr, "WARN: Some inputs do not have OOB scores.This probably means too few trees were used to compute any reliable oob estimates.\n"),warn_flag=true);
+                oob_count[0][i]=1;
+            }
+            oob_prediction[0][i]/=oob_count[0][i];
+        }
+        REAL oob_R2=R2(oob_prediction[0], y, n_samples);
+        oob_scores=rmse(oob_prediction[0], y, n_samples);
+        if (1) {
+            fprintf(stderr, "Out-of-bag score(RMSE,Correlation Coefficient)=(%lf,%lf)\n",oob_scores,oob_R2);
+        }
+        for (i=0; i<n_jobs; i++) {
+            delete []oob_count[i];
+            delete []oob_prediction[i];
+        }
+        delete []oob_count;
+        delete []oob_prediction;
+    }
+    if (compute_importance) {
+        for (i=0; i<n_features; i++) {
+            importances[i]=0;
+            for (j=0; j<n_jobs; j++) {
+                importances[i]+=importance_tmp[j][i];
+            }
+            importances[i]/=n_trees;
+        }
+        for (i=0; i<n_jobs; i++) {
+            delete []importance_tmp[i];
+        }
+        delete []importance_tmp;
+    }
+    end=time(NULL);
+    fprintf(stderr, "|Random Forest training done. | Using time: %.0lf secs|\n",difftime(end, beg));
+    return ENSEMBLE_SUCCESS;
+}
+void RandomForestRegressor::predict(REAL** X,REAL* pred,uint nSamples,uint nFeatures){
+    uint i,j;
+    REAL* single_tree_pred=new REAL[nSamples];
+#ifdef DEBUG
+    assert(nFeatures==this->n_features);
+#endif
+    for (i=0; i<nSamples; i++) {
+        pred[i]=0;
+    }
+    for (i=0; i<n_trees; i++) {
+#ifdef DEBUG
+        assert(tree[i]!=NULL);
+#endif
+        tree[i]->predict(X, single_tree_pred, nSamples, nFeatures);
+        for (j=0; j<nSamples; j++) {
+            pred[j]+=single_tree_pred[j];
+        }
+    }
+    
+    //average prediction
+    for (i=0; i<nSamples; i++) {
+        pred[i]/=n_trees;
+    }
+    delete [] single_tree_pred;
+}
+
+
+void RFG_build_trees_range(REAL** X,REAL* y,uint n_samples,BaseForest* forest,uint n_trees_beg,uint n_trees_end,uint* oob_count,REAL* oob_prediction,REAL* importances){
+    Tree** tree=forest->tree;
+    uint n_features=forest->n_features;
+    uint max_features=forest->max_features;
+    uint max_depth=forest->max_depth;
+    uint min_sample_leaf=forest->min_sample_leaf;
+    uint find_split_algorithm=forest->find_split_algorithm;
+    uint random_seed=forest->random_seed;
+    bool oob=forest->oob;
+    bool compute_importance=forest->compute_importance;
+    bool bootstrap=forest->bootstrap;
+    
+    uint i,j;
+    REAL *oob_prediction_tmp,*importance_tmp;
+    REAL** sub_X;
+    REAL* sub_y;
+    bool* mask;//for oob prediction
+    if (oob) {
+        oob_prediction_tmp=new REAL[n_samples];
+        for (i=0; i<n_samples; i++) {
+            oob_prediction[i]=0;
+            oob_count[i]=0;
+        }
+        mask=new bool[n_samples];
+    }
+    if (compute_importance) {
+        importance_tmp=new REAL[n_features];
+        for (j=0; j<n_features; j++) {
+            importances[j]=0;
+        }
+    }
+    if (bootstrap) {
+        sub_X=new REAL* [n_samples];
+        sub_y=new REAL[n_samples];
+    }
+    for (i=n_trees_beg; i<n_trees_end; i++) {
+        if (oob) {
+            for (j=0; j<n_samples; j++) {
+                mask[j]=false;
+            }
+        }
+        if (bootstrap) {
+            for (j=0; j<n_samples; j++) {
+                //                uint idx=rand()%n_samples;
+                uint idx=randomMT()%n_samples;
+                sub_X[j]=X[idx];
+                sub_y[j]=y[idx];
+                if (oob) {
+                    mask[idx]=true;
+                }
+            }
+        }else {
+            sub_X=X;
+            sub_y=y;
+        }
+        if (1) {
+            fprintf(stderr, "Random forest: building tree %d\n",i);
+        }
+        //build tree
+        tree[i]=new TreeRegressor(n_features,\
+                                  max_features,\
+                                  min_sample_leaf,\
+                                  max_depth,\
+                                  find_split_algorithm,\
+                                  random_seed,\
+                                  1);
+        tree[i]->build(sub_X, sub_y, n_samples);
+//        tree[i]->resize();
+        if (oob) {
+            tree[i]->predict(X, oob_prediction_tmp, mask, n_samples, n_features);
+            for (j=0; j<n_samples; j++) {
+                if (mask[j]) {
+                    continue;
+                }
+                oob_count[j]++;
+                oob_prediction[j]+=oob_prediction_tmp[j];
+            }
+        }
+        if (compute_importance) {
+            tree[i]->compute_importance(importance_tmp);
+            for (j=0; j<n_features; j++) {
+                importances[j]+=importance_tmp[j];
+            }
+            
+        }
+    }
+    if (oob) {
+        delete []mask;
+        delete []oob_prediction_tmp;
+    }
+    if (compute_importance) {
+        delete []importance_tmp;
+    }
+    if (bootstrap) {
+        delete []sub_X;
+        delete []sub_y;
+    }
+}
+
+int RandomForestRegressor::save_model(const char *filename){
+    FILE* fp=fopen(filename,"w");
+    if (!fp) {
+        fprintf(stderr, "Error: Cannot open file %s for save RandomForestRegressor model.Please check your file path is correct.\n",filename);
+        return false;
+    }
+    fprintf(fp, "RandomForestRegressor\n");
+    BaseForest::save_model(fp);
+    fclose(fp);
+    return true;
+}
+#endif
diff --git a/include/tensemble/RandomGenerator.h b/include/tensemble/RandomGenerator.h
new file mode 100644
index 0000000..824cc20
--- /dev/null
+++ b/include/tensemble/RandomGenerator.h
@@ -0,0 +1,33 @@
+/* * * * *
+ *  RandomGenerator.h
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_RandomGenerator_h
+#define libTM_RandomGenerator_h
+//#include    "cokus.h"
+#define MAX_UINT_COKUS 4294967295
+#define uint32 unsigned long
+#define SMALL_INT char
+
+#ifdef MATLAB
+#define SMALL_INT_CLASS mxCHAR_CLASS //will be used to allocate memory t
+#endif
+
+
+//this two function are implemented in cokus.cpp
+void seedMT(uint32 seed);
+uint32 randomMT(void);
+
+REAL unif_rand(){
+    return (((REAL)randomMT())/((REAL)MAX_UINT_COKUS));
+}
+
+#endif
diff --git a/include/tensemble/ReadData.h b/include/tensemble/ReadData.h
new file mode 100644
index 0000000..328054f
--- /dev/null
+++ b/include/tensemble/ReadData.h
@@ -0,0 +1,163 @@
+/* * * * *
+ *  ReadData.h 
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_ReadData_h
+#define libTM_ReadData_h
+#include "FeatureData.h"
+#include "fstream"
+#include <cstdlib>
+using namespace std;
+#define PRINT_DEBUG
+
+struct Data {
+public:
+    Data(){
+        X=NULL;
+        y=NULL;
+        n_samples=n_features=-1;
+    }
+    ~Data(){
+        if (X) {
+            for (int i=0; i<n_samples; i++) {
+                delete []X[i];
+            }
+            delete []X;
+            X=NULL;
+        }
+        if (y) {
+            delete []y;
+            y=NULL;
+        }
+    }
+    void set_size(int n_samples,int n_features){
+        this->n_samples=n_samples;
+        this->n_features=n_features;
+        X=new REAL*[n_samples];
+        y=new REAL[n_samples];
+        for (int i=0; i<n_samples; i++) {
+            X[i]=new REAL[n_features];
+        }
+    }
+public:
+    REAL** X;
+    REAL* y;
+    int n_samples;
+    int n_features;
+};
+
+bool readData(Data& data, const char* x_file,const char* y_file=NULL){
+    char* endptr,*idx,*val;
+    ifstream x_input(x_file);
+    ifstream y_input;
+    if (y_file) {
+        y_input.open(y_file);
+    }
+    int x_linenum,y_linenum;
+    x_linenum=y_linenum=0;
+    int n_samples=0;
+    string strline;
+	if (x_input.fail()) {
+	    fprintf(stderr, "Error: unable to open data point file \"%s\".\n", x_file);
+	    return false;
+    }
+    if (y_file && y_input.fail()) {
+        fprintf(stderr, "Error: unable to open target file \"%s\".\n",y_file);
+        return false;
+    }
+    //check file correctness
+    getline(x_input,strline);
+    while (!x_input.eof()) {
+        x_linenum++;
+        getline(x_input,strline);
+    }
+    if (y_file) {
+        getline(y_input,strline);
+        while (!y_input.eof()) {
+            y_linenum++;
+            getline(y_input,strline);
+        }
+        if (x_linenum!=y_linenum+1) {
+            fprintf(stderr, "Error: number of samples in data point file(%d samples) and target file(%d samples) are not consistent.Please check your input file.\n",x_linenum-1,y_linenum );
+            return false;
+        }
+        y_input.clear();
+        y_input.seekg(0,ios_base::beg);
+    }
+    //reset
+    x_input.clear();
+    x_input.seekg(0,ios_base::beg);
+    
+    n_samples=x_linenum-1;
+    getline(x_input, strline);
+    int n_features=(int)strtol(strline.c_str(), &endptr, 10);
+    if (endptr == strline.c_str() || (*endptr != '\0' && !isspace(*endptr))){
+        fprintf(stderr, "Error: wrong input format at line %d in file %s\n",1,x_file);
+        return false;
+    }
+    if (n_features<=0) {
+        fprintf(stderr, "Error: number of features is 0 in training file.Please check your input file.\n");
+        return false;
+    }
+    //read data
+    data.set_size(n_samples, n_features);
+    for (uint i=0; i<n_samples; i++) {
+        if (y_file) {
+            getline(y_input,strline);
+            data.y[i]=strtod(strline.c_str(), &endptr);        
+            if (endptr == strline.c_str() || (*endptr != '\0' && !isspace(*endptr))){
+                fprintf(stderr, "Error: wrong input format at line %d in file %s\n",i+1,y_file);
+                return false;
+            }
+        }
+        
+        getline(x_input, strline);
+        char* line = strdup(strline.c_str()); 
+//        cout<<line<<endl;
+        idx = NULL;
+        idx=strtok(line, ":");
+        if (idx==NULL) {
+            fprintf(stderr, "Error: Empty line at line %d in file %s\n",i+2,x_file);
+            free(line);
+            return false;
+        }
+        while (idx!=NULL) { // tok is feature:value
+            val=strtok(NULL, " \t");
+            if (val==NULL) {
+                break;
+            }
+            int index=(int)strtol(idx, &endptr, 10);
+            // check for errors
+            if (endptr == idx || (*endptr != '\0' && !isspace(*endptr))){
+                fprintf(stderr, "Error: wrong input format at line %d in file %s\n",i+2,x_file);
+                free(line);
+                return false;
+            }
+            if (index==0 || index>n_features) {
+                fprintf(stderr, "Error: wrong input format at line %d in file %s.index must from 1 to n_features(number of features).Please check your input file.\n",i+2,x_file);
+                free(line);
+                return false;
+            }
+            REAL value=strtod(val, &endptr);
+            if (endptr == val || (*endptr != '\0' && !isspace(*endptr))){
+                fprintf(stderr, "Error: wrong input format at line %d in file %s\n",i+2,x_file);
+                free(line);
+                return false;
+            }
+            data.X[i][index-1]=value;
+            idx=strtok(NULL, ":");
+        }
+        free(line);
+    }
+    return true;
+}
+
+#endif
diff --git a/include/tensemble/Tree.h b/include/tensemble/Tree.h
new file mode 100644
index 0000000..20e1911
--- /dev/null
+++ b/include/tensemble/Tree.h
@@ -0,0 +1,731 @@
+/* * * * *
+ *  Tree.h
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_MultiThread_Tree_h
+#define libTM_MultiThread_Tree_h
+
+#include <iostream>
+#include <string>
+#include "TreeNode.h"
+#include "FeatureData.h"
+#include "Criterion.h"
+#include "ClassificationCriterion.h"
+#include <cmath>
+#include "RandomGenerator.h"
+#include "assert.h"
+
+#include <boost/bind.hpp>
+#include <boost/thread/thread.hpp>
+#define TREE_MAX_DEPTH  30
+//using namespace boost;
+
+enum SplitAlgorithm {
+    FIND_BEST = 0,
+    FIND_RANDOM = 1
+};
+
+struct MyPair {
+    uint index;
+    REAL value;
+    bool friend operator < (const MyPair& a,const MyPair& b){
+        return a.value<b.value;
+    }
+};
+
+//split_args for parallel find best split,because boost:bind can handle 8 parameter at most
+struct split_args {
+    uint* sample_ind;
+    uint s_ind_beg;
+    uint s_ind_end;
+    uint f_beg;
+    uint f_end;
+    uint nSamples;
+    uint min_samples_leaf;
+};
+//for parallel find_best_split
+void find_best_split_range(bool *skip,Criterion *criterion,REAL** X,REAL* y,split_args args,uint& feature_split,REAL& value_split,REAL& min_error);
+
+class Tree {
+    /*base decision tree*/
+
+public:
+    Criterion **p_criterion;
+    int criterion_name;
+    uint min_sample_leaf;
+    uint n_features;
+    uint n_classes;//n_classes=1 for regression
+    uint max_features;
+    uint max_depth;
+    uint find_split_algorithm;
+    uint random_seed;
+
+    TreeNode **nodes;
+    sint *nodes_deep;
+    uint numNodes;
+    uint max_nNodes;
+
+    uint n_threads;
+
+public:
+    Tree(int criterion,uint n_classes,uint n_feature,uint max_features,uint min_sample_leaf,uint max_depth,uint find_split_algorithm=FIND_BEST,\
+         uint random_seed=0,uint n_thread=1);
+
+    virtual ~Tree();
+
+    int build(REAL** X,REAL* y,uint nSamples);
+
+    bool find_split(uint node_id,REAL** X,REAL* y,uint* sample_ind,uint s_ind_beg,uint s_ind_end,uint nSamples,uint& feature_split,REAL& value_split,REAL& min_error);
+
+    bool find_best_split(uint node_id,REAL** X,REAL* y,uint* sample_ind,uint s_ind_beg,uint s_ind_end,uint nSamples,uint& feature_split,REAL& value_split,REAL& min_error);
+
+    bool find_random_split(uint node_id,uint& feature_split,REAL& value_split,REAL& min_error);
+
+    virtual void predict(REAL** X,REAL* pred,uint nSamples,uint nFeatures);
+
+    //predict with mask
+    virtual void predict(REAL** X,REAL* pred,bool* mask,uint nSamples,uint nFeatures);
+
+    void predict_terminal_region(REAL** X,uint* region,uint nSamples,uint nFeatures);
+
+    virtual void compute_importance(REAL* importance){};
+
+    //for parallel find best split
+    bool find_best_split_parallel(uint node_id,REAL** X,REAL* y,uint* sample_ind,uint s_ind_beg,uint s_ind_end,uint nSamples,uint& feature_split,REAL& value_split,REAL& min_error);
+
+    void resize();
+
+protected:
+    void init_nodes(uint nSamples);
+
+    void init_criterion();
+
+    uint updateNodeSampleMap(uint node_id,uint l_node_id,uint r_node_id,\
+                             uint* sample_ind,uint* sample_ind_swap,REAL** X,\
+                             uint s_ind_beg,uint s_ind_end,uint feature_split,\
+                             REAL value_split);
+};
+
+Tree::Tree(int criterion,uint n_classes,uint n_features,uint max_features,uint min_sample_leaf,uint max_depth,uint find_split_algorithm,uint random_seed,uint n_threads){
+#ifdef DEBUG
+    assert(criterion==CRITERION_MSE || criterion==CRITERION_GINI || criterion==CRITERION_ENTROPY);
+    assert(find_split_algorithm==FIND_RANDOM || find_split_algorithm==FIND_BEST);
+    assert(max_features<=n_features);
+    assert(n_threads>=1);
+    if (criterion==CRITERION_MSE) {
+        assert(n_classes==1);
+    }
+#endif
+    if (max_features>n_features || max_features<=0) {
+        max_features=n_features;
+    }
+    int max_thread=boost::thread::hardware_concurrency();
+    if (n_threads>max_thread) {
+        n_threads=max_thread;
+    }
+    max_features=MAX(1,max_features);
+
+    //reset number of threads if max_features is too small
+    if (n_threads>1 && max_features/n_threads<THREAD_MIN_FEATURES) {
+//        fprintf(stderr, "WARNNING:Number of thread is %d,but max_feature=%d.Each thread at least need %d features to find best split for internal nodes in decision tree.Now set number of thread = %d\n",n_threads,max_features,THREAD_MIN_FEATURES,MAX(1,(uint)max_features/THREAD_MIN_FEATURES));
+        n_threads=MAX(1,(uint)max_features/THREAD_MIN_FEATURES);
+    }
+    this->n_threads=n_threads;
+    this->criterion_name=criterion;
+    switch (find_split_algorithm) {
+        case FIND_BEST:
+            this->find_split_algorithm=FIND_BEST;
+            break;
+        case FIND_RANDOM:
+            this->find_split_algorithm=FIND_RANDOM;
+            break;
+        default:
+            break;
+    }
+    this->n_features=n_features;
+    this->n_classes=n_classes;
+    this->max_depth=MIN(TREE_MAX_DEPTH, max_depth);
+    this->max_features=max_features;
+    this->min_sample_leaf=min_sample_leaf;
+    this->random_seed=random_seed;
+    this->numNodes=0;
+    this->nodes=NULL;
+    this->p_criterion=NULL;
+    this->nodes_deep=NULL;
+}
+Tree::~Tree(){
+    if (nodes) {
+        for (uint i=0; i<max_nNodes; i++) {
+            delete nodes[i];
+            nodes[i]=NULL;
+        }
+        delete []nodes;
+        nodes=NULL;
+    }
+    if (p_criterion){
+        for (uint i=0; i<n_threads; i++) {
+            delete p_criterion[i];
+            p_criterion[i]=NULL;
+        }
+        delete []p_criterion;
+        p_criterion=NULL;
+    }
+    if (nodes_deep) {
+        delete []nodes_deep;
+        nodes_deep=NULL;
+    }
+}
+void Tree::resize(){
+    if ((REAL)max_nNodes/numNodes<1.3) {
+//        cout<<max_nNodes<<endl;
+//        cout<<(REAL)max_nNodes/numNodes<<endl;
+//        cout<<numNodes<<endl;
+        return;
+    }
+    if (nodes) {
+        for (uint i=numNodes+1; i<max_nNodes; i++) {
+            delete nodes[i];
+            nodes[i]=NULL;
+        }
+        max_nNodes=numNodes+1;
+    }
+}
+void Tree::init_criterion(){
+    this->p_criterion=new Criterion*[n_threads];
+    switch (criterion_name) {
+        case CRITERION_MSE:
+            for (uint i=0; i<n_threads; i++) {
+                p_criterion[i]=new MSE;
+            }
+            break;
+        case CRITERION_GINI:
+            for (uint i=0; i<n_threads; i++) {
+                p_criterion[i]=new Gini(n_classes);
+            }
+            break;
+        case CRITERION_ENTROPY:
+            for (uint i=0; i<n_threads; i++) {
+                p_criterion[i]=new Entropy(n_classes);
+            }
+            break;
+        default:
+            break;
+    }
+}
+void Tree::init_nodes(uint nSamples){
+    numNodes=1;
+    /*===============max_nNodes must choose carefully==========================
+     * too small will get poor result,too big will waste memory and may slow down the speed
+     */
+    max_nNodes=2*(uint)((REAL)floor((REAL)(nSamples/(1>((int)min_sample_leaf-4)?1:(min_sample_leaf-4))))+1);
+    if (max_nNodes>pow((REAL)2.0,(REAL)max_depth)) {
+        max_nNodes=pow((REAL)2.0, (REAL)max_depth)+1;
+    }
+    nodes=new TreeNode*[max_nNodes];
+    nodes_deep=new sint[max_nNodes];
+    for (uint i=0; i<max_nNodes; i++) {
+        nodes[i]=new TreeNode;
+    }
+    nodes_deep[ROOT]=1;
+}
+
+uint Tree::updateNodeSampleMap(uint node_id,uint l_node_id,uint r_node_id,\
+                               uint* sample_ind,uint* sample_ind_swap,REAL** X,\
+                               uint s_ind_beg,uint s_ind_end,uint feature_split,\
+                               REAL value_split){
+    /* Map sample index in parent node to left child and right child
+     * return the number of sample in left child.
+     */
+
+#ifdef DEBUG
+    assert(s_ind_beg>=0 && s_ind_beg<s_ind_end);
+    assert(s_ind_end>0);
+#endif
+    uint i,count=s_ind_beg;
+    if (l_node_id>=max_nNodes || r_node_id>=max_nNodes) {
+        return -1;
+    }
+    nodes[l_node_id]->nSamples=0;
+    nodes[r_node_id]->nSamples=0;
+    for (i=s_ind_beg; i<s_ind_end; i++) {
+        uint idx=sample_ind[i];
+        if (X[idx][feature_split]<=value_split) {
+            nodes[l_node_id]->nSamples++;
+            sample_ind_swap[count++]=idx;
+        }
+    }
+    for (i=s_ind_beg; i<s_ind_end; i++) {
+        uint idx=sample_ind[i];
+        if (X[idx][feature_split]>value_split) {
+            nodes[r_node_id]->nSamples++;
+            sample_ind_swap[count++]=idx;
+        }
+    }
+#ifdef DEBUG
+    assert(count==s_ind_end);
+#endif
+
+    for (i=s_ind_beg; i<s_ind_end; i++) {
+        sample_ind[i]=sample_ind_swap[i];
+    }
+    return nodes[l_node_id]->nSamples;
+}
+
+int Tree::build(REAL** X,REAL* y,uint nSamples){
+    /* build decision tree
+     * X,y:the original data
+     * nSamples: number of sample in X
+     */
+#ifdef DEBUG
+    if (criterion_name==CRITERION_MSE) {
+        assert(n_classes==1);
+    }else{
+        assert(n_classes>=2);
+    }
+#endif
+    uint i,l_node_id,r_node_id,work_node_id,feature_split,sample_ind_beg,sample_ind_end;
+    REAL value_split,ini_error,min_error;
+    uint *node_beg,*node_end,*sample_ind,*sample_ind_swap;
+
+    //initialize criterion and nodes
+    init_criterion();
+    init_nodes(nSamples);
+    nodes[ROOT]->nSamples=nSamples;
+    node_beg=new uint[max_nNodes];
+    node_end=new uint[max_nNodes];
+    sample_ind=new uint[nSamples];
+    sample_ind_swap=new uint[nSamples];
+    node_beg[ROOT]=0;
+    node_end[ROOT]=nSamples;
+    for (i=0; i<nSamples; i++) {
+        sample_ind[i]=i;
+    }
+    //main
+    for (work_node_id=ROOT; work_node_id<max_nNodes; work_node_id++) {
+        if (work_node_id>numNodes) {
+            break;
+        }
+        sample_ind_beg=node_beg[work_node_id];
+        sample_ind_end=node_end[work_node_id];
+        //initialize threads criterion,let every thread has a criterion copy
+        for (i=0; i<n_threads; i++) {
+            p_criterion[i]->init(y, sample_ind,sample_ind_beg,sample_ind_end, nodes[work_node_id]->nSamples);
+        }
+        ini_error=p_criterion[0]->eval();
+        nodes[work_node_id]->ini_error=ini_error;
+        //terminal node if the number of sample in a node is less than min_sample_leaf
+        if (nodes[work_node_id]->nSamples<2*min_sample_leaf || nodes_deep[work_node_id]==max_depth || numNodes>=max_nNodes-2) {
+//        if (nodes[work_node_id]->nSamples<=min_sample_leaf || nodes_deep[work_node_id]==max_depth || numNodes>=max_nNodes-2) {
+            nodes[work_node_id]->leaf=true;
+            //allocate memory for prediction
+            nodes[work_node_id]->pred=new REAL[n_classes];
+            p_criterion[0]->estimate(nodes[work_node_id]->pred);
+            continue;
+        }
+        //find split
+        min_error=ini_error;
+        bool flag=find_split(work_node_id, X, y,sample_ind, sample_ind_beg,sample_ind_end, nodes[work_node_id]->nSamples, feature_split, value_split,min_error);
+        //terminal node
+        if (!flag) {
+            nodes[work_node_id]->leaf=true;
+            p_criterion[0]->reset();
+            //allocate memory for prediction
+            nodes[work_node_id]->pred=new REAL[n_classes];
+            p_criterion[0]->estimate(nodes[work_node_id]->pred);
+            continue;
+        }
+        //found split
+        nodes[work_node_id]->best_error=min_error;
+        nodes[work_node_id]->feature_split=feature_split;
+        nodes[work_node_id]->value_split=value_split;
+
+        //set left,right child
+        l_node_id=++numNodes;
+        r_node_id=++numNodes;
+        nodes[work_node_id]->left_child=l_node_id;
+        nodes[work_node_id]->right_child=r_node_id;
+        nodes_deep[l_node_id]=nodes_deep[r_node_id]=nodes_deep[work_node_id]+1;
+
+        //map samples in parent node to left,right child
+        uint mid=updateNodeSampleMap(work_node_id, l_node_id,r_node_id, sample_ind,sample_ind_swap, X, sample_ind_beg, sample_ind_end, feature_split, value_split);
+        node_beg[l_node_id]=sample_ind_beg;
+        node_end[l_node_id]=sample_ind_beg + mid;
+        node_beg[r_node_id]=sample_ind_beg + mid;
+        node_end[r_node_id]=sample_ind_end;
+    }
+    delete []sample_ind_swap;
+    delete []sample_ind;
+    delete []node_beg;
+    delete []node_end;
+    delete []nodes_deep;//no more need nodes_deep
+    nodes_deep=NULL;
+    //no more need criterion.
+    for (uint i=0; i<n_threads; i++) {
+        delete p_criterion[i];
+    }
+    delete []p_criterion;
+    p_criterion=NULL;
+    resize();
+    return ENSEMBLE_SUCCESS;
+}
+
+bool Tree::find_split(uint node_id,REAL** X,REAL* y,uint* sample_ind,uint s_ind_beg,uint s_ind_end,uint nSamples,uint& feature_split,REAL& value_split,REAL& min_error){
+    if (find_split_algorithm==FIND_BEST) {
+        if (n_threads>1) {
+            return find_best_split_parallel(node_id, X,y, sample_ind, s_ind_beg,s_ind_end,nSamples, feature_split, value_split,min_error);
+        }
+        return find_best_split(node_id, X,y, sample_ind, s_ind_beg,s_ind_end,nSamples, feature_split, value_split,min_error);
+    }
+    return -1;
+}
+
+bool Tree::find_best_split(uint node_id,REAL** X,REAL* y,uint* sample_ind,uint s_ind_beg,uint s_ind_end,uint nSamples,uint& feature_split,REAL& value_split,REAL& min_error){
+    /* SINGLE-THREAD FIND SPLIT
+     * find best split for internal node(node_id)
+     * X,y is the original data
+     * samples in internal node(node_id) is specify by spsample_ind[s_ind_beg:s_ind_end]
+     */
+#ifdef DEBUG
+    assert(s_ind_beg>=0 && s_ind_beg<s_ind_end);
+    assert((s_ind_end-s_ind_beg) ==nSamples);
+#endif
+    REAL ini_error=min_error;
+    uint i,j,node_nSamples,f,count;
+    bool same_data=true;
+    //do not consider split if all data are identical.
+    REAL pre_y=y[sample_ind[s_ind_beg]];
+    for (i=s_ind_beg+1; i<s_ind_end; i++) {
+        uint idx=sample_ind[i];
+        if (y[idx]!=pre_y) {
+            same_data=false;
+            break;
+        }
+    }
+    if (same_data) {
+        return false;
+    }
+
+    Criterion *criterion=p_criterion[0];
+    bool* skip=new bool[n_features];
+    uint* skip_idx=new uint[n_features];
+    node_nSamples=nodes[node_id]->nSamples;
+    MyPair* x_f=new MyPair[nSamples];
+
+    //generate random split feature
+    for (i=0; i<n_features; i++) {
+        skip[i]=true;
+        skip_idx[i]=i;
+    }
+    count=n_features;
+    for (i=0; i<max_features; i++) {
+        j=rand_r(&this->random_seed)%count;
+//        j=randomMT()%count;
+        uint idx=skip_idx[j];
+        skip[idx]=false;
+        std::swap(skip_idx[j], skip_idx[--count]);
+    }
+    //find best split main loop
+    for (f=0; f<n_features; f++) {
+        if (skip[f]) {
+            continue;
+        }
+        criterion->reset();
+        count=0;
+        for (i=s_ind_beg; i<s_ind_end; i++) {
+            uint idx=sample_ind[i];
+            x_f[count].index=idx;
+            x_f[count].value=X[idx][f];
+            count++;
+        }
+#ifdef DEBUG
+        assert(count==nSamples);
+#endif
+        //sort sample with feature f
+        sort(x_f, x_f+nSamples);
+        if (x_f[0].value>=x_f[nSamples-1].value) {
+            continue;
+        }
+        count=0;
+        for (i=s_ind_beg; i<s_ind_end-1; i++) {
+            uint loc=x_f[count].index;
+            uint nLeft=criterion->update_next(y, loc);
+            //do not consider split if feature value are identical.
+            if (x_f[count].value==x_f[count+1].value || nLeft<min_sample_leaf || nSamples-nLeft<min_sample_leaf) {
+//            if (x_f[count].value==x_f[count+1].value){
+                count++;
+                continue;
+            }
+            REAL error=criterion->eval();
+            if (error<min_error) {
+                min_error=error;
+                feature_split=f;
+                value_split=0.5*(x_f[count].value+x_f[count+1].value);
+            }
+            count++;
+        }
+    }
+//    fprintf(stderr, "\t%d,%lf,%lf\n",min_error==ini_error, ini_error, min_error);
+    delete []skip;
+    delete []skip_idx;
+    delete []x_f;
+    return min_error!=ini_error;
+}
+
+bool Tree::find_best_split_parallel(uint node_id, REAL **X, REAL *y, uint *sample_ind, uint s_ind_beg, uint s_ind_end, uint nSamples, uint &feature_split, REAL &value_split, REAL &min_error){
+    /* MULTI-THREAD FIND SPLIT
+     * parallel find best split for internal node(node_id)
+     * X,y is the original data
+     * samples in internal node(node_id) is specify by spsample_ind[s_ind_beg:s_ind_end]
+     */
+    uint *f_split,i,j,count;
+    REAL *v_split,*p_min_error;
+    struct split_args args;
+    f_split=new uint[n_threads];
+    v_split=new REAL[n_threads];
+    p_min_error=new REAL[n_threads];
+    bool* skip=new bool[n_features];
+    uint* skip_idx=new uint[n_features];
+    args.s_ind_beg=s_ind_beg;
+    args.s_ind_end=s_ind_end;
+    args.sample_ind=sample_ind;
+    args.nSamples=nSamples;
+    args.min_samples_leaf=min_sample_leaf;
+    REAL ini_error=min_error;
+
+    //generate random split feature
+    for (i=0; i<n_features; i++) {
+        skip[i]=true;
+        skip_idx[i]=i;
+    }
+    count=n_features;
+    for (i=0; i<max_features; i++) {
+        j=rand_r(&this->random_seed)%count;
+//        j=randomMT()%count;
+        uint idx=skip_idx[j];
+        skip[idx]=false;
+        std::swap(skip_idx[j], skip_idx[--count]);
+    }
+    boost::thread **thread=new boost::thread*[n_threads];
+    for (i=0; i<n_threads; i++) {
+        p_min_error[i]=ini_error;
+        args.f_beg=(n_features/n_threads*i);
+        args.f_end=(n_features/n_threads*(i+1));
+        if (i==n_threads-1) {
+            args.f_end=n_features;
+        }
+        thread[i]=new boost::thread(boost::bind(find_best_split_range,skip, p_criterion[i], X, y, args, boost::ref(f_split[i]), boost::ref(v_split[i]), boost::ref(p_min_error[i])));
+    }
+    for (i=0; i<n_threads; i++) {
+        thread[i]->join();
+        delete thread[i];
+        if (p_min_error[i]<min_error) {
+            min_error=p_min_error[i];
+            feature_split=f_split[i];
+            value_split=v_split[i];
+        }
+    }
+    delete thread;
+    delete []skip;
+    delete []skip_idx;
+    delete []f_split;
+    delete []v_split;
+    delete []p_min_error;
+    return min_error!=ini_error;
+}
+void Tree::predict(REAL **X,REAL* pred,uint nSamples,uint nFeatures){
+    /* Make Prediction
+     * for classification.pred is the probability of each classes.
+     */
+#ifdef DEBUG
+    assert(nFeatures==this->n_features);
+    if (criterion_name==CRITERION_MSE) {
+        assert(n_classes==1);
+    }else {
+        assert(n_classes>=2);
+    }
+#endif
+    uint node_id,f_s;
+    REAL v_s,normalizer=0.0;
+    for (uint i=0; i<nSamples; i++) {
+        node_id=ROOT;
+        while (!nodes[node_id]->leaf) {
+            f_s=nodes[node_id]->feature_split;
+            v_s=nodes[node_id]->value_split;
+            if (X[i][f_s]<=v_s) {
+                node_id=nodes[node_id]->left_child;
+            }else {
+                node_id=nodes[node_id]->right_child;
+            }
+        }
+        normalizer=0.0;
+        for (uint j=0; j<n_classes; j++) {
+            pred[n_classes*i+j]=nodes[node_id]->pred[j];
+            normalizer+=nodes[node_id]->pred[j];
+        }
+        if (criterion_name!=CRITERION_MSE) {
+            if (normalizer!=0) {
+                for (uint j=0; j<n_classes; j++) {
+                    pred[n_classes*i+j]/=normalizer;
+                }
+            }
+        }
+    }
+}
+void Tree::predict(REAL **X,REAL* pred,bool* mask,uint nSamples,uint nFeatures){
+    /* Make Prediction with mask
+     * for classification.pred is the probability of each classes.
+     */
+#ifdef DEBUG
+    assert(nFeatures==this->n_features);
+    if (criterion_name==CRITERION_MSE) {
+        assert(n_classes==1);
+    }
+#endif
+    uint node_id,f_s;
+    REAL v_s,normalizer;
+    for (uint i=0; i<nSamples; i++) {
+        if (mask[i]) {
+            continue;
+        }
+        node_id=ROOT;
+        while (!nodes[node_id]->leaf) {
+            f_s=nodes[node_id]->feature_split;
+            v_s=nodes[node_id]->value_split;
+            if (X[i][f_s]<=v_s) {
+                node_id=nodes[node_id]->left_child;
+            }else {
+                node_id=nodes[node_id]->right_child;
+            }
+        }
+        normalizer=0.0;
+        for (uint j=0; j<n_classes; j++) {
+            pred[n_classes*i+j]=nodes[node_id]->pred[j];
+            normalizer+=nodes[node_id]->pred[j];
+        }
+        if (criterion_name!=CRITERION_MSE) {
+            if (normalizer!=0) {
+                for (uint j=0; j<n_classes; j++) {
+                    pred[n_classes*i+j]/=normalizer;
+                }
+            }
+        }
+
+    }
+}
+
+void Tree::predict_terminal_region(REAL **X, uint* region,uint nSamples, uint nFeatures){
+    /* GET TERMINAL REGION FOR EACH SAMPLE */
+#ifdef DEBUG
+    assert(nFeatures==this->n_features);
+#endif
+    uint node_id,f_s;
+    REAL v_s;
+    for (uint i=0; i<nSamples; i++) {
+        node_id=ROOT;
+        while (!nodes[node_id]->leaf) {
+            f_s=nodes[node_id]->feature_split;
+            v_s=nodes[node_id]->value_split;
+            if (X[i][f_s]<=v_s) {
+                node_id=nodes[node_id]->left_child;
+            }else {
+                node_id=nodes[node_id]->right_child;
+            }
+        }
+        region[i]=node_id;
+    }
+}
+
+void find_best_split_range(bool *skip,Criterion *criterion,REAL** X,REAL* y,split_args args,uint& feature_split,REAL& value_split,REAL& min_error){
+    /* FOR MULTI-THREAD FIND SPLIT
+     * find best split for internal node(node_id),feature range in [f_beg,f_end]
+     * X,y is the original data
+     * samples in internal node(node_id) is specify by spsample_ind[s_ind_beg:s_ind_end]
+     * [f_beg,f_end] /in [0,n_features] is the feature range for each thread.
+     *
+     * notice：
+     * criterion must be initialied before calling find_best_split_range
+     * one criterion for one thread
+     */
+    uint s_ind_beg,s_ind_end,nSamples,f_beg,f_end,*sample_ind,min_samples_leaf;
+    min_samples_leaf=args.min_samples_leaf;
+    s_ind_beg=args.s_ind_beg;
+    s_ind_end=args.s_ind_end;
+    nSamples=args.nSamples;
+    f_beg=args.f_beg;
+    f_end=args.f_end;
+    sample_ind=args.sample_ind;
+    REAL ini_error=min_error;
+#ifdef DEBUG
+    assert(s_ind_beg>=0 && s_ind_beg<s_ind_end);
+    assert((s_ind_end-s_ind_beg) ==nSamples);
+#endif
+    uint i,f,count;
+    bool same_data=true;
+
+    //do not consider split if all data are identical.
+    REAL pre_y=y[sample_ind[s_ind_beg]];
+    for (i=s_ind_beg+1; i<s_ind_end; i++) {
+        uint idx=sample_ind[i];
+        if (y[idx]!=pre_y) {
+            same_data=false;
+            break;
+        }
+    }
+    if (same_data) {
+        return;
+    }
+
+    MyPair* x_f=new MyPair[nSamples];
+    criterion->init(y, sample_ind,s_ind_beg,s_ind_end, nSamples);
+    //find best split main loop
+    for (f=f_beg; f<f_end; f++) {
+        if (skip[f]) {
+            continue;
+        }
+        criterion->reset();
+        count=0;
+        for (i=s_ind_beg; i<s_ind_end; i++) {
+            uint idx=sample_ind[i];
+            x_f[count].index=idx;
+            x_f[count].value=X[idx][f];
+            count++;
+        }
+#ifdef DEBUG
+        assert(count==nSamples);
+#endif
+        //sort sample with feature f
+        sort(x_f, x_f+nSamples);
+        if (x_f[0].value>=x_f[nSamples-1].value) {
+            continue;
+        }
+        count=0;
+        for (i=s_ind_beg; i<s_ind_end-1; i++) {
+            uint loc=x_f[count].index;
+            uint nLeft=criterion->update_next(y, loc);
+            //do not consider split if feature value are identical.
+            if (x_f[count].value==x_f[count+1].value || nLeft<min_samples_leaf || nSamples-nLeft<min_samples_leaf) {
+//            if (x_f[count].value==x_f[count+1].value){
+                count++;
+                continue;
+            }
+            REAL error=criterion->eval();
+            if (error<min_error) {
+                min_error=error;
+                feature_split=f;
+                value_split=0.5*(x_f[count].value+x_f[count+1].value);
+            }
+            count++;
+        }
+    }
+    delete []x_f;
+
+}
+#endif
diff --git a/include/tensemble/Tree.h.backup.h b/include/tensemble/Tree.h.backup.h
new file mode 100644
index 0000000..bcfa0e6
--- /dev/null
+++ b/include/tensemble/Tree.h.backup.h
@@ -0,0 +1,391 @@
+//
+//  Tree.h
+//  libTM
+//
+//  Created by apple on 12-10-29.
+//  Copyright (c) 2012年 Rongkai Xia. All rights reserved.
+//
+
+#ifndef libTM_Tree_h
+#define libTM_Tree_h
+#include <iostream>
+#include <string>
+#include "TreeNode.h"
+#include "FeatureData.h"
+#include "Criterion.h"
+#include <math.h>
+//#include "cokus.cpp"
+#include "RandomGenerator.h"
+#include "assert.h"
+#define LEFT(A) ((A)<<1)
+#define RIGHT(A) (((A)<<1)+1)
+#define MAX_DEPTH   100
+#define ROOT    1
+
+enum SplitAlgorithm {
+    FIND_BEST = 0,
+    FIND_RANDOM = 1
+    };
+
+struct MyPair {
+    uint index;
+    REAL value;
+    bool friend operator < (const MyPair& a,const MyPair& b){
+        return a.value<b.value;
+    }
+};
+
+class Tree {
+    /*decision tree*/
+    
+public:
+    Criterion *criterion;
+    uint min_sample_leaf;
+    uint n_features;
+    uint n_classes;//n_classes=1 for regression
+    uint max_features;
+    uint max_depth;
+    uint find_split_algorithm;
+    uint random_seed;
+    
+    TreeNode **nodes;
+    sint *nodes_deep;
+    uint numNodes;
+    uint max_nNodes;
+
+public:
+    Tree(int criterion,uint n_classes,uint n_feature,uint max_features,uint min_sample_leaf,\
+         uint max_depth,uint find_split_algorithm,\
+         uint random_seed=0);
+    
+    virtual ~Tree();
+    
+    int build(REAL** X,REAL* y,uint nSamples);
+    
+    bool find_split(uint node_id,REAL** X,REAL* y,uint* sample_ind,uint s_ind_beg,uint s_ind_end,uint nSamples,uint& feature_split,REAL& value_split,REAL& min_error);
+    
+    bool find_best_split(uint node_id,REAL** X,REAL* y,uint* sample_ind,uint s_ind_beg,uint s_ind_end,uint nSamples,uint& feature_split,REAL& value_split,REAL& min_error);
+    
+    bool find_random_split(uint node_id,uint& feature_split,REAL& value_split,REAL& min_error);
+    
+    void predict(REAL** X,REAL* pred,uint nSamples,uint nFeatures);
+    
+    virtual void compute_importance(REAL* importance)=0;
+protected:
+    void init_nodes(uint nSamples);
+    
+    uint updateNodeSampleMap(uint node_id,uint l_node_id,uint r_node_id,\
+                             uint* sample_ind,uint* sample_ind_swap,REAL** X,\
+                             uint s_ind_beg,uint s_ind_end,uint feature_split,\
+                             REAL value_split);
+};
+Tree::Tree(int criterion,uint n_classes,uint n_features,uint max_features,uint min_sample_leaf,\
+               uint max_depth,uint find_split_algorithm,\
+           uint random_seed){
+#ifdef DEBUG
+    assert(max_features<=n_features);
+#endif
+    max_features=MAX(1,max_features);
+    if (criterion==CRITERION_MSE) {
+        this->criterion=new MSE();
+    }else{
+        fprintf(stderr, "invalid criterion: %d\n",criterion);
+    }
+    switch (find_split_algorithm) {
+        case FIND_BEST:
+            this->find_split_algorithm=FIND_BEST;
+            break;
+        case FIND_RANDOM:
+            this->find_split_algorithm=FIND_RANDOM;
+            break;
+        default:
+            break;
+    }
+    this->n_features=n_features;
+    this->n_classes=n_classes;
+    this->max_depth=max_depth;
+    this->max_features=max_features;
+    this->min_sample_leaf=min_sample_leaf;
+    this->random_seed=random_seed;
+    this->numNodes=0;
+}
+Tree::~Tree(){
+    for (uint i=0; i<max_nNodes; i++) {
+        delete nodes[i];
+    }
+    delete []nodes;
+}
+
+void Tree::init_nodes(uint nSamples){
+    numNodes=1;//ROOT node
+    max_nNodes=2*(uint)((REAL)floor((REAL)(nSamples/(1>((int)min_sample_leaf-5)?1:(min_sample_leaf-5))))+1);
+    nodes=new TreeNode*[max_nNodes];
+    nodes_deep=new sint[max_nNodes];
+    for (uint i=0; i<max_nNodes; i++) {
+        nodes[i]=new TreeNode;
+    }
+    nodes_deep[ROOT]=1;
+}
+
+uint Tree::updateNodeSampleMap(uint node_id,uint l_node_id,uint r_node_id,\
+                               uint* sample_ind,uint* sample_ind_swap,REAL** X,\
+                               uint s_ind_beg,uint s_ind_end,uint feature_split,\
+                               REAL value_split){
+    /* Map sample index in parent node to left child and right child
+     * return the number of sample in left child.
+     */
+    
+#ifdef DEBUG
+    assert(s_ind_beg>=0 && s_ind_beg<s_ind_end);
+    assert(s_ind_end>0);
+#endif
+    uint i,count=s_ind_beg;
+    if (l_node_id>=max_nNodes || r_node_id>=max_nNodes) {
+        return -1;
+    }
+    nodes[l_node_id]->nSamples=0;
+    nodes[r_node_id]->nSamples=0;
+    for (i=s_ind_beg; i<s_ind_end; i++) {
+        uint idx=sample_ind[i];
+        if (X[idx][feature_split]<=value_split) {
+            nodes[l_node_id]->nSamples++;
+            sample_ind_swap[count++]=idx;
+        }
+    }
+    for (i=s_ind_beg; i<s_ind_end; i++) {
+        uint idx=sample_ind[i];
+        if (X[idx][feature_split]>value_split) {
+            nodes[r_node_id]->nSamples++;
+            sample_ind_swap[count++]=idx;
+        }
+    }
+#ifdef DEBUG
+    assert(count==s_ind_end);
+#endif
+    
+    for (i=s_ind_beg; i<s_ind_end; i++) {
+        sample_ind[i]=sample_ind_swap[i];
+    }
+    return nodes[l_node_id]->nSamples;
+}
+
+int Tree::build(REAL** X,REAL* y,uint nSamples){
+    //build decision tree
+    
+    uint i,l_node_id,r_node_id,work_node_id,feature_split,sample_ind_beg,sample_ind_end;
+    REAL value_split,ini_error,min_error;
+    uint *node_beg,*node_end,*sample_ind,*sample_ind_swap;
+    
+    //initialize nodes
+    init_nodes(nSamples);
+    nodes[ROOT]->nSamples=nSamples;
+    node_beg=new uint[max_nNodes];
+    node_end=new uint[max_nNodes];
+    sample_ind=new uint[nSamples];
+    sample_ind_swap=new uint[nSamples];
+    node_beg[ROOT]=0;
+    node_end[ROOT]=nSamples;
+    for (i=0; i<nSamples; i++) {
+        sample_ind[i]=i;
+    }
+    
+//    cout<<"build tree: max_nNodes="<<max_nNodes<<endl;
+    for (work_node_id=ROOT; work_node_id<max_nNodes; work_node_id++) {
+        if (work_node_id>numNodes) {
+            break;
+        }
+        sample_ind_beg=node_beg[work_node_id];
+        sample_ind_end=node_end[work_node_id];
+//        cout<<"work node="<<work_node_id<<endl;
+        //initialize criterion
+        criterion->init(y, sample_ind,sample_ind_beg,sample_ind_end, nodes[work_node_id]->nSamples);
+        ini_error=criterion->eval();
+        nodes[work_node_id]->ini_error=ini_error;
+        //terminal node if the number of sample in a node is less than min_sample_leaf
+        if (nodes[work_node_id]->nSamples<=min_sample_leaf || nodes_deep[work_node_id]==max_depth \
+            || numNodes>=max_nNodes-2) {
+            nodes[work_node_id]->leaf=true;
+            //allocate memory for prediction
+            nodes[work_node_id]->pred=new REAL[n_classes];
+            criterion->estimate(nodes[work_node_id]->pred);
+            cout<<"leaf node:"<<work_node_id<<" pred="<<nodes[work_node_id]->pred[0]<< endl;
+            continue;
+        }
+        //find split
+        bool flag=find_split(work_node_id, X, y,sample_ind, sample_ind_beg,sample_ind_end, nodes[work_node_id]->nSamples, feature_split, value_split,min_error);
+//            cout<<"split flag min_error "<<min_error<<endl;
+        //terminal node
+        if (!flag) {
+            nodes[work_node_id]->leaf=true;
+//                cout<<"leaf node:"<<work_node_id<<endl;
+//                criterion->init(y, mask, nodes[work_node_id]->nSamples, total_nSamples);
+            criterion->reset();
+            //allocate memory for prediction
+            nodes[work_node_id]->pred=new REAL[n_classes];
+            criterion->estimate(nodes[work_node_id]->pred);
+            cout<<"leaf node:"<<work_node_id<<" pred="<<nodes[work_node_id]->pred[0]<< endl;
+            continue;
+        }
+        //found split
+        nodes[work_node_id]->best_error=min_error;
+        nodes[work_node_id]->feature_split=feature_split;
+        nodes[work_node_id]->value_split=value_split;
+        
+        //set left,right child
+        l_node_id=++numNodes;
+        r_node_id=++numNodes;
+        nodes[work_node_id]->left_child=l_node_id;
+        nodes[work_node_id]->right_child=r_node_id;
+        nodes_deep[l_node_id]=nodes_deep[r_node_id]=nodes_deep[work_node_id]+1;
+        
+        fprintf(stderr, "deep=%d,(%d,%f,%f,pred=%f)\n",nodes_deep[work_node_id],feature_split+1,value_split,min_error,0.0);
+        
+        //map samples in parent node to left,right child
+        uint mid=updateNodeSampleMap(work_node_id, l_node_id,r_node_id, sample_ind,sample_ind_swap, X, sample_ind_beg, sample_ind_end, feature_split, value_split);
+        node_beg[l_node_id]=sample_ind_beg;
+        node_end[l_node_id]=sample_ind_beg + mid;
+        node_beg[r_node_id]=sample_ind_beg + mid;
+        node_end[r_node_id]=sample_ind_end;
+        //        cout<<"left_nSamples["<<l_node_id<<"]="<<nodes[l_node_id]->nSamples<<" right_nSamples["<<r_node_id<<"]="<<nodes[r_node_id]->nSamples<<endl;
+//        cout<<"left_beg="<<node_beg[l_node_id]<<"left_end="<<node_end[l_node_id]<<endl;
+//        cout<<"right_beg="<<node_beg[r_node_id]<<"right_end="<<node_end[r_node_id]<<endl;
+#ifdef PRINT_DEBUG
+        fprintf(stderr, "deep=%d,node_id=%d,feature_split=%d,value_split=%f,min error=%f\n",\
+                deep,work_node_id,feature_split,value_split,min_error);
+#endif
+    }
+//    delete []mask;
+    //release NodeSampleMap;
+    delete []sample_ind_swap;
+    delete []sample_ind;
+    delete []node_beg;
+    delete []node_end;
+    delete []nodes_deep;//no more need nodes_deep
+    delete criterion; //no more need Criterion
+    return SUCCESS;
+}
+
+bool Tree::find_split(uint node_id,REAL** X,REAL* y,uint* sample_ind,uint s_ind_beg,uint s_ind_end,uint nSamples,uint& feature_split,REAL& value_split,REAL& min_error){
+    if (find_split_algorithm==FIND_BEST) {
+        return find_best_split(node_id, X,y, sample_ind, s_ind_beg,s_ind_end,nSamples, feature_split, value_split,min_error);
+    }
+    return -1;
+}
+
+bool Tree::find_best_split(uint node_id,REAL** X,REAL* y,uint* sample_ind,uint s_ind_beg,uint s_ind_end,uint nSamples,uint& feature_split,REAL& value_split,REAL& min_error){
+    
+#ifdef DEBUG
+    assert(s_ind_beg>=0 && s_ind_beg<s_ind_end);
+    assert((s_ind_end-s_ind_beg) <=nSamples);
+#endif
+    
+    uint i,j,node_nSamples,f,count;
+    bool* skip=new bool[n_features];
+    uint* skip_idx=new uint[n_features];
+    min_error=HUGE_VAL;
+    node_nSamples=nodes[node_id]->nSamples;
+    MyPair* x_f=new MyPair[nSamples];
+    
+    //generate random split feature
+    for (i=0; i<n_features; i++) {
+        skip[i]=true;
+        skip_idx[i]=i;
+    }
+    count=n_features;
+//    cout<<"======================"<<endl;
+    for (i=0; i<max_features; i++) {
+//        j=rand()%count;
+        j=randomMT()%count;
+        uint idx=skip_idx[j];
+        skip[idx]=false;
+        std::swap(skip_idx[j], skip_idx[--count]);
+//        cout<<"skip="<<idx<<endl;
+    }
+//    criterion->init(y, mask, node_nSamples, total_nSamples);
+    //main loop
+//    cout<<"find_best_split"<<endl;
+    for (f=0; f<n_features; f++) {
+//        cout<<"feature="<<f<<endl;
+        if (skip[f]) {
+            continue;
+        }
+        criterion->reset();
+//        criterion->init(y, mask, node_nSamples, total_nSamples);
+        count=0;
+        for (i=s_ind_beg; i<s_ind_end; i++) {
+            uint idx=sample_ind[i];
+            x_f[count].index=idx;
+            x_f[count].value=X[idx][f];
+            count++;
+        }
+#ifdef DEBUG
+        assert(count==nSamples);
+#endif
+        //sort sample with feature f
+        sort(x_f, x_f+nSamples);
+        if (x_f[0].value>=x_f[nSamples-1].value) {
+            continue;
+        }
+        count=0;
+        for (i=s_ind_beg; i<s_ind_end-1; i++) {
+            uint loc=x_f[count].index;
+            uint nLeft=criterion->update_next(y, loc);
+//            cout<<"nLeft="<<nLeft<<endl;
+            //do not consider split if feature value are identical.
+            if (x_f[count].value==x_f[count+1].value) {
+                count++;
+                continue;
+            }
+            REAL error=criterion->eval();
+            cout<<"f="<<f<<' '<<error<<endl;
+            if (error<min_error) {
+                fprintf(stderr, "f=%d,pre_error=%.8f,now_error=%.8f\n",f,min_error,error);
+                min_error=error;
+                feature_split=f;
+                value_split=0.5*(x_f[count].value+x_f[count+1].value);
+            }
+//            cout<<"error="<<error<<endl;
+            count++;
+        }
+//        fprintf(stderr, "node_id=%d,nSamples=%d,f=%d,min_error=%.8lf\n",node_id,node_nSamples,f,min_error);
+    }
+    delete []skip;
+    delete []skip_idx;
+    delete []x_f;
+    return min_error!=HUGE_VAL;
+}
+void Tree::predict(REAL **X,REAL* pred,uint nSamples,uint nFeatures){
+#ifdef DEBUG
+    assert(nFeatures==this->n_features);
+#endif
+    uint node_id,f_s;
+    REAL v_s;
+    for (uint i=0; i<nSamples; i++) {
+        node_id=ROOT;
+//        fprintf(stderr, "sample %d:",i);
+//        for (uint j=0; j<n_features; j++) {
+//            cout<<"X["<<j<<"]="<<X[j][i]<<endl;
+//        }
+        while (!nodes[node_id]->leaf) {
+//            cout<<node_id<<' '<<nodes[node_id]->leaf<<endl;
+//            fprintf(stderr, "(%d,%d,%f)\n",node_id,nodes[node_id]->feature_split,\
+                    nodes[node_id]->value_split);
+            f_s=nodes[node_id]->feature_split;
+            v_s=nodes[node_id]->value_split;
+            if (X[i][f_s]<=v_s) {
+//                cout<<X[f_s][i]<<":"<<v_s<<" left"<<endl;
+//                node_id=LEFT(node_id);
+                node_id=nodes[node_id]->left_child;
+            }else {
+//                cout<<X[f_s][i]<<":"<<v_s<<" right"<<endl;
+//                node_id=RIGHT(node_id);
+                node_id=nodes[node_id]->right_child;
+            }
+        }
+//        cout<<node_id<<" pred="<<nodes[node_id]->pred[0]<<endl;
+        for (uint j=0; j<n_classes; j++) {
+            pred[i]=nodes[node_id]->pred[j];
+        }
+    }
+}
+#endif
diff --git a/include/tensemble/TreeClassifier.h b/include/tensemble/TreeClassifier.h
new file mode 100644
index 0000000..e0f4c3f
--- /dev/null
+++ b/include/tensemble/TreeClassifier.h
@@ -0,0 +1,74 @@
+/* * * * *
+ *  TreeClassifier.h
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+
+#ifndef libTM_TreeClassifier_h
+#define libTM_TreeClassifier_h
+
+#include "Tree.h"
+
+class TreeClassifier:public Tree {
+    /* DecisionTreeClassifier
+     * label must be [0,1,...,n_classes-1] before calling TreeClassifier
+     */
+    
+public:
+    TreeClassifier(int split_criterion,\
+                   uint n_classes,\
+                   uint n_features,\
+                   uint max_features,\
+                   uint min_sample_leaf,\
+                   uint max_depth,\
+                   uint find_split_algorithm=FIND_BEST,\
+                   uint random_seed=0,\
+                   uint n_threads=1);
+    
+    void compute_importance(REAL* importance);
+};
+
+TreeClassifier::TreeClassifier(int split_criterion,\
+                               uint n_classes,\
+                               uint n_features,\
+                               uint max_features,\
+                               uint min_sample_leaf,\
+                               uint max_depth,\
+                               uint find_split_algorithm,\
+                               uint random_seed,\
+                               uint n_threads)\
+:Tree(split_criterion,n_classes,n_features,max_features,min_sample_leaf,max_depth,find_split_algorithm,random_seed,n_threads){
+}
+void TreeClassifier::compute_importance(REAL* importance){
+    //node from 1....numNodes
+    uint f;
+    REAL normalize=0;
+    for (uint i=0; i<n_features; i++) {
+        importance[i]=0.0;
+    }
+    for (uint i=ROOT; i<=numNodes; i++) {
+        // don't calculate in redundant nodes or leaf node
+        if (nodes[i]->leaf || nodes[i]->nSamples<=0) {
+            continue;
+        }
+        f=nodes[i]->feature_split;
+        importance[f]+=\
+        nodes[i]->nSamples*(nodes[i]->ini_error-nodes[i]->best_error);
+    }
+    for (uint i=0; i<n_features; i++) {
+        normalize+=importance[i];
+    }
+    if (normalize!=0) {
+        for (uint i=0; i<n_features; i++) {
+            importance[i]/=normalize;
+        }
+    }
+}
+#endif
diff --git a/include/tensemble/TreeNode.h b/include/tensemble/TreeNode.h
new file mode 100644
index 0000000..a6b5599
--- /dev/null
+++ b/include/tensemble/TreeNode.h
@@ -0,0 +1,41 @@
+/* * * * *
+ *  TreeNode.h
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+
+#ifndef libTM_TreeNode_h
+#define libTM_TreeNode_h
+#include    "TypeDef.h"
+
+class TreeNode {
+public:
+    uint feature_split;
+    REAL value_split;
+    bool leaf;
+    uint nSamples;
+    uint left_child;
+    uint right_child;
+    REAL* pred;//only leaf node need allocate memory
+    
+    //for compute feature importance
+    REAL ini_error;
+    REAL best_error;
+    TreeNode():feature_split(0),value_split(0.0),leaf(false),nSamples(0),left_child(0),right_child(0),pred(NULL),ini_error(0.0),best_error(0.0){
+    }
+    ~TreeNode(){
+        if (pred) {
+            delete pred;
+            pred=NULL;
+        }
+    }
+};
+
+#endif
diff --git a/include/tensemble/TreeRegressor.h b/include/tensemble/TreeRegressor.h
new file mode 100644
index 0000000..3ff811a
--- /dev/null
+++ b/include/tensemble/TreeRegressor.h
@@ -0,0 +1,66 @@
+/* * * * *
+ *  TreeRegressor.h 
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_TreeRegressor_h
+#define libTM_TreeRegressor_h
+#include "Tree.h"
+
+class TreeRegressor:public Tree {
+    //DecisionTreeRegressor
+    
+public:
+    //constructor,regression set n_classes=1; 
+    TreeRegressor(uint n_features,\
+                  uint max_features,\
+                  uint min_sample_leaf,\
+                  uint max_depth,\
+                  uint find_split_algorithm=FIND_BEST,\
+                  uint random_seed=0,\
+                  uint n_threads=1);
+    void compute_importance(REAL* importance);
+};
+
+TreeRegressor::TreeRegressor(uint n_features,\
+                             uint max_features,\
+                             uint min_sample_leaf,\
+                             uint max_depth,\
+                             uint find_split_algorithm,\
+                             uint random_seed,\
+                             uint n_threads)\
+:Tree(CRITERION_MSE,1,n_features,max_features,min_sample_leaf,max_depth,find_split_algorithm,random_seed,n_threads){
+}
+void TreeRegressor::compute_importance(REAL* importance){
+    //node from 1....numNodes
+    uint f;
+    REAL normalize=0;
+    for (uint i=0; i<n_features; i++) {
+        importance[i]=0.0;
+    }
+    for (uint i=ROOT; i<=numNodes; i++) {
+        // don't calculate in redundant nodes or leaf node
+        if (nodes[i]->leaf || nodes[i]->nSamples<=0) {
+            continue;
+        }
+        f=nodes[i]->feature_split;
+        importance[f]+=\
+        (nodes[i]->ini_error-nodes[i]->best_error)*(nodes[i]->ini_error-nodes[i]->best_error);
+    }
+    for (uint i=0; i<n_features; i++) {
+        normalize+=importance[i];
+    }
+    if (normalize!=0) {
+        for (uint i=0; i<n_features; i++) {
+            importance[i]/=normalize;
+        }
+    }
+}
+#endif
diff --git a/include/tensemble/TypeDef.h b/include/tensemble/TypeDef.h
new file mode 100644
index 0000000..70dca8d
--- /dev/null
+++ b/include/tensemble/TypeDef.h
@@ -0,0 +1,39 @@
+/* * * * *
+ *  TypeDef.h 
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_TypeDef_h
+#define libTM_TypeDef_h
+#include <vector>
+#include "cstdio"
+#define ENSEMBLE_SUCCESS 0
+#define ENSEMBLE_FAIL    -1
+
+#define EPS 1e-18
+#define LEFT(A) ((A)<<1)
+#define RIGHT(A) (((A)<<1)+1)
+
+#define MAX_DEPTH   100
+#define ROOT    1
+#define THREAD_MIN_FEATURES 5
+#define MTRY_DEFAULT    0
+
+typedef double REAL;
+typedef unsigned int uint;
+typedef short int sint;
+typedef std::vector<uint> uint_vec;
+
+
+#define MAX(A,B)    (((A)>(B))?(A):(B))
+#define MIN(A,B)    (((A)>(B))?(B):(A))
+
+
+#endif
diff --git a/include/tensemble/cmdline.h b/include/tensemble/cmdline.h
new file mode 100644
index 0000000..125fddb
--- /dev/null
+++ b/include/tensemble/cmdline.h
@@ -0,0 +1,160 @@
+/* * * * *
+ *  cmdline.h
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Copyright (C) 2012, Rongkai Xia, shinekaixia at gmail.com
+ *
+ * * * * */
+
+#ifndef libTM_cmdline_h
+#define libTM_cmdline_h
+#include <string>
+#include <vector>
+#include <map>
+
+static int param_offset=25;
+static int line_max_char=80;
+class cmdline {
+    
+public:
+    
+    bool parse_cmdline(int argc,const char * argv[]);
+    
+    void print_help();
+    
+    std::string get_value(const std::string& param);
+    
+    bool check_param(const std::string *param,int len);
+    
+    bool has_param(const std::string& param);
+    
+    void register_help(const std::string& param,const std::string& help);
+    
+    void register_help(const std::string *param,const std::string *help,int len);
+    
+    void clear_help();
+protected:
+    bool parse_name(std::string& name);
+    std::map<std::string,std::string> value;
+    
+    std::vector<std::pair<std::string, std::string> > help;
+    
+};
+
+bool cmdline::has_param(const std::string &param){
+    if (value.find(param)!=value.end()) {
+        return true;
+    }
+    return false;
+}
+std::string cmdline::get_value(const std::string &param){
+    if (value.find(param)==value.end()) {
+        return "";
+    }
+    return value[param];
+}
+
+bool cmdline::parse_name(std::string& name){
+    if (name.length()>=1 && name[0]!='-') {
+        return false;
+    }
+    if (name.length()>=2 && name[1]=='-') {
+        name=name.substr(2);
+    }else {
+        name=name.substr(1);
+    }
+    return true;
+}
+
+bool cmdline::parse_cmdline(int argc,const char * argv[]){
+    int i = 1;
+    while (i < argc) {
+        std::string s(argv[i]);
+        if (parse_name(s)) {
+            if (value.find(s) != value.end()) {
+                fprintf(stderr, "Error: The parameter -%s is already specified.\n",s.c_str());
+                return false;
+            }
+            if ((i+1) < argc) {
+                std::string s_next(argv[i+1]);
+                if (! parse_name(s_next)) {
+                    value[s] = s_next;
+                    i++;
+                } else {
+                    value[s] = "";
+                }
+            } else {
+                value[s] = "";
+            }
+        } else {
+            fprintf(stderr, "Error: Unknow parameter %s.\n",s.c_str());
+            print_help();
+            return false;
+        }
+        i++;
+    }
+    return true;
+}
+
+void cmdline::print_help() {
+    std::vector< std::pair<std::string, std::string> >::const_iterator pv;
+    
+    for ( pv = help.begin(); pv != help.end(); ++pv) {
+        std::cout << "-" << pv->first;
+        for (int i=pv->first.size()+1; i < param_offset; i++) { std::cout << " "; } 
+        std::string s_out = pv->second;
+        while (s_out.size() > 0) {
+            if (s_out.size() > (line_max_char-param_offset)) {
+                size_t p = s_out.substr(0, line_max_char-param_offset).find_last_of(" \t");
+                if (p == 0) {
+                    p = line_max_char-param_offset;
+                }
+                std::cout << s_out.substr(0, p) << std::endl;
+                s_out = s_out.substr(p+1, s_out.length()-p);            
+            } else {
+                std::cout << s_out << std::endl;
+                s_out = "";  
+            }
+            if (s_out.size() > 0) {
+                for (int i=0; i < param_offset; i++) { std::cout << " "; }
+            }
+        }
+    }
+}
+
+bool cmdline::check_param(const std::string *param, int len){
+    int i;
+    for (std::map<std::string,std::string>::const_iterator iter=value.begin(); iter!=value.end(); iter++) {
+        for (i=0; i<len; i++) {
+            if (iter->first==param[i]) {
+                break;
+            }
+        }
+        if (i==len) {
+            fprintf(stderr, "Error: Unknow parameter %s.\n",iter->first.c_str());
+//            print_help();
+            return false;
+        }
+    }
+    return true;
+}
+
+void cmdline::register_help(const std::string &param, const std::string &help){
+    this->help.push_back(make_pair(param, help));
+}
+
+void cmdline::register_help(const std::string *param,const std::string *help,int len){
+    clear_help();
+    for (int i=0; i<len; i++) {
+        register_help(param[i], help[i]);
+    }
+}
+
+void cmdline::clear_help(){
+    this->help.clear();
+}
+#endif
diff --git a/sample_data.tgz b/sample_data.tgz
new file mode 100644
index 0000000..b61277f
Binary files /dev/null and b/sample_data.tgz differ
diff --git a/scripts/AddHeaders.sh b/scripts/AddHeaders.sh
new file mode 100644
index 0000000..0926709
--- /dev/null
+++ b/scripts/AddHeaders.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+usage()
+{
+cat << EOF
+usage: $0 options
+
+This script adds a header to all files matching the provided pattern in the given directory
+
+OPTIONS:
+   -h      Show this message
+   -l      license file 
+   -d      directory to search
+   -p      file pattern to match
+EOF
+}
+
+license=
+pattern=
+directory=
+while getopts "hl:p:d:" OPTION
+do
+     case $OPTION in
+         h)
+             usage
+             exit 1
+             ;;
+         l)
+             license=$OPTARG
+             ;;
+         p)
+             pattern=$OPTARG
+             ;;
+         d)
+             directory=$OPTARG
+             ;;
+         ?)
+             usage
+             exit
+             ;;
+     esac
+done
+
+echo "Prepending ${license} to files with pattern ${pattern} in directory ${directory}"
+
+for i in ${directory}/${pattern}
+do
+  if ! grep -q Copyright $i
+  then
+    cat ${license} $i >$i.new && mv $i.new $i
+  fi
+done
diff --git a/scripts/ComputeMutationRate.py b/scripts/ComputeMutationRate.py
new file mode 100644
index 0000000..dac765c
--- /dev/null
+++ b/scripts/ComputeMutationRate.py
@@ -0,0 +1,70 @@
+"""Compute Mutation Rate
+
+Usage:
+  ComputeMutationRate.py <input1> <input2>
+
+
+Options:
+  -h --help               Show this message.
+"""
+from docopt import docopt
+from Bio import SeqIO
+import itertools
+
+
+class bcolors:
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+
+    def disable(self):
+        self.HEADER = ''
+        self.OKBLUE = ''
+        self.OKGREEN = ''
+        self.WARNING = ''
+        self.FAIL = ''
+        self.ENDC = ''
+
+def main(args):
+    in1 = args['<input1>']
+    in2 = args['<input2>']
+
+    seq1 = SeqIO.parse(in1, 'fasta')
+    seq2 = SeqIO.parse(in2, 'fasta')
+    nMut = 0
+    totLen = 0
+    i = 0
+
+    for s1, s2 in itertools.izip(seq1, seq2):
+        o1 = ""
+        o2 = ""
+        newMut = True
+        assert(s1.name == s2.name)
+        totLen += len(s1.seq)
+        for a,b in itertools.izip(s1.seq, s2.seq):
+            if a != b:
+                nMut+= 1
+                newMut = True
+                o1 += bcolors.OKGREEN + a + bcolors.ENDC
+                o2 += bcolors.FAIL + b + bcolors.ENDC
+            else:
+                if newMut:
+                    o1 += "***"
+                    o2 += "***"
+                    newMut = False
+
+        i += 1
+        print(o1)
+        print(o2)
+        if i % 1000 == 0:
+            print("There were {} mutations in {} bases; total rate = {:0.2f}\r\r".format(\
+                   nMut, totLen, (100.0 * nMut) / totLen))
+    print("There were {} mutations in {} bases; total rate = {:0.2f}\r\r".format(\
+           nMut, totLen, (100.0 * nMut) / totLen))
+
+if __name__ == "__main__":
+    args = docopt(__doc__, version="Compute Mutation Rate v1.0")
+    main(args)
diff --git a/scripts/MutateReference.py b/scripts/MutateReference.py
new file mode 100644
index 0000000..477fd5b
--- /dev/null
+++ b/scripts/MutateReference.py
@@ -0,0 +1,83 @@
+"""Mutate Reference.
+
+Usage:
+  MutateReference.py --in=<input> --out=<output> --rate=<rate>
+
+Options:
+  -h --help               Show this message.
+  --in=<input>            The input reference sequence.
+  --out=<output>          The output [mutated] sequences.
+  --rate=<rate>           The rate of mutation.
+"""
+from docopt import docopt
+from Bio import SeqIO
+import itertools
+import random
+import copy
+import math
+import sys
+
+mutList = {'A' : ['T', 'C', 'G'],
+           'T' : ['A', 'C', 'G'],
+           'C' : ['A', 'G', 'T'],
+           'G' : ['A', 'C', 'T'],
+           'a' : ['c', 'g', 't'],
+           't' : ['a', 'c', 'g'],
+           'c' : ['a', 'g', 't'],
+           'g' : ['a', 'c', 't'],
+           'N' : ['A', 'T', 'C', 'G'],
+           'n' : ['a', 't', 'c', 'g']}
+
+def mutate(base):
+    ml = mutList[base]
+    return ml[random.randint(0, len(ml)-1)]
+
+def randCeilFloor(x):
+    if x < 1.0 or random.random() < 0.5:
+        return int(math.ceil(x))
+    else:
+        return int(math.floor(x))
+
+def main(args):
+    mutRate = float(args['--rate'])
+    refIn = args['--in']
+    refOut = args['--out']
+
+    ofile = open(refOut,'a')
+    seqs = SeqIO.parse(refIn, 'fasta')
+    currBase = 0
+    nMut = 0
+    totLen = 0
+    nextMut = currBase + randCeilFloor(random.expovariate(mutRate))
+    mutList = []
+    for i, s in enumerate(seqs):
+        lenS = len(s.seq)
+        firstBase = currBase
+        lastBase = currBase + lenS
+        totLen += lenS
+        o = copy.copy(s.seq)
+        x = s.seq.tomutable()
+        while nextMut < lastBase:
+            offset = nextMut - firstBase
+            orig = o[offset]
+            x[offset] = mutate(orig)
+            assert(x[offset] != orig)
+            nMut += 1
+            currBase = nextMut
+            nextMut = currBase + randCeilFloor(random.expovariate(mutRate))
+        currBase = lastBase
+        #assert(len(o) == len(x))
+        #for a,b in itertools.izip(o,x):
+        #    if a != b: nMut += 1
+        s.seq = x
+        SeqIO.write(s, ofile, 'fasta')
+        if (i % 1000 == 0):
+            sys.stderr.write("processed {} records; performed {} mutations; rate = {:.2f}%\r\r".format(i, nMut, (100.0 * nMut) / totLen))
+        # print("\n")/tmp/hg18_transcripts_mut0.001.fa/tmp/hg18_transcripts_mut0.001.fa/tmp/hg18_transcripts_mut0.001.fa
+        # print('Num Rec = {}, Num Mut = {}, Tot. length = {}, rate = {}'.format(i, nMut, totLen, (100.0 * nMut) / totLen))
+        # print(mutList)
+    print("\ndone.")
+
+if __name__ == "__main__":
+    arguments = docopt(__doc__, version="Mutate Reference 1.0")
+    main(arguments)
diff --git a/scripts/cpld.bash b/scripts/cpld.bash
new file mode 100755
index 0000000..2b890a0
--- /dev/null
+++ b/scripts/cpld.bash
@@ -0,0 +1,35 @@
+#!/bin/bash 
+# Author : Hemanth.HM
+# Email : hemanth[dot]hm[at]gmail[dot]com
+# License : GNU GPLv3
+#
+
+function useage()
+{
+	cat << EOU
+Useage: bash $0 <path to the binary> <path to copy the dependencies>
+EOU
+exit 1
+}
+
+#Validate the inputs
+[[ $# < 2 ]] && useage
+
+#Check if the paths are vaild
+[[ ! -e $1 ]] && echo "Not a vaild input $1" && exit 1 
+[[ -d $2 ]] || echo "No such directory $2 creating..."&& mkdir -p "$2"
+
+#Get the library dependencies
+echo "Collecting the shared library dependencies for $1..."
+deps=$(ldd $1 | awk 'BEGIN{ORS=" "}$1~/^\//{print $1}$3~/^\//{print $3}' | sed 's/,$/\n/')
+echo "Copying the dependencies to $2"
+
+#Copy the deps
+for dep in $deps
+do
+	echo "Copying $dep to $2"
+	cp "$dep" "$2"
+done
+
+echo "Done!"
+
diff --git a/scripts/make-release.sh b/scripts/make-release.sh
new file mode 100755
index 0000000..7014feb
--- /dev/null
+++ b/scripts/make-release.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+SOURCE="${BASH_SOURCE[0]}"
+while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
+  DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+  SOURCE="$(readlink "$SOURCE")"
+  [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
+done
+DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+
+host=
+version=
+
+while getopts "v:n:" opt; do
+  case $opt in
+    n)
+      echo "Host is $OPTARG" >&2
+      host=$OPTARG
+      ;;
+    v)
+      echo "Version is $OPTARG" >&2
+      version=$OPTARG
+      ;;
+    \?)
+      echo "Invalid option: -$OPTARG" >&2
+      exit 1
+      ;;
+  esac
+done
+
+echo -e "Preparing binary release\n=====================\n"
+echo -e "Version = ${version}"
+echo -e "Host = ${host}"
+
+# create the binary directory 
+betaname=SalmonBeta-${version}_${host}
+mkdir ${DIR}/../RELEASES
+mkdir ${DIR}/../RELEASES/${betaname}
+mkdir ${DIR}/../RELEASES/${betaname}/bin
+mkdir ${DIR}/../RELEASES/${betaname}/lib
+
+# copy over the executable and Intel TBB libraries
+echo -e "Copying over the binary and Intel TBB libraries\n"
+cp ${DIR}/../bin/salmon ${DIR}/../RELEASES/${betaname}/bin/
+cp ${DIR}/../lib/libtbb* ${DIR}/../RELEASES/${betaname}/lib/
+
+# copy other dependencies (shared libraries)
+echo -e "Copying over other shared library dependencies\n"
+bash ${DIR}/../scripts/cpld.bash ${DIR}/../bin/salmon ${DIR}/../RELEASES/${betaname}/lib/
+echo -e "Removing dangerous dependencies\n"
+rm ${DIR}/../RELEASES/${betaname}/lib/libc.so.6
+rm ${DIR}/../RELEASES/${betaname}/lib/ld-linux-x86-64.so.2
+rm ${DIR}/../RELEASES/${betaname}/lib/libdl.so.2
+rm ${DIR}/../RELEASES/${betaname}/lib/libpthread*.so.*
+
+# now make the tarball
+echo -e "Making the tarball\n"
+cd ${DIR}/../RELEASES
+tar czvf ${betaname}.tar.gz ${betaname}
+
+echo -e "Done making release!"
+
+#echo -e "Pushing the tarball to GitHub\n"
+## Since it's currently unclear to me how to overwrite an asset via the GitHub
+## API, the following code deletes the old asset, and uploads the new one in its place
+#
+## Get the previous asset id of the tarball
+#echo -e "Getting previous asset ID\n"
+#ASSETID=`curl -s -X GET https://api.github.com/repos/COMBINE-lab/salmon/releases/1263754/assets | grep "\"id" | head -1 | awk '{gsub(/,$/,""); print $2}'`
+#
+## Delete the previous tarball
+#echo -e "Deleting previous asset\n"
+#curl -X DELETE -H "Authorization: token ${SALMON_PUSH_KEY}" https://api.github.com/repos/COMBINE-lab/salmon/releases/assets/$ASSETID
+#
+## Upload the new tarball
+#echo -e "Uploading new asset\n"
+#curl -X POST --data-binary "@SalmonBeta-latest_ubuntu-12.04.tar.gz" https://uploads.github.com/repos/COMBINE-lab/salmon/releases/1263754/assets?name=SalmonBeta-latest_ubuntu-12.04.tar.gz --header "Content-Type:application/gzip" -H "Authorization: token ${SALMON_PUSH_KEY}"
+#echo -e "Done!\n"
diff --git a/scripts/push-binary.sh b/scripts/push-binary.sh
new file mode 100755
index 0000000..05e1a9e
--- /dev/null
+++ b/scripts/push-binary.sh
@@ -0,0 +1,42 @@
+echo -e "Preparing binary release\n=====================\n"
+
+# create the binary directory 
+mkdir $HOME/SalmonBeta-latest_ubuntu-12.04
+mkdir $HOME/SalmonBeta-latest_ubuntu-12.04/bin
+mkdir $HOME/SalmonBeta-latest_ubuntu-12.04/lib
+
+# copy over the executable and Intel TBB libraries
+echo -e "Copying over the binary and Intel TBB libraries\n"
+cp $TRAVIS_BUILD_DIR/bin/salmon $HOME/SalmonBeta-latest_ubuntu-12.04/bin/
+cp $TRAVIS_BUILD_DIR/lib/libtbb* $HOME/SalmonBeta-latest_ubuntu-12.04/lib/
+
+# copy other dependencies (shared libraries)
+echo -e "Copying over other shared library dependencies\n"
+bash $TRAVIS_BUILD_DIR/scripts/cpld.bash $TRAVIS_BUILD_DIR/bin/salmon $HOME/SalmonBeta-latest_ubuntu-12.04/lib/
+echo -e "Removing dangerous dependencies\n"
+rm $HOME/SalmonBeta-latest_ubuntu-12.04/lib/libc.so.6
+rm $HOME/SalmonBeta-latest_ubuntu-12.04/lib/ld-linux-x86-64.so.2
+rm $HOME/SalmonBeta-latest_ubuntu-12.04/lib/libdl.so.2
+rm $HOME/SalmonBeta-latest_ubuntu-12.04/lib/libpthread*.so.*
+
+# now make the tarball
+echo -e "Making the tarball\n"
+cd $HOME
+tar czvf SalmonBeta-latest_ubuntu-12.04.tar.gz SalmonBeta-latest_ubuntu-12.04
+
+echo -e "Pushing the tarball to GitHub\n"
+# Since it's currently unclear to me how to overwrite an asset via the GitHub
+# API, the following code deletes the old asset, and uploads the new one in its place
+
+# Get the previous asset id of the tarball
+echo -e "Getting previous asset ID\n"
+ASSETID=`curl -s -X GET https://api.github.com/repos/COMBINE-lab/salmon/releases/1263754/assets | grep "\"id" | head -1 | awk '{gsub(/,$/,""); print $2}'`
+
+# Delete the previous tarball
+echo -e "Deleting previous asset\n"
+curl -X DELETE -H "Authorization: token ${SALMON_PUSH_KEY}" https://api.github.com/repos/COMBINE-lab/salmon/releases/assets/$ASSETID
+
+# Upload the new tarball
+echo -e "Uploading new asset\n"
+curl -X POST --data-binary "@SalmonBeta-latest_ubuntu-12.04.tar.gz" https://uploads.github.com/repos/COMBINE-lab/salmon/releases/1263754/assets?name=SalmonBeta-latest_ubuntu-12.04.tar.gz --header "Content-Type:application/gzip" -H "Authorization: token ${SALMON_PUSH_KEY}"
+echo -e "Done!\n"
diff --git a/src/AlignmentModel.cpp b/src/AlignmentModel.cpp
new file mode 100644
index 0000000..a685fb3
--- /dev/null
+++ b/src/AlignmentModel.cpp
@@ -0,0 +1,621 @@
+#include <iostream>
+#include <cstdio>
+#include <tuple>
+#include <map>
+
+#include <boost/config.hpp> // for BOOST_LIKELY/BOOST_UNLIKELY
+
+#include "Transcript.hpp"
+#include "SalmonMath.hpp"
+#include "SalmonStringUtils.hpp"
+#include "UnpairedRead.hpp"
+#include "ReadPair.hpp"
+#include "AlignmentModel.hpp"
+
+AlignmentModel::AlignmentModel(double alpha, uint32_t readBins ) :
+    transitionProbsLeft_(readBins, AtomicMatrix<double>(numAlignmentStates(), numAlignmentStates(), alpha)),
+    transitionProbsRight_(readBins, AtomicMatrix<double>(numAlignmentStates(), numAlignmentStates(), alpha)),
+    isEnabled_(true),
+    readBins_(readBins),
+    burnedIn_(false) {}
+
+bool AlignmentModel::burnedIn() { return burnedIn_; }
+void AlignmentModel::burnedIn(bool burnedIn) { burnedIn_ = burnedIn; }
+
+void AlignmentModel::setLogger(std::shared_ptr<spdlog::logger> logger){
+    logger_ = logger;
+}
+
+bool AlignmentModel::hasLogger(){
+    return (logger_) ? true : false;
+}
+
+bool AlignmentModel::hasIndel(ReadPair& hit) {
+    if (!hit.isPaired()) {
+        return hasIndel(hit.read1);
+    }
+    return hasIndel(hit.read1) or hasIndel(hit.read2);
+}
+
+
+bool AlignmentModel::hasIndel(UnpairedRead& hit) {
+    return hasIndel(hit.read);
+}
+
+
+bool AlignmentModel::hasIndel(bam_seq_t* read) {
+    uint32_t* cigar = bam_cigar(read);
+    uint32_t cigarLen = bam_cigar_len(read);
+
+    for (uint32_t cigarIdx = 0; cigarIdx < cigarLen; ++cigarIdx) {
+        uint32_t opLen = cigar[cigarIdx] >> BAM_CIGAR_SHIFT;
+        enum cigar_op op = static_cast<enum cigar_op>(cigar[cigarIdx] & BAM_CIGAR_MASK);
+
+        switch (op) {
+            case BAM_CINS:
+                return true;
+            case BAM_CDEL:
+                return true;
+            default:
+                break;
+        }
+    }
+    return false;
+}
+
+/*
+inline void AlignmentModel::setBasesFromCIGAROp_(enum cigar_op op, size_t& curRefBase, size_t& curReadBase,
+                                                 std::stringstream& readStr, std::stringstream& matchStr,
+                                                 std::stringstream& refStr) {
+    using salmon::stringtools::twoBitToChar;
+    switch (op) {
+        case BAM_UNKNOWN:
+            std::cerr << "ENCOUNTERED UNKNOWN SYMBOL IN CIGAR STRING!\n";
+            break;
+        case BAM_CMATCH:
+            readStr << twoBitToChar[curReadBase];
+            matchStr << ((curReadBase == curRefBase) ? ' ' : 'X');
+            refStr << twoBitToChar[curRefBase];
+            // do nothing
+            break;
+        case BAM_CBASE_MATCH:
+            readStr << twoBitToChar[curReadBase];
+            matchStr << ' ';
+            refStr << twoBitToChar[curRefBase];
+            // do nothing
+            break;
+        case BAM_CBASE_MISMATCH:
+            readStr << twoBitToChar[curReadBase];
+            matchStr << 'X';
+            refStr << twoBitToChar[curRefBase];
+            // do nothing
+            break;
+        case BAM_CINS:
+            readStr << twoBitToChar[curReadBase];
+            matchStr << ' ';
+            refStr << '-';
+            curRefBase = ALN_DASH;
+            break;
+        case BAM_CDEL:
+            readStr << '-';
+            matchStr << ' ';
+            refStr << twoBitToChar[curRefBase];
+            curReadBase = ALN_DASH;
+            break;
+        case BAM_CREF_SKIP:
+            readStr << 'N';
+            matchStr << ' ';
+            refStr << twoBitToChar[curRefBase];
+            curReadBase = ALN_REF_SKIP;
+            break;
+        case BAM_CSOFT_CLIP:
+            readStr << twoBitToChar[curReadBase];
+            matchStr << ' ';
+            refStr << 'S';
+            curRefBase = ALN_SOFT_CLIP;
+            break;
+        case BAM_CHARD_CLIP:
+            readStr << 'H';
+            matchStr << ' ';
+            refStr << 'H';
+            curRefBase = ALN_HARD_CLIP;
+            curReadBase = ALN_HARD_CLIP;
+            break;
+        case BAM_CPAD:
+            readStr << 'P';
+            matchStr << ' ';
+            refStr << 'P';
+            curRefBase = ALN_PAD;
+            curReadBase = ALN_PAD;
+            break;
+    }
+}
+*/
+
+inline void AlignmentModel::setBasesFromCIGAROp_(enum cigar_op op, size_t& curRefBase, size_t& curReadBase) {
+    switch (op) {
+        case BAM_UNKNOWN:
+            std::cerr << "ENCOUNTERED UNKNOWN SYMBOL IN CIGAR STRING!\n";
+            break;
+        case BAM_CMATCH:
+           // do nothing
+            break;
+        case BAM_CBASE_MATCH:
+            // do nothing
+            break;
+        case BAM_CBASE_MISMATCH:
+            // do nothing
+            break;
+        case BAM_CINS:
+            curRefBase = ALN_DASH;
+            break;
+        case BAM_CDEL:
+            curReadBase = ALN_DASH;
+            break;
+        case BAM_CREF_SKIP:
+            curReadBase = ALN_REF_SKIP;
+            break;
+        case BAM_CSOFT_CLIP:
+            curRefBase = ALN_SOFT_CLIP;
+            break;
+        case BAM_CHARD_CLIP:
+            curRefBase = ALN_HARD_CLIP;
+            curReadBase = ALN_HARD_CLIP;
+            break;
+        case BAM_CPAD:
+            curRefBase = ALN_PAD;
+            curReadBase = ALN_PAD;
+            break;
+    }
+}
+
+char opToChr(enum cigar_op op){
+    switch (op) {
+        case BAM_UNKNOWN:
+            std::cerr << "ENCOUNTERED UNKNOWN SYMBOL IN CIGAR STRING!\n";
+            break;
+        case BAM_CMATCH:
+           // do nothing
+	    return 'M';
+            break;
+        case BAM_CBASE_MATCH:
+            // do nothing
+	    return 'M';
+            break;
+        case BAM_CBASE_MISMATCH:
+            // do nothing
+	    return 'M';
+            break;
+        case BAM_CINS:
+	    return 'I';
+            break;
+        case BAM_CDEL:
+	    return 'D';
+            break;
+        case BAM_CREF_SKIP:
+	    return 'S';
+            break;
+        case BAM_CSOFT_CLIP:
+	    return 'c';
+            break;
+        case BAM_CHARD_CLIP:
+	    return 'C';
+            break;
+        case BAM_CPAD:
+	    return 'P';
+            break;
+    }
+    return 'X';
+}
+
+
+double AlignmentModel::logLikelihood(bam_seq_t* read, Transcript& ref,
+                                 std::vector<AtomicMatrix<double>>& transitionProbs){
+    using namespace salmon::stringtools;
+    bool useQual{false};
+    size_t readIdx{0};
+    auto transcriptIdx = bam_pos(read);
+    size_t transcriptLen = ref.RefLength;
+    // if the read starts before the beginning of the transcript,
+    // only consider the part overlapping the transcript
+    if (transcriptIdx < 0) {
+        readIdx = -transcriptIdx;
+        transcriptIdx = 0;
+    }
+
+    // unsigned version of transcriptIdx
+    size_t uTranscriptIdx = static_cast<size_t>(transcriptIdx);
+
+    if (uTranscriptIdx >= transcriptLen) {
+        std::lock_guard<std::mutex> l(outputMutex_);
+        std::cerr << "transcript index = " << uTranscriptIdx << ", transcript length = " << transcriptLen << "\n";
+        return salmon::math::LOG_0;
+    }
+
+    //std::stringstream readStream, matchStream, refStream;
+
+    uint32_t* cigar = bam_cigar(read);
+    uint32_t cigarLen = bam_cigar_len(read);
+    uint8_t* qseq = reinterpret_cast<uint8_t*>(bam_seq(read));
+    uint8_t* qualStr = reinterpret_cast<uint8_t*>(bam_qual(read));
+    int32_t readLen = bam_seq_len(read);
+
+    if (cigarLen == 0 or !cigar) { return salmon::math::LOG_EPSILON; }
+
+    salmon::stringtools::strand readStrand = salmon::stringtools::strand::forward;
+    double logLike = salmon::math::LOG_1;
+
+    bool advanceInRead{false};
+    bool advanceInReference{false};
+    uint32_t readPosBin{0};
+    uint32_t cigarIdx{0};
+    uint32_t prevStateIdx{startStateIdx};
+    uint32_t curStateIdx{0};
+    double invLen = static_cast<double>(readBins_) / bam_seq_len(read);
+
+    for (uint32_t cigarIdx = 0; cigarIdx < cigarLen; ++cigarIdx) {
+        uint32_t opLen = cigar[cigarIdx] >> BAM_CIGAR_SHIFT;
+        enum cigar_op op = static_cast<enum cigar_op>(cigar[cigarIdx] & BAM_CIGAR_MASK);
+        size_t curReadBase = samToTwoBit[bam_seqi(qseq, readIdx)];
+        size_t curRefBase = samToTwoBit[ref.baseAt(uTranscriptIdx, readStrand)];
+        advanceInRead = false;
+        advanceInReference = false;
+
+
+        for (size_t i = 0; i < opLen; ++i) {
+            if (advanceInRead) {
+                // Shouldn't happen!
+                if (readIdx >= readLen) {
+                    if (logger_) {
+                        logger_->warn("(in logLikelihood()) CIGAR string for read [{}] "
+                            "seems inconsistent. It refers to non-existant "
+                            "positions in the read!", bam_name(read));
+                        std::stringstream cigarStream;
+                        for (size_t j = 0; j < cigarLen; ++j) {
+                            uint32_t opLen = cigar[j] >> BAM_CIGAR_SHIFT;
+                            enum cigar_op op = static_cast<enum cigar_op>(cigar[j] & BAM_CIGAR_MASK);
+                            cigarStream << opLen << opToChr(op);
+                        }
+                        logger_->warn("(in logLikelihood()) CIGAR = {}", cigarStream.str());
+                    }
+                    return logLike;
+                }
+                curReadBase = samToTwoBit[bam_seqi(qseq, readIdx)];
+                readPosBin = static_cast<uint32_t>((readIdx * invLen));
+                advanceInRead = false;
+            }
+            if (advanceInReference) {
+                // Shouldn't happen!
+                if (uTranscriptIdx >= transcriptLen) {
+                    if (logger_) {
+			    logger_->warn("(in logLikelihood()) CIGAR string for read [{}] "
+					    "seems inconsistent. It refers to non-existant "
+					    "positions in the reference! Transcript name "
+	    				    "is {}, length is {}, id is {}. Read things refid is {}",  
+					    bam_name(read), ref.RefName, transcriptLen, ref.id, bam_ref(read));
+                    }
+                    return logLike;
+                }
+                curRefBase = samToTwoBit[ref.baseAt(uTranscriptIdx, readStrand)];
+                advanceInReference = false;
+            }
+
+
+            setBasesFromCIGAROp_(op, curRefBase, curReadBase);//, readStream, matchStream, refStream);
+            curStateIdx = curRefBase * numStates + curReadBase;
+            double tp = transitionProbs[readPosBin](prevStateIdx, curStateIdx);
+            logLike += tp;
+            prevStateIdx = curStateIdx;
+            if (BAM_CONSUME_SEQ(op)) {
+                ++readIdx;
+                advanceInRead = true;
+            }
+            if (BAM_CONSUME_REF(op)) {
+                ++uTranscriptIdx;
+                advanceInReference = true;
+            }
+
+        }
+    }
+
+    /*
+    {
+        std::lock_guard<std::mutex> l(outputMutex_);
+        std::cerr << "\n\nread:   " << readStream.str() << "\n";
+        std::cerr << "        " << matchStream.str() << "\n";
+        std::cerr << "ref:    " << refStream.str() << "\n";
+        //std::cerr << "states: " << stateStream.str() << "\n";
+    }
+    */
+
+    return logLike;
+}
+
+double AlignmentModel::logLikelihood(const ReadPair& hit, Transcript& ref){
+    double logLike = salmon::math::LOG_1;
+    if (BOOST_UNLIKELY(!isEnabled_)) { return logLike; }
+
+    if (!hit.isPaired()) {
+        if (hit.isLeftOrphan()) {
+            return logLikelihood(hit.read1, ref, transitionProbsLeft_);
+        } else {
+            return logLikelihood(hit.read1, ref, transitionProbsRight_);
+        }
+    }
+
+    bam_seq_t* leftRead = (bam_pos(hit.read1) < bam_pos(hit.read2)) ? hit.read1 : hit.read2;
+    bam_seq_t* rightRead = (bam_pos(hit.read1) < bam_pos(hit.read2)) ? hit.read2 : hit.read1;
+
+    size_t leftLen = static_cast<size_t>(bam_seq_len(leftRead));
+    size_t rightLen = static_cast<size_t>(bam_seq_len(rightRead));
+
+    if (leftRead) {
+        logLike += logLikelihood(leftRead, ref, transitionProbsLeft_);
+    }
+
+    if (rightRead) {
+        logLike += logLikelihood(rightRead, ref, transitionProbsRight_);
+    }
+    if (logLike == salmon::math::LOG_0) {
+            std::lock_guard<std::mutex> lock(outputMutex_);
+            std::cerr << "orphan status: " << hit.orphanStatus << "\n";
+            std::cerr << "error likelihood: " << logLike << "\n";
+    }
+
+    return logLike;
+}
+
+double AlignmentModel::logLikelihood(const UnpairedRead& hit, Transcript& ref){
+    double logLike = salmon::math::LOG_1;
+    if (BOOST_UNLIKELY(!isEnabled_)) { return logLike; }
+
+    bam_seq_t* read = hit.read;
+    size_t readLen = static_cast<size_t>(bam_seq_len(read));
+    // NOTE: Raise a warning in this case?
+    /*
+    if (BOOST_UNLIKELY(readLen > maxExpectedLen_)) {
+        return logLike;
+    }
+    */
+
+    logLike += logLikelihood(read, ref, transitionProbsLeft_);
+
+    if (logLike == salmon::math::LOG_0) {
+            std::lock_guard<std::mutex> lock(outputMutex_);
+            std::cerr << "error log likelihood: " << logLike << "\n";
+    }
+
+    return logLike;
+}
+
+void AlignmentModel::update(const UnpairedRead& hit, Transcript& ref, double p, double mass){
+    if (mass == salmon::math::LOG_0) { return; }
+    if (BOOST_UNLIKELY(!isEnabled_)) { return; }
+    bam_seq_t* leftRead = hit.read;
+    update(leftRead, ref, p, mass, transitionProbsLeft_);
+}
+
+void AlignmentModel::update(bam_seq_t* read, Transcript& ref, double p, double mass,
+                        std::vector<AtomicMatrix<double>>& transitionProbs) {
+    using namespace salmon::stringtools;
+    bool useQual{false};
+    size_t readIdx{0};
+    auto transcriptIdx = bam_pos(read);
+    size_t transcriptLen = ref.RefLength;
+    // if the read starts before the beginning of the transcript,
+    // only consider the part overlapping the transcript
+    if (transcriptIdx < 0) {
+        readIdx = -transcriptIdx;
+        transcriptIdx = 0;
+    }
+    // unsigned version of transcriptIdx
+    size_t uTranscriptIdx = static_cast<size_t>(transcriptIdx);
+
+    //std::stringstream readStream, matchStream, refStream;
+
+    uint32_t* cigar = bam_cigar(read);
+    uint32_t cigarLen = bam_cigar_len(read);
+    uint8_t* qseq = reinterpret_cast<uint8_t*>(bam_seq(read));
+    uint8_t* qualStr = reinterpret_cast<uint8_t*>(bam_qual(read));
+    int32_t readLen = bam_seq_len(read);
+
+    if (cigarLen > 0 and cigar) {
+
+        salmon::stringtools::strand readStrand = salmon::stringtools::strand::forward;
+
+        bool advanceInRead{false};
+        bool advanceInReference{false};
+        uint32_t readPosBin{0};
+        uint32_t cigarIdx{0};
+        uint32_t prevStateIdx{startStateIdx};
+        uint32_t curStateIdx{0};
+        double invLen = static_cast<double>(readBins_) / readLen;
+
+
+        for (uint32_t cigarIdx = 0; cigarIdx < cigarLen; ++cigarIdx) {
+            uint32_t opLen = cigar[cigarIdx] >> BAM_CIGAR_SHIFT;
+            enum cigar_op op = static_cast<enum cigar_op>(cigar[cigarIdx] & BAM_CIGAR_MASK);
+            size_t curReadBase = samToTwoBit[bam_seqi(qseq, readIdx)];
+            size_t curRefBase = samToTwoBit[ref.baseAt(uTranscriptIdx, readStrand)];
+            advanceInRead = false;
+            advanceInReference = false;
+
+            for (size_t i = 0; i < opLen; ++i) {
+                if (advanceInRead) {
+                    // Shouldn't happen!
+                    if (readIdx >= readLen) {
+                        if (logger_) {
+                            logger_->warn("(in update()) CIGAR string for read [{}] "
+                                "seems inconsistent. It refers to non-existant "
+                                "positions in the read!", bam_name(read));
+                            std::stringstream cigarStream;
+                            for (size_t j = 0; j < cigarLen; ++j) {
+                                uint32_t opLen = cigar[j] >> BAM_CIGAR_SHIFT;
+                                enum cigar_op op = static_cast<enum cigar_op>(cigar[j] & BAM_CIGAR_MASK);
+                                cigarStream << opLen << opToChr(op);
+                            }
+                            logger_->warn("CIGAR = {}", cigarStream.str());
+                        }
+                        return;
+                    }
+                    curReadBase = samToTwoBit[bam_seqi(qseq, readIdx)];
+                    readPosBin = static_cast<uint32_t>((readIdx * invLen));
+                    advanceInRead = false;
+                }
+                if (advanceInReference) {
+                    // Shouldn't happen!
+                    if (uTranscriptIdx >= transcriptLen) {
+                        if (logger_) {
+			    logger_->warn("(in update()) CIGAR string for read [{}] "
+					    "seems inconsistent. It refers to non-existant "
+					    "positions in the reference! Transcript name "
+	    				    "is {}, length is {}, id is {}. Read things refid is {}",  
+					    bam_name(read), ref.RefName, transcriptLen, ref.id, bam_ref(read));
+                               }
+                        return;
+                    }
+                    curRefBase = samToTwoBit[ref.baseAt(uTranscriptIdx, readStrand)];
+                    advanceInReference = false;
+                }
+
+                setBasesFromCIGAROp_(op, curRefBase, curReadBase);//, readStream, matchStream, refStream);
+                curStateIdx = curRefBase * numStates + curReadBase;
+
+                // update the state in the actual model
+                transitionProbs[readPosBin].increment(prevStateIdx, curStateIdx, mass+p);
+
+                prevStateIdx = curStateIdx;
+                if (BAM_CONSUME_SEQ(op)) {
+                    ++readIdx;
+                    advanceInRead = true;
+                    /* DEBUG -- print what happened
+                       std::cerr << "read name = " << bam_name(read) << "\n";
+                       std::cerr << "curReadBase = " << readIdx << "\n";
+                       std::cerr << "readLen = " << bam_seq_len(read) << "\n";
+                       std::cerr << "ref = ";
+                       for (size_t j = 0; j < std::min(static_cast<size_t>(bam_seq_len(read)),
+                       static_cast<size_t>(transcriptLen - transcriptIdx)); ++j) {
+                       std::cerr << salmon::stringtools::samCodeToChar[ref.baseAt(j, readStrand)];
+                       }
+                       std::cerr << "\n";
+                       std::cerr << "read = ";
+                       for (size_t j = 0; j < bam_seq_len(read); ++j) {
+                       std::cerr << salmon::stringtools::samCodeToChar[bam_seqi(qseq, j)];
+                       }
+                       std::cerr << "\nCIGAR = ";
+                       for (size_t j = 0; j < cigarLen; ++j) {
+                       uint32_t opLen = cigar[j] >> BAM_CIGAR_SHIFT;
+                       enum cigar_op op = static_cast<enum cigar_op>(cigar[j] & BAM_CIGAR_MASK);
+                       std::cerr << opLen << opToChr(op);
+                       }
+                       std::cerr << "\n";
+                       */
+                }
+                if (BAM_CONSUME_REF(op)) {
+                    ++uTranscriptIdx;
+                    advanceInReference = true;
+                }
+           }
+        }
+    } // if we had a cigar string
+}
+
+void AlignmentModel::update(const ReadPair& hit, Transcript& ref, double p, double mass){
+    if (mass == salmon::math::LOG_0) { return; }
+    if (BOOST_UNLIKELY(!isEnabled_)) { return; }
+
+    if (hit.isPaired()){
+        bam_seq_t* leftRead = (bam_pos(hit.read1) < bam_pos(hit.read2)) ? hit.read1 : hit.read2;
+        bam_seq_t* rightRead = (bam_pos(hit.read1) < bam_pos(hit.read2)) ? hit.read2 : hit.read1;
+	update(leftRead, ref, p, mass, transitionProbsLeft_);
+        update(rightRead, ref, p, mass, transitionProbsRight_);
+    } else if (hit.isLeftOrphan()) {
+	bam_seq_t* read = hit.read1;
+	update(read, ref, p, mass, transitionProbsLeft_);
+    } else if (hit.isRightOrphan()) {
+	bam_seq_t* read = hit.read1;
+	update(read, ref, p, mass, transitionProbsRight_);
+    }
+}
+
+// CIGAR string with printing
+/*
+    std::stringstream readStr;
+    std::stringstream matchStr;
+    std::stringstream refStr;
+    std::stringstream stateStr;
+
+switch (op) {
+                    case BAM_UNKNOWN:
+                        std::cerr << "ENCOUNTERED UNKNOWN SYMBOL IN CIGAR STRING!\n";
+                        break;
+                    case BAM_CMATCH:
+                        readStr << twoBitToChar[curReadBase];
+                        matchStr << ((curReadBase == curRefBase) ? ' ' : 'X');
+                        refStr << twoBitToChar[curRefBase];
+                        // do nothing
+                        break;
+                    case BAM_CBASE_MATCH:
+                        readStr << twoBitToChar[curReadBase];
+                        matchStr << ' ';
+                        refStr << twoBitToChar[curRefBase];
+                        // do nothing
+                        break;
+                    case BAM_CBASE_MISMATCH:
+                        readStr << twoBitToChar[curReadBase];
+                        matchStr << 'X';
+                        refStr << twoBitToChar[curRefBase];
+                        // do nothing
+                        break;
+                    case BAM_CINS:
+                        readStr << twoBitToChar[curReadBase];
+                        matchStr << ' ';
+                        refStr << '-';
+                        curRefBase = ALN_DASH;
+                        break;
+                    case BAM_CDEL:
+                        readStr << '-';
+                        matchStr << ' ';
+                        refStr << twoBitToChar[curRefBase];
+                        curReadBase = ALN_DASH;
+                        break;
+                    case BAM_CREF_SKIP:
+                        readStr << 'N';
+                        matchStr << ' ';
+                        refStr << twoBitToChar[curRefBase];
+                        curReadBase = ALN_REF_SKIP;
+                        break;
+                    case BAM_CSOFT_CLIP:
+                        readStr << twoBitToChar[curReadBase];
+                        matchStr << ' ';
+                        refStr << 'S';
+                        curRefBase = ALN_SOFT_CLIP;
+                        break;
+                    case BAM_CHARD_CLIP:
+                        readStr << 'H';
+                        matchStr << ' ';
+                        refStr << 'H';
+                        curRefBase = ALN_HARD_CLIP;
+                        curReadBase = ALN_HARD_CLIP;
+                        break;
+                    case BAM_CPAD:
+                        readStr << 'P';
+                        matchStr << ' ';
+                        refStr << 'P';
+                        curRefBase = ALN_PAD;
+                        curReadBase = ALN_PAD;
+                        break;
+                }
+
+                curStateIdx = curRefBase * numStates + curReadBase;
+
+
+               stateStr << curStateIdx << ", ";
+        {
+            std::lock_guard<std::mutex> l(outputMutex_);
+            std::cerr << "\n\nread:   " << readStr.str() << "\n";
+            std::cerr << "        " << matchStr.str() << "\n";
+            std::cerr << "ref:    " << refStr.str() << "\n";
+            std::cerr << "states: " << stateStr.str() << "\n";
+        }
+*/
diff --git a/src/BiasCorrectionDriver.cpp b/src/BiasCorrectionDriver.cpp
new file mode 100644
index 0000000..86c7973
--- /dev/null
+++ b/src/BiasCorrectionDriver.cpp
@@ -0,0 +1,52 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <boost/filesystem.hpp>
+
+namespace bfs = boost::filesystem;
+
+int performBiasCorrection(
+        bfs::path featureFile,
+        bfs::path expressionFile,
+        double estimatedReadLength,
+        double kmersPerRead,
+        uint64_t mappedKmers,
+        uint32_t merLen,
+        bfs::path outputFile,
+        size_t numThreads);
+
+int main(int argc, char* argv[]) {
+        bfs::path featureFile(argv[1]);
+        bfs::path expressionFile(argv[2]);
+        double estimatedReadLength = atod(argv[3]);
+        double kmersPerRead = atod(argv[4]);
+        uint64_t mappedKmers = atol(argv[5]);
+        uint32_t mappedKmers = atoi(argv[6]);
+        bfs::path outputFile(argv[7]);
+        size_t numThreads = atoi(argv[8]);
+
+        performBiasCorrection(featureFile, expressionFile, estimatedReadLength, kmersPerRead,
+                              mappedKmers, merLen, outputFile, numThreads);
+}
diff --git a/src/BuildSalmonIndex.cpp b/src/BuildSalmonIndex.cpp
new file mode 100644
index 0000000..1255d5d
--- /dev/null
+++ b/src/BuildSalmonIndex.cpp
@@ -0,0 +1,279 @@
+#include <boost/thread/thread.hpp>
+
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <algorithm>
+#include <limits>
+#include <atomic>
+#include <chrono>
+#include <thread>
+#include <functional>
+#include <memory>
+#include <cassert>
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+
+#include "cereal/types/vector.hpp"
+#include "cereal/archives/binary.hpp"
+
+#include "jellyfish/config.h"
+#include "jellyfish/err.hpp"
+#include "jellyfish/misc.hpp"
+#include "jellyfish/jellyfish.hpp"
+#include "jellyfish/stream_manager.hpp"
+#include "jellyfish/whole_sequence_parser.hpp"
+
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/parsers.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/filesystem.hpp>
+#include <boost/range/irange.hpp>
+#include <boost/filesystem.hpp>
+
+#include "tbb/parallel_for_each.h"
+#include "tbb/parallel_for.h"
+#include "tbb/task_scheduler_init.h"
+#include "tbb/parallel_sort.h"
+
+#include "Transcript.hpp"
+#include "SalmonUtils.hpp"
+#include "GenomicFeature.hpp"
+#include "format.h"
+#include "spdlog/spdlog.h"
+
+using my_mer = jellyfish::mer_dna_ns::mer_base_static<uint64_t, 1>;
+
+typedef jellyfish::stream_manager<char**>                stream_manager;
+typedef jellyfish::whole_sequence_parser<stream_manager> sequence_parser;
+
+extern "C" {
+int bwa_index(int argc, char* argv[]);
+}
+
+int computeBiasFeatures(
+    std::vector<std::string>& transcriptFiles,
+    boost::filesystem::path outFilePath,
+    bool useStreamingParser,
+    size_t numThreads);
+
+/*
+bool buildAuxKmerIndex(boost::filesystem::path& outputPrefix, uin32_t k,
+                       std::shared_ptr<spdlog::logger>& logger){
+    namespace bfs = boost::filesystem;
+
+    bfs::path indexPath = outputPrefix / "bwaindex";
+    // Load the bwa index
+    bwaidx_t *idx_{nullptr};
+    {
+        logger->info("Reading BWT index from file");
+        if ((idx_ = bwa_idx_load(indexPath.string().c_str(), BWA_IDX_BWT|BWA_IDX_BNS|BWA_IDX_PAC)) == 0) {
+            logger->error("Couldn't open index [{}] --- ", indexPath);
+            logger->error("Please make sure that 'salmon index' has been run successfully");
+            std::exit(1);
+        }
+    }
+
+    size_t numRecords = idx_->bns->n_seqs;
+    std::vector<Transcript> transcripts_tmp;
+    { // Load transcripts from file
+
+            logger->info("Index contained {} targets; loading them", numRecords);
+            //transcripts_.resize(numRecords);
+            for (auto i : boost::irange(size_t(0), numRecords)) {
+                uint32_t id = i;
+                char* name = idx_->bns->anns[i].name;
+                uint32_t len = idx_->bns->anns[i].len;
+                // copy over the length, then we're done.
+                transcripts_tmp.emplace_back(id, name, len);
+            }
+
+            std::sort(transcripts_tmp.begin(), transcripts_tmp.end(),
+                    [](const Transcript& t1, const Transcript& t2) -> bool {
+                    return t1.id < t2.id;
+                    });
+            double alpha = 0.005;
+            char nucTab[256];
+            nucTab[0] = 'A'; nucTab[1] = 'C'; nucTab[2] = 'G'; nucTab[3] = 'T';
+            for (size_t i = 4; i < 256; ++i) { nucTab[i] = 'N'; }
+
+            // Load the transcript sequence from file
+            for (auto& t : transcripts_tmp) {
+                transcripts_.emplace_back(t.id, t.RefName.c_str(), t.RefLength, alpha);
+                // from BWA
+                uint8_t* rseq = nullptr;
+                int64_t tstart, tend, compLen, l_pac = idx_->bns->l_pac;
+                tstart  = idx_->bns->anns[t.id].offset;
+                tend = tstart + t.RefLength;
+                rseq = bns_get_seq(l_pac, idx_->pac, tstart, tend, &compLen);
+                if (compLen != t.RefLength) {
+                    fmt::print(stderr,
+                               "For transcript {}, stored length ({}) != computed length ({}) --- index may be corrupt. exiting\n",
+                               t.RefName, compLen, t.RefLength);
+                    std::exit(1);
+                }
+
+                std::string seq(t.RefLength, ' ');
+                if (rseq != 0) {
+                    for (size_t i = 0; i < compLen; ++i) {
+                        seq[i] = rseq[i];
+                                 //nst_nt4_table[static_cast<int>(nucTab[rseq[i]])];
+                    }
+                }
+                transcripts_.back().Sequence = salmon::stringtools::encodeSequenceInSAM(seq.c_str(), t.RefLength);
+                free(rseq);
+                // end BWA code
+            }
+            // Since we have the de-coded reference sequences, we no longer need
+            // the encoded sequences, so free them.
+            free(idx_->pac); idx_->pac = nullptr;
+            transcripts_tmp.clear();
+            // ====== Done loading the transcripts from file
+}
+*/
+
+// Cool way to do this from 
+// http://stackoverflow.com/questions/108318/whats-the-simplest-way-to-test-whether-a-number-is-a-power-of-2-in-c
+bool isPowerOfTwo(uint32_t n) {
+  return (n > 0 and (n & (n-1)) == 0);
+}
+
+int salmonIndex(int argc, char* argv[]) {
+
+    using std::string;
+    namespace bfs = boost::filesystem;
+    namespace po = boost::program_options;
+
+    bool useStreamingParser = true;
+
+    uint32_t saSampInterval = 1;
+    uint32_t maxThreads = std::thread::hardware_concurrency();
+    uint32_t numThreads;
+
+    po::options_description generic("Command Line Options");
+    generic.add_options()
+    ("version,v", "print version string")
+    ("help,h", "produce help message")
+    ("transcripts,t", po::value<string>()->required(), "Transcript fasta file.")
+    ("index,i", po::value<string>()->required(), "Salmon index.")
+    ("threads,p", po::value<uint32_t>(&numThreads)->default_value(maxThreads)->required(),
+                            "Number of threads to use (only used for computing bias features)")
+    ("sasamp,s", po::value<uint32_t>(&saSampInterval)->default_value(1)->required(),
+                            "The interval at which the suffix array should be sampled. "
+                            "Smaller values are faster, but produce a larger index. "
+                            "The default should be OK, unless your transcriptome is huge. "
+			    "This value should be a power of 2.")
+    ;
+
+    po::variables_map vm;
+    int ret = 1;
+    try {
+
+        po::store(po::command_line_parser(argc, argv).options(generic).run(), vm);
+
+        if ( vm.count("help") ) {
+            auto hstring = R"(
+Index
+==========
+Creates a salmon index.
+)";
+            std::cout << hstring << std::endl;
+            std::cout << generic << std::endl;
+            std::exit(1);
+        }
+        po::notify(vm);
+
+	uint32_t sasamp = vm["sasamp"].as<uint32_t>();
+	if (!isPowerOfTwo(sasamp)) {
+	  fmt::MemoryWriter errWriter;
+	  errWriter << "Error: The suffix array sampling interval must be "
+		       "a power of 2. The value provided, " << sasamp << ", is not.";
+	  throw(std::logic_error(errWriter.str()));
+	}
+
+        fmt::MemoryWriter optWriter;
+        optWriter << vm["sasamp"].as<uint32_t>();
+
+        string transcriptFile = vm["transcripts"].as<string>();
+        bfs::path indexDirectory(vm["index"].as<string>());
+
+
+        if (!bfs::exists(indexDirectory)) {
+            std::cerr << "index [" << indexDirectory << "] did not previously exist "
+                      << " . . . creating it\n";
+            bfs::create_directory(indexDirectory);
+        }
+
+        bfs::path logPath = indexDirectory / "indexing.log";
+        size_t max_q_size = 2097152;
+        //spdlog::set_async_mode(max_q_size);
+
+        auto fileSink = std::make_shared<spdlog::sinks::simple_file_sink_mt>(logPath.string(), true);
+        auto consoleSink = std::make_shared<spdlog::sinks::stderr_sink_mt>();
+        auto consoleLog = spdlog::create("consoleLog", {consoleSink});
+        auto fileLog = spdlog::create("fileLog", {fileSink});
+        auto jointLog = spdlog::create("jointLog", {fileSink, consoleSink});
+
+        // First, compute the transcript features in case the user
+        // ever wants to bias-correct his / her results
+        // NOTE: Currently, we're using the same bias correction technique here that
+        // we use in Sailfish. In the future, test more "traditional" bias correction
+        // techniques to see if we should adopt them instead
+        bfs::path transcriptBiasFile(indexDirectory); transcriptBiasFile /= "bias_feats.txt";
+
+        std::vector<std::string> transcriptFiles = {transcriptFile};
+        fmt::MemoryWriter infostr;
+        infostr << "computeBiasFeatures( {";
+        for (auto& tf : transcriptFiles) {
+            infostr << "[" << tf << "] ";
+        }
+        infostr << ", " << transcriptBiasFile.c_str() << ", " << useStreamingParser << ", " << numThreads << ")\n";
+        jointLog->info() << infostr.str();
+        computeBiasFeatures(transcriptFiles, transcriptBiasFile, useStreamingParser, numThreads);
+        // ==== finished computing bias fetures
+
+        bfs::path outputPrefix = indexDirectory / "bwaidx";
+
+        std::vector<char const*> bwaArgVec{ "index",
+                                    "-s",
+                                    optWriter.str().c_str(),
+                                    "-p",
+                                    outputPrefix.string().c_str(),
+                                    transcriptFile.c_str() };
+
+        char* bwaArgv[] = { const_cast<char*>(bwaArgVec[0]),
+                            const_cast<char*>(bwaArgVec[1]),
+                            const_cast<char*>(bwaArgVec[2]),
+                            const_cast<char*>(bwaArgVec[3]),
+                            const_cast<char*>(bwaArgVec[4]),
+                            const_cast<char*>(bwaArgVec[5]) };
+        int bwaArgc = 6;
+
+        ret = bwa_index(bwaArgc, bwaArgv);
+
+        jointLog->info("done building BWT Index");
+
+        // If we want to build the auxiliary k-mer index, do it here.
+        /*
+        uint32_t k = 15;
+        buildAuxKmerIndex(outputPrefix, k, jointLog);
+        */
+
+    } catch (po::error &e) {
+        std::cerr << "exception : [" << e.what() << "]. Exiting.\n";
+        std::exit(1);
+    } catch (const spdlog::spdlog_ex& ex) {
+        std::cout << "logger failed with : [" << ex.what() << "]. Exiting.\n";
+    } catch (std::exception& e) {
+        std::cerr << "Exception : [" << e.what() << "]\n";
+        std::cerr << argv[0] << " index was invoked improperly.\n";
+        std::cerr << "For usage information, try " << argv[0] << " index --help\nExiting.\n";
+    }
+    return ret;
+}
+
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100755
index 0000000..c2ad83e
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,266 @@
+set ( SALMON_MAIN_SRCS
+QSufSort.c
+is.c
+bwt_gen.c
+bwtindex.c
+CollapsedEMOptimizer.cpp
+CollapsedGibbsSampler.cpp
+Salmon.cpp
+BuildSalmonIndex.cpp
+SalmonQuantify.cpp
+FragmentLengthDistribution.cpp 
+FragmentStartPositionDistribution.cpp
+SequenceBiasModel.cpp
+StadenUtils.cpp
+TranscriptGroup.cpp
+)
+
+set (SALMON_ALIGN_SRCS
+FASTAParser.cpp 
+ErrorModel.cpp 
+AlignmentModel.cpp
+FragmentLengthDistribution.cpp 
+SalmonQuantifyAlignments.cpp 
+)
+
+set (SALMON_LIB_SRCS
+LibraryFormat.cpp
+GenomicFeature.cpp
+VersionChecker.cpp
+SalmonUtils.cpp
+SalmonStringUtils.cpp
+ComputeBiasFeatures.cpp
+PerformBiasCorrection.cpp
+cokus.cpp
+merge_files.cc
+format.cc
+)
+
+include_directories( 
+${GAT_SOURCE_DIR}/include
+${GAT_SOURCE_DIR}/include/eigen3
+${GAT_SOURCE_DIR}/external
+${GAT_SOURCE_DIR}/external/cereal/include
+${GAT_SOURCE_DIR}/external/install/include
+${GAT_SOURCE_DIR}/external/install/include/jellyfish-2.1.3
+${GAT_SOURCE_DIR}/external/install/include/bwa
+${ZLIB_INCLUDE_DIR}
+${TBB_INCLUDE_DIRS}
+${Boost_INCLUDE_DIRS}
+)
+
+link_directories( 
+${GAT_SOURCE_DIR}/lib
+${GAT_SOURCE_DIR}/external/install/lib
+${Boost_LIBRARY_DIRS}
+${TBB_LIBRARY_DIRS}
+${LAPACK_LIBRARY_DIR}
+${BLAS_LIBRARY_DIR}
+)
+
+message("TBB_LIBRARIES = ${TBB_LIBRARIES}")
+message("Boost_LIBRARIES = ${Boost_LIBRARIES}")
+
+# Set the RPATH 
+if (APPLE)
+    ## This DOES NOT do what I / any one sane, expects.  Setting the 
+    ## linker path on OSX is messed up.  Just tell the user to use 
+    ## DYLD_FALLBACK_LIBRARY_PATH for now
+    set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+else()
+    set(CMAKE_INSTALL_RPATH "$ORIGIN/../lib:$ORIGIN/../../lib:$ORIGIN/:$ORIGIN/../../external/install/lib")
+endif()
+
+set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
+
+# Build the Salmon library
+add_library(salmon_core STATIC ${SALMON_LIB_SRCS} )
+
+# Build the salmon executable
+add_executable(salmon ${SALMON_MAIN_SRCS} ${SALMON_ALIGN_SRCS})
+
+#add_executable(salmon-read ${SALMON_READ_SRCS})
+#set_target_properties(salmon-read PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_LIBPTHREAD -D_PBGZF_USE -fopenmp"
+#    LINK_FLAGS "-DHAVE_LIBPTHREAD -D_PBGZF_USE -fopenmp")
+
+#set_target_properties(salmon_core salmon PROPERTIES LINK_SEARCH_END_STATIC TRUE)
+
+# Grumble grumble . . . OSX
+if (APPLE)
+    # only attempt install_name_tool for tbb if we installed it
+    if (${TBB_LIBRARY_DIRS} MATCHES ${GAT_SOURCE_DIR}/external/install/lib)
+        add_custom_command(TARGET salmon
+            PRE_LINK
+            COMMAND install_name_tool -id @rpath/libtbb.dylib ${TBB_LIBRARY_DIRS}/libtbb.dylib
+            COMMAND install_name_tool -id @rpath/libtbbmalloc.dylib ${TBB_LIBRARY_DIRS}/libtbbmalloc.dylib
+        )
+    endif()
+
+else()
+    # related to complete static linking --- on hold	
+    set (BOOST_THREAD_LIBRARY)
+endif()
+
+# Link the executable
+target_link_libraries(salmon
+    salmon_core 
+    gff
+    ${PTHREAD_LIB}
+    ${Boost_LIBRARIES} 
+    ${GAT_SOURCE_DIR}/external/install/lib/libstaden-read.a
+    ${ZLIB_LIBRARY} 
+    ${GAT_SOURCE_DIR}/external/install/lib/libjellyfish-2.0.a 
+    ${GAT_SOURCE_DIR}/external/install/lib/libbwa.a
+    m
+    ${LIBLZMA_LIBRARIES}
+    ${BZIP2_LIBRARIES}
+    ${TBB_LIBRARIES}
+    ${LIBSALMON_LINKER_FLAGS}
+    ${NON_APPLECLANG_LIBS}
+    ${FAST_MALLOC_LIB}
+)
+
+add_dependencies(salmon libbwa)
+
+##
+#  This ensures that the salmon executable should work with or without `make install`
+##
+if (APPLE)
+	add_custom_command(TARGET salmon
+		POST_BUILD
+		COMMAND install_name_tool -add_rpath ${GAT_SOURCE_DIR}/external/install/lib salmon 
+		COMMAND install_name_tool -add_rpath @executable_path/../lib salmon 
+		)
+endif()
+
+##### ======================================
+
+IF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+  SET(CMAKE_INSTALL_PREFIX
+    "${GAT_SOURCE_DIR}" CACHE PATH "Default install prefix" FORCE
+    )
+ENDIF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+
+set(INSTALL_LIB_DIR lib )
+set(INSTALL_BIN_DIR bin )
+set(INSTALL_INCLUDE_DIR include )
+
+install(DIRECTORY 
+        ${GAT_SOURCE_DIR}/external/install/lib/ 
+        DESTINATION ${INSTALL_LIB_DIR}
+	    FILES_MATCHING PATTERN "libtbb*.${SHARED_LIB_EXTENSION}*"
+    )
+if (APPLE)
+install(DIRECTORY 
+        ${GAT_SOURCE_DIR}/external/install/lib/ 
+        DESTINATION ${INSTALL_LIB_DIR}
+	    FILES_MATCHING PATTERN "libcmph*.${SHARED_LIB_EXTENSION}*"
+       )
+endif()
+
+
+# install(FILES ${Boost_LIBRARIES}
+# 	           DESTINATION ${INSTALL_LIB_DIR})
+
+install(TARGETS salmon salmon_core
+                RUNTIME DESTINATION bin 
+                LIBRARY DESTINATION lib
+                ARCHIVE DESTINATION lib
+        )
+
+set(POST_INSTALL_SCRIPT ${GAT_SOURCE_DIR}/cmake/PostInstall.cmake)
+
+install(
+    CODE 
+    "
+    execute_process(COMMAND \"${CMAKE_COMMAND}\"
+                            -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
+                            -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} 
+                            -P \"${POST_INSTALL_SCRIPT}\")
+    "
+)
+
+include(InstallRequiredSystemLibraries)
+add_test( NAME salmon_read_test COMMAND ${CMAKE_COMMAND} -DTOPLEVEL_DIR=${GAT_SOURCE_DIR} -P ${GAT_SOURCE_DIR}/cmake/TestSalmon.cmake )
+
+####
+#
+# Deprecated or currently unused
+#
+####
+
+
+# # use, i.e. don't skip the full RPATH for the build tree
+# SET(CMAKE_SKIP_BUILD_RPATH  FALSE)
+
+# # when building, don't use the install RPATH already
+# # (but later on when installing)
+# SET(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) 
+# MESSAGE( "LINK DIRECTORIES : ${CMAKE_LIBRARY_PATH}")
+# SET(CMAKE_INSTALL_RPATH "${CMAKE_LIBRARY_PATH}")
+
+# # add the automatically determined parts of the RPATH
+# # which point to directories outside the build tree to the install RPATH
+# SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+
+
+# # the RPATH to be used when installing, but only if it's not a system directory
+# LIST(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES "${CMAKE_INSTALL_PREFIX}/lib" isSystemDir)
+# IF("${isSystemDir}" STREQUAL "-1")
+#    SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
+# ENDIF("${isSystemDir}" STREQUAL "-1")
+
+
+# set( BUILD_LUT_SRCS 
+# BuildLUT.cpp
+# IndexedCounter.cpp
+# PerfectHashIndexer.cpp
+# )
+
+# add_executable( BuildLUT ${BUILD_LUT_SRCS} )
+# target_link_libraries( BuildLUT 
+# 	${Boost_LIBRARIES} ${ZLIB_LIBRARY} 
+# 	cmph # perfect hashing library
+# 	jellyfish-2.0 pthread 
+#     gomp lib_activeobject lib_g2logger m tbb)
+
+# INSTALL(TARGETS BuildLUT DESTINATION ${PROJECT_SOURCE_DIR}/bin COMPONENT comp_buildlut)
+
+# set ( BUILD_INDEX_SRCS Indexer.cpp )
+
+# set( ANALYZE_TRANSCRIPT_GRAPH 
+# AnalyzeTranscriptGraph.cpp
+# IndexedCounter.cpp
+# PerfectHashIndexer.cpp
+# )
+
+# set ( PROBES_TO_GENES_SRC MapProbesToGenes.cpp affymetrix_utils.cpp )
+# set( COMPUTE_TGRAPH_SRCS
+# ComputeTranscriptGraph.cpp
+# IndexedCounter.cpp
+# PerfectHashIndexer.cpp
+# )
+
+# add_executable( ComputeTranscriptGraph ${COMPUTE_TGRAPH_SRCS} )
+# target_link_libraries( ComputeTranscriptGraph
+# 	${Boost_LIBRARIES} ${ZLIB_LIBRARY} 
+# 	cmph # perfect hashing library
+# 	jellyfish pthread 
+#     gomp lib_activeobject lib_g2logger blas m tbb nnls)
+
+
+# add_executable( AnalyzeTranscriptGraph ${ANALYZE_TRANSCRIPT_GRAPH} )
+# target_link_libraries( AnalyzeTranscriptGraph 
+# 	${Boost_LIBRARIES} ${ZLIB_LIBRARY} 
+# 	cmph # perfect hashing library
+# 	jellyfish pthread 
+#     gomp lib_activeobject lib_g2logger blas m tbb)
+
+
+#add_executable( BuildIndex ${BUILD_INDEX_SRCS} )
+#target_link_libraries( BuildIndex ${Boost_LIBRARIES} jellyfish pthread m tbb)
+
+
+#add_executable( TestTsnnls ${TEST_SRCS} )
+#target_link_libraries( TestTsnnls ${Boost_LIBRARIES} ${ZLIB_LIBRARY} blas lapack gomp lib_activeobject lib_g2logger tsnnls )
+
diff --git a/src/CollapsedEMOptimizer.cpp b/src/CollapsedEMOptimizer.cpp
new file mode 100644
index 0000000..1fcba96
--- /dev/null
+++ b/src/CollapsedEMOptimizer.cpp
@@ -0,0 +1,449 @@
+#include <vector>
+#include <unordered_map>
+#include <atomic>
+
+#include "tbb/task_scheduler_init.h"
+#include "tbb/parallel_for.h"
+#include "tbb/parallel_for_each.h"
+#include "tbb/parallel_reduce.h"
+#include "tbb/blocked_range.h"
+#include "tbb/partitioner.h"
+
+//#include "fastapprox.h"
+#include <boost/math/special_functions/digamma.hpp>
+
+// C++ string formatting library
+#include "format.h"
+
+#include "cuckoohash_map.hh"
+#include "Eigen/Dense"
+
+#include "CollapsedEMOptimizer.hpp"
+#include "Transcript.hpp"
+#include "TranscriptGroup.hpp"
+#include "SalmonMath.hpp"
+#include "AlignmentLibrary.hpp"
+#include "ReadPair.hpp"
+#include "UnpairedRead.hpp"
+#include "ReadExperiment.hpp"
+
+
+using BlockedIndexRange =  tbb::blocked_range<size_t>;
+
+// intelligently chosen value adopted from
+// https://github.com/pachterlab/kallisto/blob/master/src/EMAlgorithm.h#L18
+constexpr double minEQClassWeight = std::numeric_limits<double>::denorm_min();
+constexpr double minWeight = std::numeric_limits<double>::denorm_min();
+
+double normalize(std::vector<tbb::atomic<double>>& vec) {
+    double sum{0.0};
+    for (auto& v : vec) {
+        sum += v;
+    }
+
+    // too small!
+    if (sum < minWeight) {
+        return sum;
+    }
+
+    double invSum  = 1.0 / sum;
+    for (auto& v : vec) {
+        v.store(v.load() * invSum);
+    }
+
+    return sum;
+}
+
+/*
+ * Use atomic compare-and-swap to update val to
+ * val + inc.  Update occurs in a loop in case other
+ * threads update in the meantime.
+ */
+void incLoop(tbb::atomic<double>& val, double inc) {
+        double oldMass = val.load();
+        double returnedMass = oldMass;
+        double newMass{oldMass + inc};
+        do {
+            oldMass = returnedMass;
+            newMass = oldMass + inc;
+            returnedMass = val.compare_and_swap(newMass, oldMass);
+        } while (returnedMass != oldMass);
+}
+
+/*
+ * Use the "standard" EM algorithm over equivalence
+ * classes to estimate the latent variables (alphaOut)
+ * given the current estimates (alphaIn).
+ */
+void EMUpdate_(
+        std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
+        std::vector<Transcript>& transcripts,
+        Eigen::VectorXd& effLens,
+        const CollapsedEMOptimizer::VecType& alphaIn,
+        CollapsedEMOptimizer::VecType& alphaOut) {
+
+    assert(alphaIn.size() == alphaOut.size());
+
+    tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(eqVec.size())),
+            [&eqVec, &alphaIn, &alphaOut](const BlockedIndexRange& range) -> void {
+            for (auto eqID : boost::irange(range.begin(), range.end())) {
+            auto& kv = eqVec[eqID];
+
+            uint64_t count = kv.second.count;
+            // for each transcript in this class
+            const TranscriptGroup& tgroup = kv.first;
+            if (tgroup.valid) {
+                const std::vector<uint32_t>& txps = tgroup.txps;
+                const std::vector<double>& auxs = kv.second.weights;
+
+                double denom = 0.0;
+                size_t groupSize = txps.size();
+                // If this is a single-transcript group,
+                // then it gets the full count.  Otherwise,
+                // update according to our VBEM rule.
+                if (BOOST_LIKELY(groupSize > 1)) {
+                    for (size_t i = 0; i < groupSize; ++i) {
+                    auto tid = txps[i];
+                    auto aux = auxs[i];
+                    //double el = effLens(tid);
+                    //if (el <= 0) { el = 1.0; }
+                    double v = alphaIn[tid] * aux;
+                    denom += v;
+                    }
+
+                    if (denom <= ::minEQClassWeight) {
+                        // tgroup.setValid(false);
+                    } else {
+
+                        double invDenom = count / denom;
+                        for (size_t i = 0; i < groupSize; ++i) {
+                            auto tid = txps[i];
+                            auto aux = auxs[i];
+                            double v = alphaIn[tid] * aux;
+                            if (!std::isnan(v)) {
+                                incLoop(alphaOut[tid], v * invDenom);
+                            }
+                        }
+                    }
+                } else {
+                    incLoop(alphaOut[txps.front()], count);
+                }
+            }
+    }
+    });
+
+}
+
+/*
+ * Use the Variational Bayesian EM algorithm over equivalence
+ * classes to estimate the latent variables (alphaOut)
+ * given the current estimates (alphaIn).
+ */
+void VBEMUpdate_(
+        std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
+        std::vector<Transcript>& transcripts,
+        Eigen::VectorXd& effLens,
+        double priorAlpha,
+        double totLen,
+        const CollapsedEMOptimizer::VecType& alphaIn,
+        CollapsedEMOptimizer::VecType& alphaOut,
+	    CollapsedEMOptimizer::VecType& expTheta) {
+
+    assert(alphaIn.size() == alphaOut.size());
+
+    double alphaSum = {0.0};
+    for (auto& e : alphaIn) { alphaSum += e; }
+
+    double logNorm = boost::math::digamma(alphaSum);
+
+    tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcripts.size())),
+            [logNorm, priorAlpha, totLen, &effLens, &alphaIn,
+             &alphaOut, &expTheta]( const BlockedIndexRange& range) -> void {
+
+             double prior = priorAlpha;
+             double priorNorm = prior * totLen;
+
+             for (auto i : boost::irange(range.begin(), range.end())) {
+                if (alphaIn[i] > ::minWeight) {
+                    expTheta[i] = std::exp(boost::math::digamma(alphaIn[i].load()) - logNorm);
+                } else {
+                    expTheta[i] = 0.0;
+                }
+                alphaOut[i] = prior;
+            }
+        });
+
+    tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(eqVec.size())),
+            [&eqVec, &alphaIn,
+             &alphaOut, &expTheta]( const BlockedIndexRange& range) -> void {
+            for (auto eqID : boost::irange(range.begin(), range.end())) {
+            auto& kv = eqVec[eqID];
+
+            uint64_t count = kv.second.count;
+            // for each transcript in this class
+            const TranscriptGroup& tgroup = kv.first;
+            if (tgroup.valid) {
+                const std::vector<uint32_t>& txps = tgroup.txps;
+                const std::vector<double>& auxs = kv.second.weights;
+
+                double denom = 0.0;
+                size_t groupSize = txps.size();
+                // If this is a single-transcript group,
+                // then it gets the full count.  Otherwise,
+                // update according to our VBEM rule.
+                if (BOOST_LIKELY(groupSize > 1)) {
+                    for (size_t i = 0; i < groupSize; ++i) {
+                        auto tid = txps[i];
+                        auto aux = auxs[i];
+                        if (expTheta[tid] > 0.0) {
+                            double v = expTheta[tid] * aux;
+                            denom += v;
+                       }
+                    }
+                    if (denom <= ::minEQClassWeight) {
+                        // tgroup.setValid(false);
+                    } else {
+                        double invDenom = count / denom;
+                        for (size_t i = 0; i < groupSize; ++i) {
+                            auto tid = txps[i];
+                            auto aux = auxs[i];
+                            if (expTheta[tid] > 0.0) {
+                              double v = expTheta[tid] * aux;
+                              incLoop(alphaOut[tid], v * invDenom);
+                            }
+                        }
+                    }
+
+                } else {
+                    incLoop(alphaOut[txps.front()], count);
+                }
+            }
+        }});
+
+}
+
+size_t markDegenerateClasses(
+        std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
+        CollapsedEMOptimizer::VecType& alphaIn,
+        std::shared_ptr<spdlog::logger> jointLog,
+        bool verbose=false) {
+
+    size_t numDropped{0};
+    size_t idx{0};
+    for (auto& kv : eqVec) {
+        uint64_t count = kv.second.count;
+        // for each transcript in this class
+        const TranscriptGroup& tgroup = kv.first;
+        const std::vector<uint32_t>& txps = tgroup.txps;
+        const std::vector<double>& auxs = kv.second.weights;
+
+        double denom = 0.0;
+        for (size_t i = 0; i < txps.size(); ++i) {
+            auto tid = txps[i];
+            auto aux = auxs[i];
+            double v = alphaIn[tid] * aux;
+            if (!std::isnan(v)) {
+                denom += v;
+            } else {
+                std::cerr << "val is NAN; alpha( "
+                          << tid << " ) = " << alphaIn[tid]
+                          << ", aux = " << aux << "\n";
+            }
+        }
+        if (denom <= minEQClassWeight) {
+            fmt::MemoryWriter errstream;
+
+            errstream << "\nDropping weighted eq class\n";
+            errstream << "============================\n";
+
+            errstream << "denom = 0, count = " << count << "\n";
+            errstream << "class = { ";
+            for (auto e : txps) {
+                errstream << e << " ";
+            }
+            errstream << "}\n";
+            errstream << "alphas = { ";
+            for (auto e : txps) {
+                errstream << alphaIn[e] << " ";
+            }
+            errstream << "}\n";
+            errstream << "weights = { ";
+            for (auto e : auxs) {
+                errstream << e << " ";
+            }
+            errstream << "}\n";
+            errstream << "============================\n\n";
+
+            jointLog->info(errstream.str());
+            ++numDropped;
+            kv.first.setValid(false);
+        }
+    }
+    return numDropped;
+}
+
+
+CollapsedEMOptimizer::CollapsedEMOptimizer() {}
+
+template <typename ExpT>
+bool CollapsedEMOptimizer::optimize(ExpT& readExp,
+        SalmonOpts& sopt,
+        double relDiffTolerance,
+        uint32_t maxIter) {
+
+    tbb::task_scheduler_init tbbScheduler(sopt.numThreads);
+    std::vector<Transcript>& transcripts = readExp.transcripts();
+
+    using VecT = CollapsedEMOptimizer::VecType;
+    // With atomics
+    VecType alphas(transcripts.size(), 0.0);
+    VecType alphasPrime(transcripts.size(), 0.0);
+    VecType expTheta(transcripts.size());
+    Eigen::VectorXd effLens(transcripts.size());
+
+    std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec =
+        readExp.equivalenceClassBuilder().eqVec();
+
+    bool useVBEM{sopt.useVBOpt};
+    // If we use VBEM, we'll need the prior parameters
+    double priorAlpha = 0.01;
+
+    auto jointLog = sopt.jointLog;
+
+    double totalNumFrags{static_cast<double>(readExp.numMappedReads())};
+    double totalLen{0.0};
+
+    for (size_t i = 0; i < transcripts.size(); ++i) {
+        double m = transcripts[i].mass(false);
+        if (std::isnan(m)) {
+            std::cerr << "FOUND NAN for txp " << i << "\n";
+        }
+        alphas[i] = (m == salmon::math::LOG_0) ? 0.0 : m;
+        effLens(i) = std::exp(transcripts[i].getCachedEffectiveLength());
+        totalLen += effLens(i);
+    }
+
+    // If the user requested *not* to use "rich" equivalence classes,
+    // then wipe out all of the weight information here and simply replace
+    // the weights with the effective length terms (here, the *inverse* of
+    // the effective length).
+    if (sopt.noRichEqClasses) {
+        tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(eqVec.size())),
+                [&eqVec, &effLens]( const BlockedIndexRange& range) -> void {
+                // For each index in the equivalence class vector
+                for (auto eqID : boost::irange(range.begin(), range.end())) {
+                    // The vector entry
+                    auto& kv = eqVec[eqID];
+                    // The label of the equivalence class
+                    const TranscriptGroup& k = kv.first;
+                    // The size of the label
+                    size_t classSize = k.txps.size();
+                    // The weights of the label
+                    TGValue& v = kv.second;
+
+                    // Iterate over each weight and set it equal to
+                    // 1 / effLen of the corresponding transcript
+                    for (size_t i = 0; i < classSize; ++i) {
+                        double el = effLens(k.txps[i]);
+                        v.weights[i] = (el <= 0.0) ? 1.0 : (1.0 / el);
+                    }
+                }
+        });
+    }
+
+    auto numRemoved = markDegenerateClasses(eqVec, alphas, sopt.jointLog);
+    sopt.jointLog->info("Marked {} weighted equivalence classes as degenerate",
+            numRemoved);
+
+    size_t itNum{0};
+    double minAlpha = 1e-8;
+    double cutoff = (useVBEM) ? (priorAlpha + minAlpha) : minAlpha;
+
+
+    bool converged{false};
+    double maxRelDiff = -std::numeric_limits<double>::max();
+    while (itNum < maxIter and !converged) {
+
+        if (useVBEM) {
+            VBEMUpdate_(eqVec, transcripts, effLens,
+                        priorAlpha, totalLen, alphas, alphasPrime, expTheta);
+        } else {
+            EMUpdate_(eqVec, transcripts, effLens, alphas, alphasPrime);
+        }
+
+        converged = true;
+        maxRelDiff = -std::numeric_limits<double>::max();
+        for (size_t i = 0; i < transcripts.size(); ++i) {
+            if (alphas[i] > cutoff) {
+                double relDiff = std::abs(alphas[i] - alphasPrime[i]) / alphasPrime[i];
+                maxRelDiff = (relDiff > maxRelDiff) ? relDiff : maxRelDiff;
+                if (relDiff > relDiffTolerance) {
+                    converged = false;
+                }
+            }
+            alphas[i] = alphasPrime[i];
+            alphasPrime[i] = 0.0;
+        }
+
+        if (itNum % 100 == 0) {
+            jointLog->info("iteration = {} | max rel diff. = {}",
+                            itNum, maxRelDiff);
+        }
+
+        ++itNum;
+    }
+
+    jointLog->info("iteration = {} | max rel diff. = {}",
+                    itNum, maxRelDiff);
+
+    // Truncate tiny expression values
+    double alphaSum = 0.0;
+
+    alphaSum = 0.0;
+    for (size_t i = 0; i < alphas.size(); ++i) {
+      if (alphas[i] <= cutoff) { alphas[i] = 0.0; }
+      alphaSum += alphas[i];
+    }
+
+
+    if (alphaSum < minWeight) {
+        jointLog->error("Total alpha weight was too small! "
+                        "Make sure you ran salmon correclty.");
+        return false;
+    }
+
+    // Set the mass of each transcript using the
+    // computed alphas.
+    for (size_t i = 0; i < transcripts.size(); ++i) {
+        // Set the mass to the normalized (after truncation)
+        // relative abundance
+        transcripts[i].setMass(alphas[i] / alphaSum);
+    }
+
+    return true;
+}
+
+template
+bool CollapsedEMOptimizer::optimize<ReadExperiment>(ReadExperiment& readExp,
+        SalmonOpts& sopt,
+        double relDiffTolerance,
+        uint32_t maxIter);
+
+template
+bool CollapsedEMOptimizer::optimize<AlignmentLibrary<UnpairedRead>>(
+        AlignmentLibrary<UnpairedRead>& readExp,
+        SalmonOpts& sopt,
+        double relDiffTolerance,
+        uint32_t maxIter);
+
+
+template
+bool CollapsedEMOptimizer::optimize<AlignmentLibrary<ReadPair>>(
+        AlignmentLibrary<ReadPair>& readExp,
+        SalmonOpts& sopt,
+        double relDiffTolerance,
+        uint32_t maxIter);
+
+
+// Unused / old
+
diff --git a/src/CollapsedGibbsSampler.cpp b/src/CollapsedGibbsSampler.cpp
new file mode 100644
index 0000000..48d3567
--- /dev/null
+++ b/src/CollapsedGibbsSampler.cpp
@@ -0,0 +1,345 @@
+#include <vector>
+#include <unordered_map>
+#include <atomic>
+#include <random>
+
+#include "tbb/task_scheduler_init.h"
+#include "tbb/parallel_for.h"
+#include "tbb/parallel_for_each.h"
+#include "tbb/parallel_reduce.h"
+#include "tbb/blocked_range.h"
+#include "tbb/partitioner.h"
+
+//#include "fastapprox.h"
+#include <boost/math/special_functions/digamma.hpp>
+#include <boost/filesystem.hpp>
+
+// C++ string formatting library
+#include "format.h"
+
+#include "cuckoohash_map.hh"
+#include "Eigen/Dense"
+
+#include "CollapsedGibbsSampler.hpp"
+#include "Transcript.hpp"
+#include "TranscriptGroup.hpp"
+#include "SalmonMath.hpp"
+#include "AlignmentLibrary.hpp"
+#include "ReadPair.hpp"
+#include "UnpairedRead.hpp"
+#include "ReadExperiment.hpp"
+#include "MultinomialSampler.hpp"
+
+using BlockedIndexRange =  tbb::blocked_range<size_t>;
+
+// intelligently chosen value adopted from
+// https://github.com/pachterlab/kallisto/blob/master/src/EMAlgorithm.h#L18
+constexpr double minEQClassWeight = std::numeric_limits<double>::denorm_min();
+constexpr double minWeight = std::numeric_limits<double>::denorm_min();
+
+void initCountMap_(
+        std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
+	std::vector<Transcript>& transcriptsIn,
+	double priorAlpha,
+        MultinomialSampler& msamp,
+        std::vector<int>& countMap,
+        std::vector<double>& probMap,
+	std::vector<int>& txpCounts) {
+
+    size_t offset{0};
+    for (auto& eqClass : eqVec) {
+        uint64_t classCount = eqClass.second.count;
+
+        // for each transcript in this class
+        const TranscriptGroup& tgroup = eqClass.first;
+        const size_t groupSize = tgroup.txps.size();
+        if (tgroup.valid) {
+            const std::vector<uint32_t>& txps = tgroup.txps;
+            const std::vector<double>& auxs = eqClass.second.weights;
+
+            double denom = 0.0;
+            if (BOOST_LIKELY(groupSize > 1)) {
+
+                for (size_t i = 0; i < groupSize; ++i) {
+                    auto tid = txps[i];
+                    auto aux = auxs[i];
+                    denom += (priorAlpha + transcriptsIn[tid].mass(false)) * aux;
+                    countMap[offset + i] = 0;
+                }
+
+		if (denom > ::minEQClassWeight) {
+	   	   // Get the multinomial probabilities
+		   double norm = 1.0 / denom;
+		   for (size_t i = 0; i < groupSize; ++i) {
+		     auto tid = txps[i];
+		     auto aux = auxs[i];
+		     probMap[offset + i] = norm *
+                        ((priorAlpha + transcriptsIn[tid].mass(false)) * aux);
+		    }
+
+	   	    // re-sample
+	            msamp(countMap.begin() + offset,
+                      classCount,
+                      groupSize,
+                      probMap.begin() + offset);
+		}
+            } else {
+                countMap[offset] = classCount;
+            }
+
+
+            for (size_t i = 0; i < groupSize; ++i) {
+                auto tid = txps[i];
+                txpCounts[tid] += countMap[offset + i];
+            }
+
+            offset += groupSize;
+       } // valid group
+    } // loop over all eq classes
+}
+
+void sampleRound_(
+        std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
+        std::vector<int>& countMap,
+        std::vector<double>& probMap,
+        double priorAlpha,
+        std::vector<int>& txpCount,
+        MultinomialSampler& msamp) {
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<> dis(0.25, 0.75);
+    size_t offset{0};
+    // Choose a fraction of this class to re-sample
+
+    // The count substracted from each transcript
+    std::vector<int> txpResamp;
+
+    for (auto& eqClass : eqVec) {
+        uint64_t classCount = eqClass.second.count;
+        double sampleFrac = dis(gen);
+
+        // for each transcript in this class
+        const TranscriptGroup& tgroup = eqClass.first;
+        const size_t groupSize = tgroup.txps.size();
+        if (tgroup.valid) {
+            const std::vector<uint32_t>& txps = tgroup.txps;
+            const std::vector<double>& auxs = eqClass.second.weights;
+
+            double denom = 0.0;
+            // If this is a single-transcript group,
+            // then it gets the full count --- otherwise,
+	    // sample!
+            if (BOOST_LIKELY(groupSize > 1)) {
+
+                // Subtract some fraction of the current equivalence
+                // class' contribution from each transcript.
+		uint64_t numResampled{0};
+		if (groupSize > txpResamp.size()) {
+			txpResamp.resize(groupSize, 0);
+		}
+
+		// For each transcript in the group
+                for (size_t i = 0; i < groupSize; ++i) {
+                    auto tid = txps[i];
+                    auto aux = auxs[i];
+                    auto currCount = countMap[offset + i];
+		    uint64_t currResamp = std::round(sampleFrac * currCount);
+		    numResampled += currResamp;
+		    txpResamp[i] = currResamp;
+                    txpCount[tid] -= currResamp;
+                    countMap[offset + i] -= currResamp;
+                    denom += (priorAlpha + txpCount[tid]) * aux;
+                }
+
+		if (denom > ::minEQClassWeight) {
+			// Get the multinomial probabilities
+			double norm = 1.0 / denom;
+			for (size_t i = 0; i < groupSize; ++i) {
+			    auto tid = txps[i];
+			    auto aux = auxs[i];
+			    probMap[offset + i] = norm * ((priorAlpha + txpCount[tid]) * aux);
+			}
+
+			// re-sample
+			msamp(txpResamp.begin(),        // count array to fill in
+			      numResampled,		// multinomial n
+			      groupSize,		// multinomial k
+			      probMap.begin() + offset  // where to find multinomial probs
+			      );
+
+			for (size_t i = 0; i < groupSize; ++i) {
+				auto tid = txps[i];
+				countMap[offset + i] += txpResamp[i];
+				txpCount[tid] += txpResamp[i];
+			}
+
+		} else { // We didn't sample
+			// add back to txp-count!
+			for (size_t i = 0; i < groupSize; ++i) {
+			    auto tid = txps[i];
+			    txpCount[tid] += txpResamp[i];
+			    countMap[offset + i] += txpResamp[i];
+			}
+		}
+            }
+
+            offset += groupSize;
+        } // valid group
+    } // loop over all eq classes
+
+}
+
+CollapsedGibbsSampler::CollapsedGibbsSampler() {}
+
+class DistStats {
+	public:
+	DistStats() : meanVal(0.0), minVal(std::numeric_limits<double>::max()), maxVal(0.0) {}
+	double meanVal;
+	double minVal;
+	double maxVal;
+};
+
+template <typename ExpT>
+bool CollapsedGibbsSampler::sample(ExpT& readExp,
+        SalmonOpts& sopt,
+        uint32_t numSamples) {
+
+    namespace bfs = boost::filesystem;
+    tbb::task_scheduler_init tbbScheduler(sopt.numThreads);
+    std::vector<Transcript>& transcripts = readExp.transcripts();
+
+    std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec =
+        readExp.equivalenceClassBuilder().eqVec();
+
+    using VecT = CollapsedGibbsSampler::VecType;
+
+    std::vector<std::vector<int>> allSamples(numSamples,
+                                        std::vector<int>(transcripts.size(),0));
+    double priorAlpha = 1e-8;
+    auto numMappedReads = readExp.numMappedReads();
+
+
+    for (auto& txp : transcripts) {
+        txp.setMass(priorAlpha + (txp.mass(false) * numMappedReads));
+    }
+
+    tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(numSamples)),
+                [&eqVec, &transcripts, priorAlpha,
+                 &allSamples]( const BlockedIndexRange& range) -> void {
+
+
+                std::random_device rd;
+                MultinomialSampler ms(rd);
+
+                size_t countMapSize{0};
+                for (size_t i = 0; i < eqVec.size(); ++i) {
+                    if (eqVec[i].first.valid) {
+                        countMapSize += eqVec[i].first.txps.size();
+                    }
+                }
+
+                std::vector<int> countMap(countMapSize, 0);
+                std::vector<double> probMap(countMapSize, 0.0);
+
+                initCountMap_(eqVec, transcripts, priorAlpha, ms, countMap, probMap, allSamples[range.begin()]);
+
+                // For each sample this thread should generate
+                bool isFirstSample{true};
+		bool numInternalRounds = 10;
+                for (auto sampleID : boost::irange(range.begin(), range.end())) {
+                    if (sampleID % 100 == 0) {
+                        std::cerr << "gibbs sampling " << sampleID << "\n";
+                    }
+                    if (!isFirstSample) {
+                        // the counts start at what they were last round.
+                        allSamples[sampleID] = allSamples[sampleID-1];
+                    }
+		    for (size_t i = 0; i < numInternalRounds; ++i){
+			    sampleRound_(eqVec, countMap, probMap, priorAlpha,
+					 allSamples[sampleID], ms);
+		    }
+                    isFirstSample = false;
+                }
+    });
+
+    auto numTranscripts = transcripts.size();
+    std::vector<DistStats> ds(numTranscripts);
+
+    // get posterior means
+    tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(numTranscripts)),
+                [&allSamples, &transcripts, &ds, numMappedReads,
+                 numSamples]( const BlockedIndexRange& range) -> void {
+
+                // For each sample this thread should generate
+                for (auto tid : boost::irange(range.begin(), range.end())) {
+                    double meanNumReads = {0.0};
+                    for (size_t i = 0; i < numSamples; ++i) {
+                      auto val = allSamples[i][tid];
+                      if (val < ds[tid].minVal) { ds[tid].minVal = val; }
+                      if (val > ds[tid].maxVal) { ds[tid].maxVal = val; }
+                      meanNumReads += (1.0 / numSamples) * val;
+                    }
+		    ds[tid].meanVal = meanNumReads;
+                    transcripts[tid].setMass(ds[tid].meanVal);
+                }
+    });
+
+    bfs::path gibbsSampleFile = sopt.outputDirectory / "samples.txt";
+    sopt.jointLog->info("Writing posterior samples to {}", gibbsSampleFile.string());
+
+    std::ofstream statStream(gibbsSampleFile.string());
+    statStream << "# txpName\tsample_1\tsample_2\t...\tsample_n\n";
+
+    for (size_t i = 0; i < numTranscripts; ++i) {
+	    statStream << transcripts[i].RefName;
+        for (size_t s = 0; s < allSamples.size(); ++s) {
+            statStream << '\t' << allSamples[s][i];
+            /*
+		   << ds[i].meanVal << '\t'
+		   << ds[i].minVal << '\t'
+		   << ds[i].maxVal << '\n';
+           */
+        }
+        statStream << '\n';
+    }
+    statStream.close();
+    sopt.jointLog->info("done writing posterior samples");
+
+    double cutoff = priorAlpha + 1e-8;
+    // Truncate tiny expression values
+    double txpSumTrunc = 0.0;
+    for (size_t i = 0; i < transcripts.size(); ++i) {
+	// maybe use the first decile instead of the mean for the cutoff;
+	// this could let too much through
+        if (transcripts[i].mass(false) <= cutoff) { transcripts[i].setMass(0.0); }
+        txpSumTrunc += transcripts[i].mass(false);
+    }
+
+    for (size_t i = 0; i < transcripts.size(); ++i) {
+        // Set the mass to the normalized (after truncation)
+        // relative abundance
+        transcripts[i].setMass(transcripts[i].mass(false) / txpSumTrunc);
+    }
+
+    return true;
+}
+
+template
+bool CollapsedGibbsSampler::sample<ReadExperiment>(ReadExperiment& readExp,
+        SalmonOpts& sopt,
+        uint32_t maxIter);
+
+template
+bool CollapsedGibbsSampler::sample<AlignmentLibrary<UnpairedRead>>(
+        AlignmentLibrary<UnpairedRead>& readExp,
+        SalmonOpts& sopt,
+        uint32_t maxIter);
+
+
+template
+bool CollapsedGibbsSampler::sample<AlignmentLibrary<ReadPair>>(
+        AlignmentLibrary<ReadPair>& readExp,
+        SalmonOpts& sopt,
+        uint32_t maxIter);
+
diff --git a/src/ComputeBiasFeatures.cpp b/src/ComputeBiasFeatures.cpp
new file mode 100644
index 0000000..175587f
--- /dev/null
+++ b/src/ComputeBiasFeatures.cpp
@@ -0,0 +1,225 @@
+
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#include <boost/thread/thread.hpp>
+
+#include <iostream>
+#include <vector>
+#include <array>
+#include <atomic>
+#include <thread>
+
+#include "jellyfish/stream_manager.hpp"
+#include "jellyfish/whole_sequence_parser.hpp"
+#include "jellyfish/mer_dna.hpp"
+
+#include "tbb/concurrent_queue.h"
+
+#include <boost/range/irange.hpp>
+#include <boost/filesystem.hpp>
+
+#include "CommonTypes.hpp"
+
+// holding 2-mers as a uint64_t is a waste of space,
+// but using Jellyfish makes life so much easier, so
+// we'll live with it for now.
+using Kmer = uint64_t;
+using Sailfish::TranscriptFeatures;
+namespace bfs = boost::filesystem;
+
+template <typename ParserT>
+bool computeBiasFeaturesHelper(ParserT& parser,
+                               tbb::concurrent_bounded_queue<TranscriptFeatures>& featQueue,
+                               size_t& numComplete, size_t numThreads) {
+
+    using stream_manager = jellyfish::stream_manager<char**>;
+    using sequence_parser = jellyfish::whole_sequence_parser<stream_manager>;
+
+    size_t merLen = 2;
+    Kmer lshift(2 * (merLen - 1));
+    Kmer masq((1UL << (2 * merLen)) - 1);
+    std::atomic<size_t> readNum{0};
+
+    size_t numActors = numThreads;
+    std::vector<std::thread> threads;
+    auto tstart = std::chrono::steady_clock::now();
+
+    for (auto i : boost::irange(size_t{0}, numActors)) {
+        threads.push_back(std::thread(
+	        [&featQueue, &numComplete, &parser, &readNum, &tstart, lshift, masq, merLen, numActors]() -> void {
+
+                size_t cmlen, numKmers;
+                jellyfish::mer_dna_ns::mer_base_dynamic<uint64_t> kmer(merLen);
+
+                // while there are transcripts left to process
+                while (true) { //producer.nextRead(s)) {
+                    sequence_parser::job j(parser);
+                    // If this job is empty, then we're done
+                    if (j.is_empty()) { return; }
+
+                    for (size_t i=0; i < j->nb_filled; ++i) {
+                        ++readNum;
+                        if (readNum % 100 == 0) {
+                            auto tend = std::chrono::steady_clock::now();
+                            auto sec = std::chrono::duration_cast<std::chrono::seconds>(tend-tstart);
+                            auto nsec = sec.count();
+                            auto rate = (nsec > 0) ? readNum / sec.count() : 0;
+                            std::cerr << "processed " << readNum << " transcripts (" << rate << ") transcripts/s\r\r";
+                        }
+
+                        // we iterate over the entire read
+                        const char* start     = j->data[i].seq.c_str();
+                        uint32_t readLen      = j->data[i].seq.size();
+                        const char* const end = start + readLen;
+
+                        TranscriptFeatures tfeat{};
+
+                        // reset all of the counts
+                        numKmers = 0;
+                        cmlen = 0;
+                        kmer.polyA();
+
+                        // the maximum number of kmers we'd have to store
+                        uint32_t maxNumKmers = (readLen >= merLen) ? readLen - merLen + 1 : 0;
+                        if (maxNumKmers == 0) { featQueue.push(tfeat); continue; }
+
+                        // The transcript name
+                        std::string fullHeader(j->data[i].header);
+                        tfeat.name = fullHeader.substr(0, fullHeader.find(' '));
+                        tfeat.length = readLen;
+                        auto nfact = 1.0 / readLen;
+
+                        // iterate over the read base-by-base
+                        size_t offset{0};
+                        size_t numChars{j->data[i].seq.size()};
+                        while (offset < numChars) {
+                            auto c = jellyfish::mer_dna::code(j->data[i].seq[offset]);
+                            kmer.shift_left(c);
+                            if (jellyfish::mer_dna::not_dna(c)) {
+                                cmlen = 0;
+                                ++offset;
+                                continue;
+                            }
+                            if (++cmlen >= merLen) {
+                                size_t twomer = kmer.get_bits(0, 2*merLen);
+                                tfeat.diNucleotides[twomer]++;
+                                switch(c) {
+                                    case jellyfish::mer_dna::CODE_G:
+                                    case jellyfish::mer_dna::CODE_C:
+                                        tfeat.gcContent += nfact;
+                                        break;
+                                }
+                            }
+                            ++offset;
+                        } // end while
+
+                        char lastBase = j->data[i].seq.back();
+                        auto c = jellyfish::mer_dna::code(lastBase);
+                        switch(c) {
+                            case jellyfish::mer_dna::CODE_G:
+                            case jellyfish::mer_dna::CODE_C:
+                                tfeat.gcContent += nfact;
+                                break;
+                        }
+
+                        featQueue.push(tfeat);
+                    } // end job
+                } // end while(true)
+            } // end lambda
+            ));
+
+        } // actor loop
+
+        for (auto& t : threads) { t.join(); ++numComplete; }
+        return true;
+}
+
+int computeBiasFeatures(
+    std::vector<std::string>& transcriptFiles,
+    bfs::path outFilePath,
+    bool useStreamingParser,
+    size_t numThreads) {
+
+    using std::string;
+    using std::vector;
+    using std::cerr;
+
+    size_t numActors = numThreads;
+    size_t numComplete = 0;
+    tbb::concurrent_bounded_queue<TranscriptFeatures> featQueue;
+
+    std::ofstream ofile(outFilePath.string());
+
+    auto outputThread = std::thread(
+         [&ofile, &numComplete, &featQueue, numActors]() -> void {
+             TranscriptFeatures tf{};
+             while( numComplete < numActors or !featQueue.empty() ) {
+                 while(featQueue.try_pop(tf)) {
+                     ofile << tf.name << '\t';
+                     ofile << tf.length << '\t';
+                     ofile << tf.gcContent << '\t';
+                     for (auto i : boost::irange(size_t{0}, tf.diNucleotides.size())) {
+                         ofile << tf.diNucleotides[i];
+                         char end = (i == tf.diNucleotides.size() - 1) ? '\n' : '\t';
+                         ofile << end;
+                     }
+                 }
+                 boost::this_thread::sleep_for(boost::chrono::milliseconds(100));
+
+             }
+             ofile.close();
+         });
+
+    for( auto rf : transcriptFiles) {
+        std::cerr << "readFile: " << rf << ", ";
+    }
+    std::cerr << "\n";
+
+    for (auto& readFile : transcriptFiles) {
+        std::cerr << "file " << readFile << ": \n";
+
+        //namespace bfs = boost::filesystem;
+        //bfs::path filePath(readFile);
+
+        char* pc = new char[readFile.size() + 1];
+        std::strcpy(pc, readFile.c_str());
+        char* fnames[] = {pc};
+
+        // Create a jellyfish parser
+        const int concurrentFile{1};
+
+        using stream_manager = jellyfish::stream_manager<char**>;
+        using sequence_parser = jellyfish::whole_sequence_parser<stream_manager>;
+        stream_manager streams(fnames, fnames + 1, concurrentFile);
+
+        size_t maxReadGroupSize{100};
+        sequence_parser parser(1*numActors, maxReadGroupSize, concurrentFile, streams);
+        computeBiasFeaturesHelper<sequence_parser>(
+                parser, featQueue, numComplete, numActors);
+        delete pc;
+    }
+
+    std::cerr << "\n";
+    outputThread.join();
+    return 0;
+}
diff --git a/src/ErrorModel.cpp b/src/ErrorModel.cpp
new file mode 100644
index 0000000..dc12042
--- /dev/null
+++ b/src/ErrorModel.cpp
@@ -0,0 +1,234 @@
+#include <iostream>
+#include <cstdio>
+
+#include <boost/config.hpp> // for BOOST_LIKELY/BOOST_UNLIKELY
+
+#include "ErrorModel.hpp"
+#include "Transcript.hpp"
+#include "SalmonMath.hpp"
+#include "SalmonStringUtils.hpp"
+#include "UnpairedRead.hpp"
+#include "ReadPair.hpp"
+
+ErrorModel::ErrorModel(double alpha, uint32_t maxExpectedReadLen) :
+    maxExpectedLen_(maxExpectedReadLen),
+    mismatchLeft_(maxExpectedReadLen, AtomicMatrix<double>(16, 4, alpha)),
+    mismatchRight_(maxExpectedReadLen, AtomicMatrix<double>(16, 4, alpha)),
+    isEnabled_(true),
+    maxLen_(0),
+    burnedIn_(false) {}
+
+bool ErrorModel::burnedIn() { return burnedIn_; }
+void ErrorModel::burnedIn(bool burnedIn) { burnedIn_ = burnedIn; }
+
+double ErrorModel::logLikelihood(bam_seq_t* read, Transcript& ref,
+                                 std::vector<AtomicMatrix<double>>& mismatchProfile){
+    using namespace salmon::stringtools;
+    bool useQual{false};
+    size_t readIdx{0};
+    auto transcriptIdx = bam_pos(read);
+    size_t transcriptLen = ref.RefLength;
+    // if the read starts before the beginning of the transcript,
+    // only consider the part overlapping the transcript
+    if (transcriptIdx < 0) {
+        readIdx = -transcriptIdx;
+        transcriptIdx = 0;
+    }
+
+    // unsigned version of transcriptIdx
+    size_t uTranscriptIdx = static_cast<size_t>(transcriptIdx);
+
+    if (uTranscriptIdx >= transcriptLen) {
+        std::lock_guard<std::mutex> l(outputMutex_);
+        std::cerr << "transcript index = " << uTranscriptIdx << ", transcript length = " << transcriptLen << "\n";
+        return salmon::math::LOG_0;
+    }
+
+    //salmon::stringtools::strand readStrand = (BAM_FREVERSE & read->core.flag) ? salmon::stringtools::strand::reverse :
+    //                                            salmon::stringtools::strand::forward;
+    salmon::stringtools::strand readStrand = salmon::stringtools::strand::forward;
+    double logLike = salmon::math::LOG_1;
+
+    uint8_t* qseq = reinterpret_cast<uint8_t*>(bam_seq(read));//bam1_seq(read);
+    auto qualStr = reinterpret_cast<uint8_t*>(bam_qual(read));//bam1_qual(read);
+    size_t readLen = static_cast<size_t>(bam_seq_len(read));
+    size_t basesChecked = 0;
+    size_t numInc = std::min(transcriptLen - uTranscriptIdx, readLen - readIdx);
+    while (basesChecked < numInc) {
+        size_t curReadBase = samToTwoBit[bam_seqi(qseq, readIdx)];
+        size_t prevReadBase = (readIdx > 0) ? samToTwoBit[bam_seqi(qseq, readIdx-1)] : 0;
+        size_t refBase = samToTwoBit[ref.baseAt(uTranscriptIdx, readStrand)];
+        size_t index = prevReadBase + refBase;
+        int qval = qualStr[readIdx];
+        double qual = (useQual) ? salmon::stringtools::phredToLogProb[qval] : salmon::math::LOG_1;
+        logLike += mismatchProfile[readIdx](index, curReadBase) + qual;
+        ++readIdx;
+        ++uTranscriptIdx;
+        ++basesChecked;
+    }
+    return logLike;
+}
+
+double ErrorModel::logLikelihood(const ReadPair& hit, Transcript& ref){
+    double logLike = salmon::math::LOG_1;
+    if (BOOST_UNLIKELY(!isEnabled_)) { return logLike; }
+
+    if (!hit.isPaired()) {
+        if (hit.isLeftOrphan()) {
+            return logLikelihood(hit.read1, ref, mismatchLeft_);
+        } else {
+            return logLikelihood(hit.read1, ref, mismatchRight_);
+        }
+    }
+
+    bam_seq_t* leftRead = (bam_pos(hit.read1) < bam_pos(hit.read2)) ? hit.read1 : hit.read2;
+    bam_seq_t* rightRead = (bam_pos(hit.read1) < bam_pos(hit.read2)) ? hit.read2 : hit.read1;
+
+    size_t leftLen = static_cast<size_t>(bam_seq_len(leftRead));
+    size_t rightLen = static_cast<size_t>(bam_seq_len(rightRead));
+
+    // NOTE: Raise a warning in this case?
+    if (BOOST_UNLIKELY((leftLen > maxExpectedLen_) or
+                       (rightLen > maxExpectedLen_))) {
+        return logLike;
+    }
+
+    if (leftRead) {
+        logLike += logLikelihood(leftRead, ref, mismatchLeft_);
+    }
+
+    if (rightRead) {
+        logLike += logLikelihood(rightRead, ref, mismatchRight_);
+    }
+    if (logLike == salmon::math::LOG_0) {
+            std::lock_guard<std::mutex> lock(outputMutex_);
+            std::cerr << "orphan status: " << hit.orphanStatus << "\n";
+            std::cerr << "error likelihood: " << logLike << "\n";
+    }
+
+    return logLike;
+}
+
+double ErrorModel::logLikelihood(const UnpairedRead& hit, Transcript& ref){
+    double logLike = salmon::math::LOG_1;
+    if (BOOST_UNLIKELY(!isEnabled_)) { return logLike; }
+
+    bam_seq_t* read = hit.read;
+    size_t readLen = static_cast<size_t>(bam_seq_len(read));
+    // NOTE: Raise a warning in this case?
+    if (BOOST_UNLIKELY(readLen > maxExpectedLen_)) {
+        return logLike;
+    }
+    logLike += logLikelihood(read, ref, mismatchLeft_);
+
+    if (logLike == salmon::math::LOG_0) {
+            std::lock_guard<std::mutex> lock(outputMutex_);
+            std::cerr << "error log likelihood: " << logLike << "\n";
+    }
+
+    return logLike;
+}
+
+void ErrorModel::update(const UnpairedRead& hit, Transcript& ref, double p, double mass){
+    if (mass == salmon::math::LOG_0) { return; }
+    if (BOOST_UNLIKELY(!isEnabled_)) { return; }
+    bam_seq_t* leftRead = hit.read;
+    update(leftRead, ref, p, mass, mismatchLeft_);
+}
+
+void ErrorModel::update(bam_seq_t* read, Transcript& ref, double p, double mass,
+                        std::vector<AtomicMatrix<double>>& mismatchProfile) {
+    using namespace salmon::stringtools;
+    bool useQual{false};
+    size_t readIdx{0};
+    auto transcriptIdx = bam_pos(read);
+    size_t transcriptLen = ref.RefLength;
+    // if the read starts before the beginning of the transcript,
+    // only consider the part overlapping the transcript
+    if (transcriptIdx < 0) {
+        readIdx = -transcriptIdx;
+        transcriptIdx = 0;
+    }
+    // unsigned version of transcriptIdx
+    size_t uTranscriptIdx = static_cast<size_t>(transcriptIdx);
+
+    // Only attempt to update the model if the read overlaps the transcript.
+    if (uTranscriptIdx < transcriptLen) {
+        salmon::stringtools::strand readStrand = salmon::stringtools::strand::forward;
+
+        uint32_t numMismatch{0};
+        uint8_t* qseq = reinterpret_cast<uint8_t*>(bam_seq(read));
+        uint8_t* qualStr = reinterpret_cast<uint8_t*>(bam_qual(read));
+        size_t readLen = static_cast<size_t>(bam_seq_len(read));
+        size_t basesChecked = 0;
+        size_t numInc = std::min(transcriptLen - uTranscriptIdx, readLen - readIdx);
+        while (basesChecked < numInc) {
+            size_t curReadBase = samToTwoBit[bam_seqi(qseq, readIdx)];
+            size_t prevReadBase = (readIdx > 0) ? samToTwoBit[bam_seqi(qseq, readIdx-1)] : 0;
+            size_t refBase = samToTwoBit[ref.baseAt(uTranscriptIdx, readStrand)];
+            size_t index = prevReadBase + refBase;
+            if (curReadBase != refBase) { ++numMismatch; }
+            int qval = qualStr[readIdx];
+            double qual = (useQual) ? salmon::stringtools::phredToLogProb[qval] : salmon::math::LOG_1;
+            mismatchProfile[readIdx].increment(index, curReadBase, mass+p+qual);
+            ++readIdx;
+            ++uTranscriptIdx;
+            ++basesChecked;
+        }
+
+        /*
+        // DEBUG: print number of mismatches
+        if (numMismatch > 3) {
+            std::lock_guard<std::mutex> lock(outputMutex_);
+            std::cerr << "read of length " << readLen << " had " << numMismatch
+                      << " mismatches; dir is " << ((readStrand == strand::forward) ? "forward" : "reverse") << "\n";
+           int32_t transcriptStartIdx = bam_pos(read);
+           size_t rstart = (transcriptStartIdx < 0) ? -transcriptStartIdx : 0;
+           if (transcriptStartIdx < 0) { transcriptStartIdx = 0; }
+           if (op == BAM_CSOFT_CLIP) {
+               rstart += opLen;
+           }
+           size_t numInc = std::min(transcriptLen - transcriptStartIdx, readLen - rstart);
+           int32_t rinc{1};
+            std::cerr << "read name is " << bam_name(read) << "\n";
+            std::cerr << "readStart = " << rstart << "\n";
+            std::cerr << "txp start = " << transcriptStartIdx << " (txp len = " << ref.RefLength << "), (txp name = " << ref.RefName << ")\n";
+            std::cerr << "read is: ";
+            int32_t rp = rstart;
+            for (int32_t ri = 0; ri < numInc; ri++, rp++) {
+                std::cerr << samCodeToChar[bam_seqi(qseq, rp)];
+            } std::cerr << "\n";
+            std::cerr << "ref is:  ";
+
+            int32_t tp = transcriptStartIdx;
+            for (int32_t ri = 0; ri < numInc; ri++, tp+=rinc) {
+                std::cerr << samCodeToChar[ref.baseAt(tp, readStrand)];
+            } std::cerr << "\n";
+        }
+        */
+
+        maxLen_ = std::max(maxLen_, readLen);
+        if (BOOST_UNLIKELY(maxLen_ > maxExpectedLen_)) {
+            std::lock_guard<std::mutex> lock(outputMutex_);
+            std::cerr << "Encountered read longer than maximum expected length of "
+                << maxExpectedLen_ << ", not applying error model\n";
+            isEnabled_ = false;
+        }
+    }
+}
+
+void ErrorModel::update(const ReadPair& hit, Transcript& ref, double p, double mass){
+    if (mass == salmon::math::LOG_0) { return; }
+    if (BOOST_UNLIKELY(!isEnabled_)) { return; }
+
+    bam_seq_t* leftRead = (bam_pos(hit.read1) < bam_pos(hit.read2)) ? hit.read1 : hit.read2;
+    bam_seq_t* rightRead = (bam_pos(hit.read1) < bam_pos(hit.read2)) ? hit.read2 : hit.read1;
+
+    if (leftRead) {
+        update(leftRead, ref, p, mass, mismatchLeft_);
+    }
+
+    if (rightRead) {
+        update(rightRead, ref, p, mass, mismatchRight_);
+    }
+}
diff --git a/src/FASTAParser.cpp b/src/FASTAParser.cpp
new file mode 100644
index 0000000..0da7e60
--- /dev/null
+++ b/src/FASTAParser.cpp
@@ -0,0 +1,54 @@
+#include <unistd.h>
+#include <cstdio>
+#include <iostream>
+#include <unordered_map>
+
+
+#include "jellyfish/mer_dna.hpp"
+#include "jellyfish/stream_manager.hpp"
+#include "jellyfish/whole_sequence_parser.hpp"
+
+#include "FASTAParser.hpp"
+#include "Transcript.hpp"
+#include "SalmonStringUtils.hpp"
+
+FASTAParser::FASTAParser(const std::string& fname): fname_(fname) {}
+
+void FASTAParser::populateTargets(std::vector<Transcript>& refs) {
+    using stream_manager = jellyfish::stream_manager<std::vector<std::string>::const_iterator>;
+    using single_parser = jellyfish::whole_sequence_parser<stream_manager>;
+
+    using std::string;
+    using std::unordered_map;
+
+    unordered_map<string, size_t> nameToID;
+    for (auto& ref : refs) {
+	nameToID[ref.RefName] = ref.id;
+    }
+
+    std::vector<std::string> readFiles{fname_};
+    size_t maxReadGroup{1000}; // Number of files to read simultaneously
+    size_t concurrentFile{1}; // Number of reads in each "job"
+    stream_manager streams(readFiles.cbegin(), readFiles.cend(), concurrentFile);
+    single_parser parser(4, maxReadGroup, concurrentFile, streams);
+
+    while(true) {
+        typename single_parser::job j(parser); // Get a job from the parser: a bunch of read (at most max_read_group)
+        if(j.is_empty()) break;           // If got nothing, quit
+
+        for(size_t i = 0; i < j->nb_filled; ++i) { // For all the read we got
+            std::string& header = j->data[i].header;
+            std::string name = header.substr(0, header.find(' '));
+
+            auto it = nameToID.find(name);
+            if (it == nameToID.end()) {
+                std::cerr << "WARNING: Transcript " << name << " appears in the reference but did not appear in the BAM\n";
+            } else {
+                std::string& seq = j->data[i].seq;
+                refs[it->second].Sequence = salmon::stringtools::encodeSequenceInSAM(seq.c_str(), seq.size());
+            }
+        }
+    }
+
+}
+
diff --git a/src/FragmentLengthDistribution.cpp b/src/FragmentLengthDistribution.cpp
new file mode 100644
index 0000000..cb2cfc3
--- /dev/null
+++ b/src/FragmentLengthDistribution.cpp
@@ -0,0 +1,208 @@
+/**
+ *  lengthdistribution.cpp
+ *  express
+ *
+ *  Created by Adam Roberts on 1/30/13.
+ *  Copyright 2013 Adam Roberts. All rights reserved.
+ */
+
+#include "FragmentLengthDistribution.hpp"
+#include "SalmonMath.hpp"
+#include <numeric>
+#include <cassert>
+#include <boost/assign.hpp>
+#include <iostream>
+#include <fstream>
+#include <boost/math/distributions/binomial.hpp>
+#include <boost/math/distributions/normal.hpp>
+
+using namespace std;
+
+FragmentLengthDistribution::FragmentLengthDistribution(double alpha, size_t max_val,
+                                       size_t prior_mu, size_t prior_sigma,
+                                       size_t kernel_n, double kernel_p,
+                                       size_t bin_size)
+    : hist_(max_val/bin_size+1),
+      cachedCMF_(hist_.size()),
+      haveCachedCMF_(false),
+      totMass_(salmon::math::LOG_0),
+      sum_(salmon::math::LOG_0),
+      min_(max_val/bin_size),
+      binSize_(bin_size) {
+
+    using salmon::math::logAdd;
+  max_val = max_val/bin_size;
+  kernel_n = kernel_n/bin_size;
+  assert(kernel_n % 2 == 0);
+
+  double tot = log(alpha);
+
+  // Set to prior distribution
+  if (prior_mu) {
+    boost::math::normal norm(prior_mu/bin_size,
+                             prior_sigma/(bin_size*bin_size));
+
+    for (size_t i = 0; i <= max_val; ++i) {
+      double norm_mass = boost::math::cdf(norm, i+0.5) -
+                         boost::math::cdf(norm, i-0.5);
+      double mass = salmon::math::LOG_EPSILON;
+      if (norm_mass != 0) {
+        mass = tot + log(norm_mass);
+      }
+      hist_[i].compare_and_swap(mass, hist_[i]);
+      sum_.compare_and_swap(logAdd(sum_, log((double)i)+mass), sum_);
+      totMass_.compare_and_swap(logAdd(totMass_, mass), totMass_);
+    }
+  } else {
+      hist_ = vector<tbb::atomic<double>>(max_val + 1, tot - log((double)max_val));
+      hist_[0].compare_and_swap(salmon::math::LOG_0, hist_[0]);
+      sum_.compare_and_swap(hist_[1] + log((double)(max_val * (max_val + 1))) - log(2.), sum_);
+      totMass_ = tot;
+  }
+
+  // Define kernel
+  boost::math::binomial_distribution<double> binom(kernel_n, kernel_p);
+  kernel_ = vector<double>(kernel_n + 1);
+  for (size_t i = 0; i <= kernel_n; i++) {
+    kernel_[i] = log(boost::math::pdf(binom, i));
+  }
+}
+
+size_t FragmentLengthDistribution::maxVal() const {
+  return (hist_.size()-1) * binSize_;
+}
+
+size_t FragmentLengthDistribution::minVal() const {
+  if (min_ == hist_.size() - 1) {
+    return 1;
+  }
+  return min_;
+}
+
+void FragmentLengthDistribution::addVal(size_t len, double mass) {
+    using salmon::math::logAdd;
+    //assert(!isnan(mass));
+    //assert(kernel_.size());
+
+//  len /= binSize_;
+
+  if (len > maxVal()) {
+      len = maxVal();
+  }
+  if (len < min_) {
+    min_ = len;
+  }
+
+  size_t offset = len - kernel_.size()/2;
+
+  for (size_t i = 0; i < kernel_.size(); i++) {
+    if (offset > 0 && offset < hist_.size()) {
+      double kMass = mass + kernel_[i];
+      double oldVal = hist_[offset];
+      double retVal = oldVal;
+      double newVal = 0.0;
+      do {
+          oldVal = retVal;
+          newVal = logAdd(oldVal, kMass);
+          retVal = hist_[offset].compare_and_swap(newVal, oldVal);
+      } while (retVal != oldVal);
+
+      retVal = sum_;
+      do {
+          oldVal = retVal;
+          newVal = logAdd(oldVal, log(static_cast<double>(offset))+kMass);
+          retVal = sum_.compare_and_swap(newVal, oldVal);
+      } while (retVal != oldVal);
+
+      retVal = totMass_;
+      do {
+          oldVal = retVal;
+          newVal = logAdd(oldVal, kMass);
+          retVal = totMass_.compare_and_swap(newVal, oldVal);
+      } while (retVal != oldVal);
+
+    }
+    offset++;
+  }
+}
+
+double FragmentLengthDistribution::pmf(size_t len) const {
+    len /= binSize_;
+    if (len > maxVal()) {
+        len = maxVal();
+    }
+    return hist_[len]-totMass_;
+}
+
+double FragmentLengthDistribution::cmf(size_t len) const {
+    if(haveCachedCMF_) {
+        return cachedCMF_[len];
+    } else {
+        double cum = salmon::math::LOG_0;
+        len /= binSize_;
+        if (len > maxVal()) {
+            len = maxVal();
+        }
+
+        for (size_t i = 0; i <= len; ++i) {
+            cum = salmon::math::logAdd(cum, hist_[i]);
+        }
+        return cum - totMass_;
+    }
+}
+
+void FragmentLengthDistribution::cacheCMF() {
+    std::lock_guard<std::mutex> lg(fldMut_);
+    if (!haveCachedCMF_) {
+        cachedCMF_ = cmf();
+        haveCachedCMF_ = true;
+    }
+}
+
+vector<double> FragmentLengthDistribution::cmf() const {
+  double cum = salmon::math::LOG_0;
+  vector<double> cdf(hist_.size());
+  for (size_t i = 0; i < hist_.size(); ++i) {
+    cum = salmon::math::logAdd(cum, hist_[i]);
+    cdf[i] = cum - totMass_;
+  }
+  //assert(approxEq(cum, totMass_));
+
+  return cdf;
+}
+
+double FragmentLengthDistribution::totMass() const {
+  return totMass_;
+}
+
+double FragmentLengthDistribution::mean() const {
+  return sum_ - totMass();
+}
+
+std::string FragmentLengthDistribution::toString() const {
+    std::stringstream ss;
+    for (size_t i = 0; i < hist_.size(); ++i) {
+        ss << std::exp(pmf(i*binSize_));
+        if (i != hist_.size() - 1) { ss << '\t'; }
+    }
+    ss << "\n";
+    return ss.str();
+}
+/*
+string FragmentLengthDistribution::to_string() const {
+  string s = "";
+  char buffer[50];
+  for(size_t i = 0; i < hist_.size(); i++) {
+    sprintf(buffer, "%e\t",sexp(pmf(i*binSize_)));
+    s += buffer;
+  }
+  s.erase(s.length()-1,1);
+  return s;
+}
+
+void FragmentLengthDistribution::append_output(ofstream& outfile,
+                                       string length_type) const {
+  outfile << ">" << length_type << " Length Distribution (0-" << maxVal()*binSize_;
+  outfile << ")\n" << to_string() << endl;
+}
+*/
diff --git a/src/FragmentList.cpp b/src/FragmentList.cpp
new file mode 100644
index 0000000..b32205e
--- /dev/null
+++ b/src/FragmentList.cpp
@@ -0,0 +1,231 @@
+#include <cstdlib>
+#include <cstdio>
+#include <ctime>
+#include <iostream>
+
+extern "C" {
+#include "info.h"
+#include "debug.h"
+#include "container.h"
+#include "manopt.h"
+#include "fileio.h"
+#include "sltypes.h"
+#include "slchain.h"
+#include "rangetree.h"
+//#include "clasp.h"
+}
+
+#include "FragmentList.hpp"
+
+#define SOP 	((unsigned char) (0 << 0))
+#define LIN	((unsigned char) (1 << 0))
+
+unsigned char mute;
+
+FragmentList::FragmentList(){
+    fragments = (Container *) malloc(sizeof(Container));
+    bl_containerInit(fragments, 20, sizeof(slmatch_t));
+    fragmentsRC = (Container *) malloc(sizeof(Container));
+    bl_containerInit(fragmentsRC, 20, sizeof(slmatch_t));
+}
+
+// Free all of the structures allocated for this fragment list
+FragmentList::~FragmentList() {
+    freeFragList(fragments);
+    freeFragList(fragmentsRC);
+}
+
+void FragmentList::freeFragList(Container* frags) {
+    for (uint32_t i = 0; i < bl_containerSize(frags); i++){
+        slmatch_t *sl = (slmatch_t *) bl_containerGet(frags, i);
+        if (sl->chain != nullptr){
+            std::cerr << "still at least one chain not freed before end: " << i << "\n";
+            exit(-1);
+            bl_slchainDestruct(sl->chain);
+            free(sl->chain);
+        }
+    }
+    bl_containerDestruct(frags, bl_slmatchDestruct);
+    free(frags);
+}
+
+bool FragmentList::computeBestChain_(Container* frags, double& maxScore, uint32_t& bestPos) {
+    double epsilon = 0.0;
+    double lambda = 1.0;
+    unsigned char chainMode = LIN;
+    // Did we find a chain with a score higher than the original
+    // maxScore parameter?
+    bool updatedMaxScore = false;
+
+    /* sort fragments */
+    qsort(frags->contspace, bl_containerSize(frags),
+            sizeof(slmatch_t), cmp_slmatch_qsort);
+    uint32_t  begin = 0;
+    for (uint32_t i = 1; i <= bl_containerSize(frags); i++){
+        /*
+         * end of fragments list or different database sequence
+         * --> process fragment[begin]...fragment[i-1], write output
+         *     and free chains (less memory consumption with large input files)
+         */
+        if (i == bl_containerSize(frags) ||
+                ((slmatch_t *) bl_containerGet(frags, begin))->subject !=
+                ((slmatch_t *) bl_containerGet(frags, i))->subject){
+            //fprintf(info.dev, "%d\t%d\n", begin, i-begin);
+            if (chainMode == SOP){
+                /* only use chaining without clustering if no ids are specified */
+                bl_slChainSop((slmatch_t *) frags->contspace + begin, i - begin,
+                        epsilon, lambda, 10);
+                /*
+                   bl_slClusterSop((slmatch_t *) info.fragments->contspace + begin, i - begin,
+                   info.epsilon, info.lambda, info.maxgap);
+                   */
+            }
+            else {
+                bl_slChainLin((slmatch_t *) frags->contspace + begin, i - begin,
+                        epsilon, lambda, 10);
+                /*
+                   bl_slClusterLin((slmatch_t *) info.fragments->contspace + begin, i - begin,
+                   info.epsilon, info.lambda, info.maxgap);
+                   */
+            }
+
+            for (uint32_t j = begin; j < i; j++){
+                slmatch_t *match = (slmatch_t *) bl_containerGet(frags, j);
+
+                if (match->chain) {
+                    slchain_t* chain = (slchain_t*) match->chain;
+                    if (chain->scr >= maxScore) {
+                        maxScore = chain->scr;
+                        bestPos = chain->p;
+                        updatedMaxScore = true;
+                    }
+
+                    /*
+                    // output matches (if desired)
+                    if (info.outputm){
+                    fprintf(info.dev, "M\t");
+                    if (!info.outputorig){
+                    if (info.idcol != NULL){
+                    fprintf(info.dev, "%s\t",
+                     *(char **) bl_containerGet(info.subject, match->subject));
+                     }
+                     fprintf(info.dev, "%d\t%d\t%d\t%d\t%.3f\n", match->i,
+                     match->i + match->j - 1, match->p,
+                     match->p + match->q - 1, match->scr);
+                     }
+                    // output in original format as input
+                    else {
+                    fprintf(info.dev, "%s\n", *(char **) bl_containerGet(info.lines, j));
+                    }
+                    }
+                    if (match->chain){
+                    slchain_t *chain = (slchain_t *) match->chain;
+                    if (info.outputc && chain->scr >= info.minscore &&
+                    bl_containerSize(chain->matches) >= info.minfrag){
+                    fprintf(info.dev, "C\t");
+                    if (info.idcol != NULL){
+                    fprintf(info.dev, "%s\t", *(char **) bl_containerGet(info.subject, chain->subject));
+                    }
+                    fprintf(info.dev, "%d\t%d\t%d\t%d\t%.3f\n", chain->i,
+                    chain->i + chain->j - 1, chain->p,
+                    chain->p + chain->q - 1, chain->scr);
+                    }
+                    // output chains and fragments (if requested)
+                    if (info.outputf && chain->scr >= info.minscore &&
+                    bl_containerSize(chain->matches) >= info.minfrag){
+                    for (k = 0; k < bl_containerSize(chain->matches); k++){
+                    slmatch_t *frag = *(slmatch_t **)
+                    bl_containerGet(chain->matches, k);
+                    fprintf(info.dev, "F\t");
+                    if (!info.outputorig){
+                    if (info.idcol != NULL){
+                    fprintf(info.dev, "%s\t",
+                     *(char **) bl_containerGet(info.subject, frag->subject));
+                     }
+                     fprintf(info.dev, "%d\t%d\t%d\t%d\t%.3f\n", frag->i,
+                     frag->i + frag->j - 1, frag->p, frag->p + frag->q - 1,
+                     frag->scr);
+                     }
+                    // output in original format as input
+                    else {
+                    fprintf(info.dev, "%s\n",
+                     *(char **) bl_containerGet(info.lines, frag->idx));
+                     }
+                     }
+                     }
+                     */
+                    bl_slchainDestruct(chain);
+                    free(chain);
+                    match->chain = nullptr;
+                } /* END OF if (frag->chain) */
+                }  /* END OF for (j = begin; j < i; j++) */
+                begin = i;
+            } /* END OF  if (i == bl_containerSize(info.fragments) ||
+                 ((slmatch_t *) bl_containerGet(info.fragments, begin))->subject !=
+                 ((slmatch_t *) bl_containerGet(info.fragments, i))->subject) */
+        } /* END OF for (i = 1; i <= bl_containerSize(info.fragments); i++) */
+
+        return updatedMaxScore;
+
+    }
+
+    bool FragmentList::isForward() { return isForward_; }
+
+    void FragmentList::computeBestChain() {
+        double maxScore = 0.0;
+        uint32_t bestPos = 0;
+
+        // we don't need the return value from the first call
+        static_cast<void>(computeBestChain_(fragments, maxScore, bestPos));
+        bool reverseIsBest = computeBestChain_(fragmentsRC, maxScore, bestPos);
+        isForward_ = not reverseIsBest;
+
+        bestHitScore = maxScore;
+        bestHitPos = bestPos;
+    }
+
+    void FragmentList::addFragMatch(uint32_t refStart, uint32_t queryStart, uint32_t len) {
+        addFragMatch(refStart, refStart + len, queryStart,
+                queryStart + len);
+    }
+
+    void FragmentList::addFragMatch(uint32_t refStart, uint32_t refEnd,
+            uint32_t queryStart, uint32_t queryEnd) {
+        slmatch_t frag;
+        bl_slmatchInit(&frag, 0);
+        frag.p = refStart;
+        frag.q = refEnd;
+        frag.i = queryStart;
+        frag.j = queryEnd;
+        frag.scr = static_cast<double>(queryEnd) - queryStart;
+        bl_containerAdd(fragments, &frag);
+    }
+
+
+    void FragmentList::addFragMatchRC(uint32_t refStart, uint32_t queryStart, uint32_t len) {
+        addFragMatchRC(refStart, refStart + len, queryStart,
+                queryStart + len);
+    }
+
+    void FragmentList::addFragMatchRC(uint32_t refStart, uint32_t refEnd,
+            uint32_t queryStart, uint32_t queryEnd) {
+        slmatch_t frag;
+        bl_slmatchInit(&frag, 0);
+        frag.p = refStart;
+        frag.q = refEnd;
+        frag.i = queryStart;
+        frag.j = queryEnd;
+        frag.scr = static_cast<double>(queryEnd) - queryStart;
+        bl_containerAdd(fragmentsRC, &frag);
+    }
+
+/*
+int main(int argc, char* argv[]) {
+    FragmentList fl;
+    fl.addFragment(0, 0, 15);
+    fl.addFragment(25, 25, 30);
+    fl.addFragment(10, 10, 20);
+    fl.computeBestChain();
+    std::cerr << "maxChain score = " << fl.maxChainScore << "\n";
+}
+*/
diff --git a/src/FragmentStartPositionDistribution.cpp b/src/FragmentStartPositionDistribution.cpp
new file mode 100644
index 0000000..c9342aa
--- /dev/null
+++ b/src/FragmentStartPositionDistribution.cpp
@@ -0,0 +1,141 @@
+/**
+ * Model the bias in read start positions across a set of transcripts.
+ * This class is similar to and inspired by the FragmentLengthDistribution
+ * class, which was itself modified from lengthdistribution.cpp ---
+ * originally written by Adam Roberts as part of the eXpress software.
+ * Rob Patro; 2014, 2015
+ */
+
+
+#include "FragmentStartPositionDistribution.hpp"
+#include "SalmonMath.hpp"
+#include <numeric>
+#include <cassert>
+#include <boost/assign.hpp>
+#include <iostream>
+#include <fstream>
+#include <boost/math/distributions/binomial.hpp>
+#include <boost/math/distributions/normal.hpp>
+
+using namespace std;
+
+FragmentStartPositionDistribution::FragmentStartPositionDistribution(uint32_t numBins)
+    : numBins_(numBins),
+      pmf_(numBins+2),
+      cmf_(numBins+2),
+      totMass_(salmon::math::LOG_1),
+      isUpdated_(false) {
+
+  using salmon::math::logAdd;
+  double uniMass = log(1.0 / numBins_);
+  for (auto i = 1; i <= numBins_; ++i) {
+    pmf_[i] = uniMass;
+    cmf_[i] = log(static_cast<double>(i)) + uniMass;
+  }
+}
+
+inline void logAddMass(tbb::atomic<double>& bin,
+                       double newMass) {
+      double oldVal = bin;
+      double retVal = oldVal;
+      double newVal = 0.0;
+      do {
+          oldVal = retVal;
+          newVal = salmon::math::logAdd(oldVal, newMass);
+          retVal = bin.compare_and_swap(newVal, oldVal);
+      } while (retVal != oldVal);
+}
+
+void FragmentStartPositionDistribution::addVal(
+        int32_t hitPos,
+        uint32_t txpLen,
+        double mass) {
+
+    using salmon::math::logAdd;
+    if (hitPos >= txpLen) return; // hit should happen within the transcript
+
+    if (hitPos < 0) { hitPos = 0; }
+    double logLen = log(txpLen);
+
+    // Modified from: https://github.com/deweylab/RSEM/blob/master/RSPD.h
+    int i;
+    // Fraction along the transcript where this hit occurs
+    double a = hitPos * 1.0 / txpLen;
+    double b;
+
+    for (i = ((long long)hitPos) * numBins_ / txpLen + 1;
+         i < (((long long)hitPos + 1) * numBins_ - 1) / txpLen + 1; i++) {
+        b = i * 1.0 / numBins_;
+        double updateMass = log(b - a) + logLen + mass;
+        logAddMass(pmf_[i], updateMass);
+        logAddMass(totMass_, updateMass);
+        a = b;
+    }
+    b = (hitPos + 1.0) / txpLen;
+    double updateMass = log(b - a) + logLen + mass;
+    logAddMass(pmf_[i], updateMass);
+    logAddMass(totMass_, updateMass);
+}
+
+double FragmentStartPositionDistribution::evalCDF(int32_t hitPos, uint32_t txpLen) {
+    int i = (static_cast<long long>(hitPos) * numBins_) / txpLen;
+    double val = hitPos * 1.0 / txpLen * numBins_;
+    return salmon::math::logAdd(cmf_[i], std::log(val - i) + pmf_[i+1]);
+}
+
+void FragmentStartPositionDistribution::update() {
+    std::lock_guard<std::mutex> lg(fspdMut_);
+    if (!isUpdated_) {
+        for (int i = 1; i <= numBins_; i++) {
+            pmf_[i] = pmf_[i] - totMass_;
+            cmf_[i] = salmon::math::logAdd(cmf_[i - 1], pmf_[i]);
+        }
+        isUpdated_ = true;
+    }
+}
+
+double FragmentStartPositionDistribution::operator()(
+        int32_t hitPos,
+        uint32_t txpLen,
+        double logEffLen) {
+    
+    if (hitPos < 0) { hitPos = 0; }
+    assert(hitPos < txpLen);
+    if (hitPos >= txpLen) {
+	    std::cerr << "\n\nhitPos = " << hitPos << ", txpLen = " << txpLen << "!!\n\n\n";
+	    return salmon::math::LOG_0;
+    }
+    // If we haven't updated the CDF yet, then
+    // just return log(1 / effLen);
+    if (!isUpdated_) {
+        return -logEffLen;
+    }
+
+    double a = hitPos * (1.0 / txpLen);
+
+    double effLen = std::exp(logEffLen);
+    if (effLen >= txpLen) {
+	    effLen = txpLen - 1;
+    }
+    double denom = evalCDF(static_cast<int32_t>(effLen), txpLen);
+    return ((denom >= salmon::math::LOG_EPSILON) ?
+            salmon::math::logSub(evalCDF(hitPos + 1, txpLen), evalCDF(hitPos, txpLen)) -
+	    denom : 0.0); 
+
+    //return salmon::math::logSub(evalCDF(hitPos + 1, txpLen), evalCDF(hitPos, txpLen));
+}
+
+double FragmentStartPositionDistribution::totMass() const {
+  return totMass_;
+}
+
+std::string FragmentStartPositionDistribution::toString() const {
+    std::stringstream ss;
+    for (size_t i = 0; i < pmf_.size(); ++i) {
+        ss << std::exp(pmf_[i] - totMass_);
+        if (i != pmf_.size() - 1) { ss << '\t'; }
+    }
+    ss << "\n";
+    return ss.str();
+}
+
diff --git a/src/GenomicFeature.cpp b/src/GenomicFeature.cpp
new file mode 100644
index 0000000..35a2fc6
--- /dev/null
+++ b/src/GenomicFeature.cpp
@@ -0,0 +1,104 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#include <boost/thread/thread.hpp>
+#include <atomic>
+#include <thread>
+#include <fstream>
+#include "GenomicFeature.hpp"
+
+std::ostream& operator<< ( std::ostream& out, const TranscriptGeneID& ids ) {
+  out << "transcript_id => " << ids.transcript_id << ", gene_id => " << ids.gene_id;
+  return out;
+}
+
+template< typename StaticAttributes>
+std::ostream& operator<< ( std::ostream& out, const GenomicFeature<StaticAttributes>& gf ) {
+   out << "Feature: [" <<
+          "id :" << gf.seqID << ", "  <<
+          "source : " << gf.source << ", " <<
+          "type : " << gf.type << ", " <<
+          "start : " << gf.start << ", " <<
+          "end : " << gf.end << ", " <<
+          "score : " << gf.score << ", " <<
+          "strand : " << gf.strand << ", " <<
+          "phase : " << gf.phase << " ** ";
+   out << gf.sattr << ", ";
+  /*for ( auto& kv : gf.attributes ) {
+      out << kv.first << " => " << kv.second << ", ";
+   }
+   */
+   return out;
+}
+
+template< typename StaticAttributes >
+std::istream& operator>> ( std::istream& in, GenomicFeature<StaticAttributes>& gf ) {
+  std::string line;
+  std::getline(in, gf.seqID, '\t');
+  std::getline(in, gf.source, '\t');
+  std::getline(in, gf.type, '\t');
+  
+  std::getline(in, line, '\t');
+  gf.start = atoi(line.c_str());
+
+  std::getline(in, line, '\t');
+  gf.end = atoi(line.c_str());
+
+  std::getline(in, line, '\t');
+  gf.score = atoi(line.c_str());
+
+  std::getline(in, line, '\t');
+  gf.strand = line[0];
+
+  std::getline(in, line, '\t');
+  gf.phase = line[0];
+
+  
+  std::getline(in, line);
+  using tokenizer = boost::tokenizer<boost::char_separator<char> >;
+  boost::char_separator<char> sep(";");  
+  tokenizer tokens(line, sep);
+
+  for ( auto tokIt : tokens ) {       
+    // Currently, we'll handle the following separators
+    // '\s+'
+    // '\s*=\s*'
+    tokIt = tokIt.substr(tokIt.find_first_not_of(' '));
+    auto kvsepStart = tokIt.find('=');
+    
+    // If we reached the end of the key, value token, then the string must have been
+    // separated by some set of spaces, and NO '='.  If this is the case, find the 'spaces' so that
+    // we can split on it.
+    if ( kvsepStart == tokIt.npos ) {
+      kvsepStart = tokIt.find(' ');
+    } 
+    
+    auto key = tokIt.substr(0, kvsepStart);
+    key = key.substr(0, key.find(' '));
+
+    auto kvsepStop = 1 + kvsepStart + tokIt.substr(kvsepStart+1).find_first_not_of(' ');
+    auto val = (tokIt[kvsepStop] == '"') ? tokIt.substr(kvsepStop+1, (tokIt.length() - (kvsepStop+2))) : tokIt.substr(kvsepStop, (tokIt.length() - (kvsepStop+1)));
+    gf.sattr.parseAttribute( key, val );
+  }
+
+  return in;
+}
diff --git a/src/JellyfishMerCounter.cpp b/src/JellyfishMerCounter.cpp
new file mode 100644
index 0000000..5d60418
--- /dev/null
+++ b/src/JellyfishMerCounter.cpp
@@ -0,0 +1,379 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+/*  This file is part of Jellyfish.
+
+    Jellyfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Jellyfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Jellyfish.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*  This file was reproduced directly from Jellyfish's mer_counter.cc file.  It is
+    meant to act as a pseudo-independent driver that builds Jellyfish hash for a
+    transcriptome, but which is callable directly (as a function) from Sailfish.
+ */
+
+#include <cstdlib>
+#include <unistd.h>
+#include <assert.h>
+#include <signal.h>
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <map>
+#include <sstream>
+#include <memory>
+#include <chrono>
+
+#include <jellyfish/err.hpp>
+#include <jellyfish/thread_exec.hpp>
+#include <jellyfish/hash_counter.hpp>
+#include <jellyfish/locks_pthread.hpp>
+#include <jellyfish/stream_manager.hpp>
+#include <jellyfish/mer_overlap_sequence_parser.hpp>
+#include <jellyfish/whole_sequence_parser.hpp>
+#include <jellyfish/mer_iterator.hpp>
+#include <jellyfish/mer_qual_iterator.hpp>
+#include <jellyfish/jellyfish.hpp>
+#include <jellyfish/mer_dna_bloom_counter.hpp>
+#include <jellyfish/generator_manager.hpp>
+#include "merge_files.hpp"
+#include "count_main_cmdline.hpp"
+
+static count_main_cmdline args; // Command line switches and arguments
+
+using std::chrono::system_clock;
+using std::chrono::duration;
+using std::chrono::duration_cast;
+
+template<typename DtnType>
+inline double as_seconds(DtnType dtn) { return duration_cast<duration<double>>(dtn).count(); }
+
+using jellyfish::mer_dna;
+using jellyfish::mer_dna_bloom_counter;
+using jellyfish::mer_dna_bloom_filter;
+typedef std::vector<const char*> file_vector;
+
+// Types for parsing arbitrary sequence ignoring quality scores
+typedef jellyfish::mer_overlap_sequence_parser<jellyfish::stream_manager<file_vector::const_iterator> > sequence_parser;
+typedef jellyfish::mer_iterator<sequence_parser, mer_dna> mer_iterator;
+
+// Types for parsing reads with quality score. Interface match type
+// above.
+class sequence_qual_parser :
+  public jellyfish::whole_sequence_parser<jellyfish::stream_manager<file_vector::const_iterator> >
+{
+  typedef jellyfish::stream_manager<file_vector::const_iterator> StreamIterator;
+  typedef jellyfish::whole_sequence_parser<StreamIterator> super;
+public:
+  sequence_qual_parser(uint16_t mer_len, uint32_t max_producers, uint32_t size, size_t buf_size,
+                       StreamIterator& streams) :
+    super(size, 100, max_producers, streams)
+  { }
+};
+
+class mer_qual_iterator : public jellyfish::mer_qual_iterator<sequence_qual_parser, mer_dna> {
+  typedef jellyfish::mer_qual_iterator<sequence_qual_parser, mer_dna> super;
+public:
+  mer_qual_iterator(sequence_qual_parser& parser, bool canonical = false) :
+    super(parser, args.min_qual_char_arg[0], canonical)
+  { }
+};
+
+// k-mer filters. Organized in a linked list, interpreted as a &&
+// (logical and). I.e. all filter must return true for the result to
+// be true. By default, filter returns true.
+struct filter {
+  filter* prev_;
+  filter(filter* prev = 0) : prev_(prev) { }
+  virtual ~filter() { }
+  virtual bool operator()(const mer_dna& x) { return and_res(true, x); }
+  bool and_res(bool r, const mer_dna& x) const {
+    return r ? (prev_ ? (*prev_)(x) : true) : false;
+  }
+};
+
+struct filter_bc : public filter {
+  const mer_dna_bloom_counter& counter_;
+  filter_bc(const mer_dna_bloom_counter& counter, filter* prev = 0) :
+    filter(prev),
+    counter_(counter)
+  { }
+  bool operator()(const mer_dna& m) {
+    unsigned int c = counter_.check(m);
+    return and_res(c > 1, m);
+  }
+};
+
+struct filter_bf : public filter {
+  mer_dna_bloom_filter& bf_;
+  filter_bf(mer_dna_bloom_filter& bf, filter* prev = 0) :
+    filter(prev),
+    bf_(bf)
+  { }
+  bool operator()(const mer_dna& m) {
+    unsigned int c = bf_.insert(m);
+    return and_res(c > 0, m);
+  }
+};
+
+enum OPERATION { COUNT, PRIME, UPDATE };
+template<typename PathIterator, typename MerIteratorType, typename ParserType>
+class mer_counter_base : public jellyfish::thread_exec {
+  int                                     nb_threads_;
+  mer_hash&                               ary_;
+  jellyfish::stream_manager<PathIterator> streams_;
+  ParserType                              parser_;
+  filter*                                 filter_;
+  OPERATION                               op_;
+
+public:
+  mer_counter_base(int nb_threads, mer_hash& ary,
+                   PathIterator file_begin, PathIterator file_end,
+                   PathIterator pipe_begin, PathIterator pipe_end,
+                   uint32_t concurent_files,
+                   OPERATION op, filter* filter = new struct filter) :
+    ary_(ary),
+    streams_(file_begin, file_end, pipe_begin, pipe_end, concurent_files),
+    parser_(mer_dna::k(), streams_.nb_streams(), 3 * nb_threads, 4096, streams_),
+    filter_(filter),
+    op_(op)
+  { }
+
+  virtual void start(int thid) {
+    size_t count = 0;
+    MerIteratorType mers(parser_, args.canonical_flag);
+
+    switch(op_) {
+     case COUNT:
+      for( ; mers; ++mers) {
+        if((*filter_)(*mers))
+          ary_.add(*mers, 1);
+        ++count;
+      }
+      break;
+
+    case PRIME:
+      for( ; mers; ++mers) {
+        if((*filter_)(*mers))
+          ary_.set(*mers);
+        ++count;
+      }
+      break;
+
+    case UPDATE:
+      mer_dna tmp;
+      for( ; mers; ++mers) {
+        if((*filter_)(*mers))
+          ary_.update_add(*mers, 1, tmp);
+        ++count;
+      }
+      break;
+    }
+
+    ary_.done();
+  }
+};
+
+// Counter with and without quality value
+typedef mer_counter_base<file_vector::const_iterator, mer_iterator, sequence_parser> mer_counter;
+typedef mer_counter_base<file_vector::const_iterator, mer_qual_iterator, sequence_qual_parser> mer_qual_counter;
+
+mer_dna_bloom_counter* load_bloom_filter(const char* path) {
+  std::ifstream in(path, std::ios::in|std::ios::binary);
+  jellyfish::file_header header(in);
+  if(!in.good())
+    die << "Failed to parse bloom filter file '" << path << "'";
+  if(header.format() != "bloomcounter")
+    die << "Invalid format '" << header.format() << "'. Expected 'bloomcounter'";
+  if(header.key_len() != mer_dna::k() * 2)
+    die << "Invalid mer length in bloom filter";
+  jellyfish::hash_pair<mer_dna> fns(header.matrix(1), header.matrix(2));
+  auto res = new mer_dna_bloom_counter(header.size(), header.nb_hashes(), in, fns);
+  if(!in.good())
+    die << "Bloom filter file is truncated";
+  in.close();
+  return res;
+}
+
+// If get a termination signal, kill the manager and then kill myself.
+static pid_t manager_pid = 0;
+static void signal_handler(int sig) {
+  if(manager_pid)
+    kill(manager_pid, SIGTERM);
+  signal(sig, SIG_DFL);
+  kill(getpid(), sig);
+  _exit(EXIT_FAILURE); // Should not be reached
+}
+
+int jellyfish_count_main(int argc, char *argv[])
+{
+  auto start_time = system_clock::now();
+
+  jellyfish::file_header header;
+  header.fill_standard();
+  header.set_cmdline(argc, argv);
+
+  args.parse(argc, argv);
+
+  if(args.min_qual_char_given && args.min_qual_char_arg.size() != 1)
+    count_main_cmdline::error("[-Q, --min-qual-char] must be one character.");
+
+  mer_dna::k(args.mer_len_arg);
+
+  std::unique_ptr<jellyfish::generator_manager> generator_manager;
+  if(args.generator_given) {
+    auto gm =
+      new jellyfish::generator_manager(args.generator_arg, args.Generators_arg,
+                                       args.shell_given ? args.shell_arg : (const char*)0);
+    generator_manager.reset(gm);
+    generator_manager->start();
+    manager_pid = generator_manager->pid();
+    struct sigaction act;
+    memset(&act, '\0', sizeof(act));
+    act.sa_handler = signal_handler;
+    assert(sigaction(SIGTERM, &act, 0) == 0);
+  }
+
+  header.canonical(args.canonical_flag);
+  mer_hash ary(args.size_arg, args.mer_len_arg * 2, args.counter_len_arg, args.threads_arg, args.reprobes_arg);
+  if(args.disk_flag)
+    ary.do_size_doubling(false);
+
+  std::auto_ptr<jellyfish::dumper_t<mer_array> > dumper;
+  if(args.text_flag)
+    dumper.reset(new text_dumper(args.threads_arg, args.output_arg, &header));
+  else
+    dumper.reset(new binary_dumper(args.out_counter_len_arg, ary.key_len(), args.threads_arg, args.output_arg, &header));
+  ary.dumper(dumper.get());
+
+  auto after_init_time = system_clock::now();
+
+  OPERATION do_op = COUNT;
+  if(args.if_given) {
+    mer_counter counter(args.threads_arg, ary,
+                        args.if_arg.begin(), args.if_arg.end(),
+                        args.if_arg.end(), args.if_arg.end(), // no multi pipes
+                        args.Files_arg, PRIME);
+    counter.exec_join(args.threads_arg);
+    do_op = UPDATE;
+  }
+
+  // Iterators to the multi pipe paths. If no generator manager,
+  // generate an empty range.
+  auto pipes_begin = generator_manager.get() ? generator_manager->pipes().begin() : args.file_arg.end();
+  auto pipes_end = (bool)generator_manager ? generator_manager->pipes().end() : args.file_arg.end();
+
+  // Bloom counter read from file to filter out low frequency
+  // k-mers. Two pass algorithm.
+  std::unique_ptr<filter> mer_filter(new filter);
+  std::unique_ptr<mer_dna_bloom_counter> bc;
+  if(args.bc_given) {
+    bc.reset(load_bloom_filter(args.bc_arg));
+    mer_filter.reset(new filter_bc(*bc));
+  }
+
+  // Bloom filter to filter out low frequency k-mers. One pass
+  // algorithm.
+  std::unique_ptr<mer_dna_bloom_filter> bf;
+  if(args.bf_size_given) {
+    bf.reset(new mer_dna_bloom_filter(args.bf_fp_arg, args.bf_size_arg));
+    mer_filter.reset(new filter_bf(*bf));
+  }
+
+  if(args.min_qual_char_given) {
+    mer_qual_counter counter(args.threads_arg, ary,
+                             args.file_arg.begin(), args.file_arg.end(),
+                             pipes_begin, pipes_end,
+                             args.Files_arg,
+                             do_op, mer_filter.get());
+    counter.exec_join(args.threads_arg);
+  } else {
+    mer_counter counter(args.threads_arg, ary,
+                        args.file_arg.begin(), args.file_arg.end(),
+                        pipes_begin, pipes_end,
+                        args.Files_arg,
+                        do_op, mer_filter.get());
+    counter.exec_join(args.threads_arg);
+  }
+
+  // If we have a manager, wait for it
+  if(generator_manager) {
+    signal(SIGTERM, SIG_DFL);
+    manager_pid = 0;
+    if(!generator_manager->wait())
+      die << "Some generator commands failed";
+    generator_manager.reset();
+  }
+
+  auto after_count_time = system_clock::now();
+
+  // If no intermediate files, dump directly into output file. If not, will do a round of merging
+  if(!args.no_write_flag) {
+    if(dumper->nb_files() == 0) {
+      dumper->one_file(true);
+      if(args.lower_count_given)
+        dumper->min(args.lower_count_arg);
+      if(args.upper_count_given)
+        dumper->max(args.upper_count_arg);
+      dumper->dump(ary.ary());
+    } else {
+      dumper->dump(ary.ary());
+      if(!args.no_merge_flag) {
+        std::vector<const char*> files = dumper->file_names_cstr();
+        uint64_t min = args.lower_count_given ? args.lower_count_arg : 0;
+        uint64_t max = args.upper_count_given ? args.upper_count_arg : std::numeric_limits<uint64_t>::max();
+        try {
+          merge_files(files, args.output_arg, header, min, max);
+        } catch(MergeError e) {
+          die << e.what();
+        }
+        if(!args.no_unlink_flag) {
+          for(int i =0; i < dumper->nb_files(); ++i)
+            unlink(files[i]);
+        }
+      } // if(!args.no_merge_flag
+    } // if(!args.no_merge_flag
+  }
+
+  auto after_dump_time = system_clock::now();
+
+  if(args.timing_given) {
+    std::ofstream timing_file(args.timing_arg);
+    timing_file << "Init     " << as_seconds(after_init_time - start_time) << "\n"
+                << "Counting " << as_seconds(after_count_time - after_init_time) << "\n"
+                << "Writing  " << as_seconds(after_dump_time - after_count_time) << "\n";
+  }
+
+  return 0;
+}
diff --git a/src/LibraryFormat.cpp b/src/LibraryFormat.cpp
new file mode 100644
index 0000000..d921f94
--- /dev/null
+++ b/src/LibraryFormat.cpp
@@ -0,0 +1,99 @@
+#include "LibraryFormat.hpp"
+
+LibraryFormat::LibraryFormat(ReadType type_in, ReadOrientation orientation_in, ReadStrandedness strandedness_in) :
+    type(type_in), orientation(orientation_in), strandedness(strandedness_in) {}
+
+bool LibraryFormat::check() {
+    bool valid = true;
+    switch (type) {
+
+    case ReadType::SINGLE_END:
+        valid = valid and (orientation == ReadOrientation::NONE);
+        switch (strandedness) {
+            // If we're dealing with single end reads, then separate orientations
+            // are nonsensical.
+        case ReadStrandedness::SA:
+        case ReadStrandedness::AS:
+            valid = false;
+            break;
+        default:
+            break;
+        }
+        break;
+
+    case ReadType::PAIRED_END:
+        switch (orientation) {
+            // For paired-end reads, an orientation must be defined
+        case ReadOrientation::NONE:
+            valid = false;
+            break;
+            // If the reads are oriented in the same direction, then they
+            // have to come from the same strand.
+        case ReadOrientation::SAME:
+            valid = valid and ((strandedness == ReadStrandedness::S) or
+                               (strandedness == ReadStrandedness::A) or
+                               (strandedness == ReadStrandedness::U));
+
+            break;
+            // If the reads are oriented away from each other or toward each other,
+            // then they must come from different strands.
+        case ReadOrientation::AWAY:
+        case ReadOrientation::TOWARD:
+            valid = valid and ((strandedness == ReadStrandedness::SA) or
+                               (strandedness == ReadStrandedness::AS) or
+                               (strandedness == ReadStrandedness::U));
+            break;
+        }
+        break;
+    }
+    return valid;
+}
+
+std::ostream& operator<<(std::ostream& os, const LibraryFormat& lf) {
+    os << "Library format { type:";
+    switch (lf.type) {
+        case ReadType::SINGLE_END:
+            os << "single end";
+            break;
+        case ReadType::PAIRED_END:
+            os << "paired end";
+            break;
+    }
+
+    os << ", relative orientation:";
+    switch (lf.orientation)  {
+        case ReadOrientation::TOWARD:
+            os << "inward";
+            break;
+        case ReadOrientation::AWAY:
+           os << "outward";
+           break;
+        case ReadOrientation::SAME:
+           os << "matching";
+           break;
+        case ReadOrientation::NONE:
+           os << "none";
+           break;
+    }
+
+    os << ", strandedness:";
+    switch (lf.strandedness)  {
+        case ReadStrandedness::SA:
+            os << "(sense, antisense)";
+            break;
+        case ReadStrandedness::AS:
+           os << "(antisense, sense)";
+           break;
+        case ReadStrandedness::S:
+           os << "sense";
+           break;
+        case ReadStrandedness::A:
+           os << "antisense";
+           break;
+        case ReadStrandedness::U:
+           os << "unstranded";
+           break;
+    }
+    os << " }";
+    return os;
+}
diff --git a/src/LookUpTableUtils.cpp b/src/LookUpTableUtils.cpp
new file mode 100644
index 0000000..2bf0d27
--- /dev/null
+++ b/src/LookUpTableUtils.cpp
@@ -0,0 +1,255 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cstdint>
+#include <cstring>
+#include <cstdio>
+#include <sstream>
+#include <memory>
+#include <functional>
+#include <unordered_map>
+#include <mutex>
+#include <thread>
+#include <chrono>
+#include <iomanip>
+
+#include "tbb/parallel_for.h"
+#include "tbb/parallel_for_each.h"
+
+#include <boost/range/irange.hpp>
+#include "ezETAProgressBar.hpp"
+#include "LookUpTableUtils.hpp"
+
+
+namespace LUTTools {
+
+/**
+ *  \brief Dump the k-mer memberships vector to the file fname
+ **/
+void dumpKmerEquivClasses(
+                          const std::vector<KmerID>& memberships,
+                          const std::string& fname) {
+
+  std::ofstream ofile(fname, std::ios::binary);
+
+  // Write the length of the vector to the file
+  size_t vecLen{memberships.size()};
+  ofile.write(reinterpret_cast<const char*>(&vecLen), sizeof(vecLen));
+
+  // Write the actual vector to the file
+  ofile.write(reinterpret_cast<const char*>(&memberships.front()), sizeof(memberships.front()) * vecLen);
+
+  // Close the file
+  ofile.close();
+}
+
+
+std::vector<KmerID> readKmerEquivClasses(const std::string& fname) {
+  std::ifstream ifile(fname, std::ios::binary);
+  size_t vecLen{0};
+  ifile.read(reinterpret_cast<char*>(&vecLen), sizeof(vecLen));
+
+  std::vector<KmerID> memberships(vecLen, std::numeric_limits<KmerID>::max());
+  ifile.read(reinterpret_cast<char*>(&memberships.front()), sizeof(memberships.front()) * vecLen);
+
+  ifile.close();
+  return memberships;
+}
+
+void dumpKmerLUT(
+    std::vector<TranscriptList> &transcriptsForKmerClass,
+    const std::string &fname) {
+
+    tbb::parallel_for_each( transcriptsForKmerClass.begin(), transcriptsForKmerClass.end(),
+    [&]( TranscriptList & t ) {
+        std::sort(t.begin(), t.end());
+    });
+
+    std::ofstream ofile(fname, std::ios::binary);
+    // number of kmers
+    auto numk = transcriptsForKmerClass.size();
+    ofile.write(reinterpret_cast<const char *>(&numk), sizeof(numk));
+    for (auto i : boost::irange(size_t(0), numk)) {
+        // write the vector's size
+        auto s = transcriptsForKmerClass[i].size();
+        ofile.write(reinterpret_cast<const char *>(&s), sizeof(s));
+        // write the vector's contents
+        if ( s > 0 ) {
+            ofile.write(reinterpret_cast<const char *>(&transcriptsForKmerClass[i][0]), s * sizeof(transcriptsForKmerClass[i][0]));
+        }
+    }
+    // close the output file
+    ofile.close();
+}
+
+void readKmerLUT(
+    const std::string &fname,
+    std::vector<TranscriptList> &transcriptsForKmer) {
+
+    std::ifstream ifile(fname, std::ios::binary);
+    // get the size of the vector from file
+    size_t numk = 0;
+    ifile.read(reinterpret_cast<char *>(&numk), sizeof(numk));
+    // resize the vector now
+    transcriptsForKmer.resize(numk);
+
+    for (auto i : boost::irange(size_t(0), numk)) {
+        // read the vector's size
+        size_t numTran = 0;
+        ifile.read(reinterpret_cast<char *>(&numTran), sizeof(numTran));
+        // read the vector's contents
+        if ( numTran > 0 ) {
+            transcriptsForKmer[i].resize(numTran);
+            ifile.read(reinterpret_cast<char *>(&transcriptsForKmer[i][0]), numTran * sizeof(TranscriptID));
+        }
+    }
+
+    ifile.close();
+}
+
+
+void writeTranscriptInfo (TranscriptInfo *ti, std::ofstream &ostream) {
+    size_t numKmers = ti->kmers.size();
+    size_t recordSize = sizeof(ti->transcriptID) +
+                        sizeof(ti->geneID) +
+                        sizeof(ti->name.length()) +
+                        ti->name.length() +
+                        sizeof(ti->length) +
+                        sizeof(numKmers) +
+                        sizeof(KmerID) * numKmers;
+
+    ostream.write(reinterpret_cast<const char *>(&recordSize), sizeof(recordSize));
+    ostream.write(reinterpret_cast<const char *>(&ti->transcriptID), sizeof(ti->transcriptID));
+    ostream.write(reinterpret_cast<const char *>(&ti->geneID), sizeof(ti->geneID));
+    auto l = ti->name.length();
+    ostream.write(reinterpret_cast<const char *>(&l), sizeof(l));
+    ostream.write(reinterpret_cast<const char *>(ti->name.c_str()), l);
+    ostream.write(reinterpret_cast<const char *>(&ti->length), sizeof(ti->length));
+    ostream.write(reinterpret_cast<const char *>(&numKmers), sizeof(numKmers));
+    ostream.write(reinterpret_cast<const char *>(&ti->kmers[0]), numKmers * sizeof(KmerID));
+}
+
+std::unique_ptr<TranscriptInfo> readTranscriptInfo(std::ifstream &istream) {
+    std::unique_ptr<TranscriptInfo> ti(new TranscriptInfo);
+    size_t recordSize = 0;
+    istream.read(reinterpret_cast<char *>(&recordSize), sizeof(recordSize));
+    istream.read(reinterpret_cast<char *>(&ti->transcriptID), sizeof(ti->transcriptID));
+    istream.read(reinterpret_cast<char *>(&ti->geneID), sizeof(ti->geneID));
+    size_t slen = 0;
+    istream.read(reinterpret_cast<char *>(&slen), sizeof(slen));
+    char *name = new char[slen+1];
+    name[slen] = '\0';
+    istream.read(name, slen);
+    ti->name = std::string(name);
+    // read the transcript's length
+    istream.read(reinterpret_cast<char *>(&ti->length), sizeof(ti->length));
+    size_t numKmers = 0;
+    istream.read(reinterpret_cast<char *>(&numKmers), sizeof(numKmers));
+    ti->kmers = std::vector<KmerID>(numKmers, 0);
+    istream.read(reinterpret_cast<char *>(&ti->kmers[0]), sizeof(KmerID)*numKmers);
+    return ti;
+}
+
+/*
+std::vector<std::unique_ptr<TranscriptInfo>> getTranscriptsFromFile(const std::string &tlutfname,
+        const std::vector<Offset> &offsets,
+std::pair<CTVecIt, CTVecIt> be) {
+
+    Offset INVALID = std::numeric_limits<Offset>::max();
+    std::cerr << "before creating transcript vector" << std::endl;
+    auto batchSize = std::distance(be.first, be.second);
+    std::cerr << "batch size = " << batchSize << "\n";
+    std::vector<std::unique_ptr<TranscriptInfo>> transcripts;//(batchSize);
+    //transcripts.reserve();
+    std::cerr << "after creating transcript vector " << std::endl;
+
+    std::cerr << "opening input file" << std::endl;
+    std::ifstream ifile(tlutfname, std::ios::binary);
+    std::cerr << "opened" << std::endl;
+    size_t idx = 0;
+    std::cerr << "iterating through batch" << std::endl;
+    for (auto it = be.first; it != be.second; ++it) {
+        std::cerr << "getting offset for transcript " << it->tid << std::endl;
+        auto offset = offsets[it->tid];
+        if ( offset == INVALID ) {
+            std::cerr << "encountered invalid offset for transcript " << it->tid << std::endl;
+            std::exit(1);
+        }
+        std::cerr << "seeking to " << offset << std::endl;
+        ifile.seekg(offset);
+        std::cerr << "reading in transcript from file" << std::endl;
+        transcripts.emplace_back(readTranscriptInfo(ifile));
+        ++idx;
+        // Check that the transcript we read is the one we expected
+        if (it->tid != transcripts.back()->transcriptID) {
+            std::cerr << "read the wrong transcript!\n";
+            std::exit(1);
+        }
+    }
+
+    ifile.close();
+    std::cerr << "done reading all transcripts from this batch" << std::endl;
+
+    return std::move(transcripts);
+}
+*/
+
+std::vector<Offset> buildTLUTIndex(const std::string &tlutfname, size_t numTranscripts) {
+
+    std::cerr << "creating vector of " << numTranscripts << " offsets \n";
+    Offset INVALID = std::numeric_limits<Offset>::max();
+    std::vector<Offset> offsets(numTranscripts, INVALID);
+    std::cerr << "done\n";
+
+    std::cerr << "opening file\n";
+    std::ifstream ifile(tlutfname, std::ios::binary);
+    std::cerr << "done\n";
+
+    size_t numRecords {0};
+    std::cerr << "reading numRecords\n";
+    ifile.read(reinterpret_cast<char *>(&numRecords), sizeof(numRecords));
+    std::cerr << "numRecords = " << numRecords << std::endl;
+    std::cerr << "numTranscripts = " << numTranscripts << std::endl;
+
+    std::cerr << "building transcript lookup table index" << std::endl;
+    ez::ezETAProgressBar pb(numRecords);
+    pb.start();
+    Offset offset = sizeof(numRecords);
+    for (auto recNum : boost::irange(size_t(0), numRecords)) {
+        size_t recordSize = 0;
+        TranscriptID tid = 0;
+        ifile.read(reinterpret_cast<char *>(&recordSize), sizeof(recordSize));
+        ifile.read(reinterpret_cast<char *>(&tid), sizeof(tid));
+        offsets[tid] = offset;
+        offset += recordSize + sizeof(recordSize);
+        ifile.seekg(offset);
+        ++pb;
+    }
+
+    ifile.close();
+    return offsets;
+}
+}
diff --git a/src/MultithreadedBAMParser.cpp b/src/MultithreadedBAMParser.cpp
new file mode 100644
index 0000000..8cdfcbe
--- /dev/null
+++ b/src/MultithreadedBAMParser.cpp
@@ -0,0 +1,95 @@
+/**
+>HEADER
+    Copyright (c) 2014 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#include "MultithreadedBAMParser.hpp"
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+#include <thread>
+#include <atomic>
+#include <iostream>
+
+#include <boost/filesystem.hpp>
+#include <boost/range/irange.hpp>
+#include "tbb/concurrent_queue.h"
+
+
+namespace bfs = boost::filesystem;
+using BamTools::BamReader;
+using BamTools::BamAlignment;
+
+MultithreadedBAMParser::MultithreadedBAMParser(std::vector<bfs::path>& files) : inputStreams_(files),
+        parsing_(false), parsingThread_(nullptr)
+    {
+        alnStructs_ = new BamTools::BamAlignment[queueCapacity_];
+        alnQueue_.set_capacity(queueCapacity_);
+        seqContainerQueue_.set_capacity(queueCapacity_);
+        for (size_t i = 0; i < queueCapacity_; ++i) {
+            seqContainerQueue_.push(&alnStructs_[i]);
+        }
+    }
+
+MultithreadedBAMParser::~MultithreadedBAMParser() {
+        parsingThread_->join();
+        delete [] alnStructs_;
+        delete parsingThread_;
+    }
+
+bool MultithreadedBAMParser::start() {
+        if (!parsing_) {
+            parsing_ = true;
+            parsingThread_ = new std::thread([this](){
+
+                BamTools::BamReader reader;
+
+                for (auto file : this->inputStreams_) {
+                    reader.Open(file.string());
+                    std::cerr << "reading from " << file.native() << "\n";
+
+                    BamAlignment* aln;
+                    this->seqContainerQueue_.pop(aln);
+                    while (reader.GetNextAlignment(*aln)) {
+                        this->alnQueue_.push(aln);
+                        this->seqContainerQueue_.pop(aln);
+                    }
+                    // close the file
+                    reader.Close();
+                }
+
+                this->parsing_ = false;
+            });
+            return true;
+        } else {
+            return false;
+        }
+
+    }
+
+bool MultithreadedBAMParser::nextAlignment(BamAlignment*& seq) {
+        while(parsing_ or !alnQueue_.empty()) {
+            if (alnQueue_.try_pop(seq)) { return true; }
+        }
+        return false;
+}
+
+void MultithreadedBAMParser::finishedWithAlignment(BamAlignment*& s) { seqContainerQueue_.push(s); }
diff --git a/src/PCA.cpp b/src/PCA.cpp
new file mode 100644
index 0000000..a7cbbd3
--- /dev/null
+++ b/src/PCA.cpp
@@ -0,0 +1,306 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#define NODEBUG
+#include <iostream>
+#ifdef DEBUG
+  #include <iterator>
+#endif
+#include <Eigen/SVD>
+#include "pca.h"
+
+using namespace std;
+using namespace Eigen;
+
+int Pca::Calculate(vector<float> &x,
+              const unsigned int &nrows,
+              const unsigned int &ncols,
+              const bool is_corr,
+              const bool is_center,
+              const bool is_scale) {
+  _ncols = ncols;
+  _nrows = nrows;
+  _is_corr = is_corr;
+  _is_center = is_center;
+  _is_scale = is_scale;
+  if (x.size()!= _nrows*_ncols) {
+    return -1;
+  }
+  if ((1 == _ncols) || (1 == nrows)) {
+    return -1;
+  }
+  // Convert vector to Eigen 2-dimensional matrix
+  //Map<MatrixXf> _xXf(x.data(), _nrows, _ncols);
+  _xXf.resize(_nrows, _ncols);
+  for (unsigned int i = 0; i < _nrows; ++i) {
+    for (unsigned int j = 0; j < _ncols; ++j) {
+      _xXf(i, j) = x[j + i*_ncols];
+    }
+  }
+  // Mean and standard deviation for each column
+  VectorXf mean_vector(_ncols);
+  mean_vector = _xXf.colwise().mean();
+  VectorXf sd_vector(_ncols);
+  unsigned int zero_sd_num = 0;
+  float denom = static_cast<float>((_nrows > 1)? _nrows - 1: 1);
+  for (unsigned int i = 0; i < _ncols; ++i) {
+     VectorXf curr_col  = VectorXf::Constant(_nrows, mean_vector(i)); // mean(x) for column x
+     curr_col = _xXf.col(i) - curr_col; // x - mean(x)
+     curr_col = curr_col.array().square(); // (x-mean(x))^2
+     sd_vector(i) = sqrt((curr_col.sum())/denom);
+     if (0 == sd_vector(i)) {
+      zero_sd_num++;
+     }
+  }
+  // If colums with sd == 0 are too many,
+  // don't continue calculations
+  if (1 > _ncols-zero_sd_num) {
+    return -1;
+  }
+  // Delete columns where sd == 0
+  MatrixXf tmp(_nrows, _ncols-zero_sd_num);
+  VectorXf tmp_mean_vector(_ncols-zero_sd_num);
+  unsigned int curr_col_num = 0;
+  for (unsigned int i = 0; i < _ncols; ++i) {
+    if (0 != sd_vector(i)) {
+      tmp.col(curr_col_num) = _xXf.col(i);
+      tmp_mean_vector(curr_col_num) = mean_vector(i);
+      curr_col_num++;
+    } else {
+      _eliminated_columns.push_back(i);
+    }
+  }
+  _ncols -= zero_sd_num;
+  _xXf = tmp;
+  mean_vector = tmp_mean_vector;
+  tmp.resize(0, 0); tmp_mean_vector.resize(0);
+  // Shift to zero
+  if (true == _is_center) {
+    for (unsigned int i = 0; i < _ncols; ++i) {
+      _xXf.col(i) -= VectorXf::Constant(_nrows, mean_vector(i));
+    }
+  }
+  // Scale to unit variance
+  if ( (false == _is_corr) || (true == _is_scale)) {
+    for (unsigned int i = 0; i < _ncols; ++i) {
+     _xXf.col(i) /= sqrt(_xXf.col(i).array().square().sum()/denom);
+    }
+  }
+  #ifdef DEBUG
+    cout << "\nScaled matrix:\n";
+    cout << _xXf << endl;
+    cout << "\nMean before scaling:\n" << mean_vector.transpose();
+    cout << "\nStandard deviation before scaling:\n" << sd_vector.transpose();
+  #endif
+  // When _nrows < _ncols then svd will be used.
+  // If corr is true and _nrows > _ncols then will be used correlation matrix
+  // (TODO): What about covariance?
+  if ( (_nrows < _ncols) || (false == _is_corr)) { // Singular Value Decomposition is on
+    _method = "svd";
+    JacobiSVD<MatrixXf> svd(_xXf, ComputeThinV);
+    VectorXf eigen_singular_values = svd.singularValues();
+    VectorXf tmp_vec = eigen_singular_values.array().square();
+    float tmp_sum = tmp_vec.sum();
+    tmp_vec /= tmp_sum;
+    // PC's standard deviation and
+    // PC's proportion of variance
+    _kaiser = 0;
+    unsigned int lim = (_nrows < _ncols)? _nrows : _ncols;
+    for (unsigned int i = 0; i < lim; ++i) {
+      _sd.push_back(eigen_singular_values(i)/sqrt(denom));
+      if (_sd[i] >= 1) {
+        _kaiser = i + 1;
+      }
+      _prop_of_var.push_back(tmp_vec(i));
+    }
+    #ifdef DEBUG
+      cout << "\n\nStandard deviations for PCs:\n";
+      copy(_sd.begin(), _sd.end(),std::ostream_iterator<float>(std::cout," "));
+      cout << "\n\nKaiser criterion: PC #" << _kaiser << endl;
+    #endif
+    tmp_vec.resize(0);
+    // PC's cumulative proportion
+    _thresh95 = 1;
+    _cum_prop.push_back(_prop_of_var[0]);
+    for (unsigned int i = 1; i < _prop_of_var.size(); ++i) {
+      _cum_prop.push_back(_cum_prop[i-1]+_prop_of_var[i]);
+      if (_cum_prop[i] < 0.95) {
+        _thresh95 = i+1;
+      }
+    }
+    #ifdef DEBUG
+      cout << "\nCumulative proportion:\n";
+      copy(_cum_prop.begin(), _cum_prop.end(),std::ostream_iterator<float>(std::cout," "));
+      cout << "\n\nThresh95 criterion: PC #" << _thresh95 << endl;
+    #endif
+    // Scores
+    MatrixXf eigen_scores = _xXf * svd.matrixV();
+    #ifdef DEBUG
+      cout << "\n\nRotated values (scores):\n" << eigen_scores;
+    #endif
+    _scores.reserve(lim*lim);
+    for (unsigned int i = 0; i < lim; ++i) {
+      for (unsigned int j = 0; j < lim; ++j) {
+        _scores.push_back(eigen_scores(i, j));
+      }
+    }
+    eigen_scores.resize(0, 0);
+    #ifdef DEBUG
+      cout << "\n\nScores in vector:\n";
+      copy(_scores.begin(), _scores.end(),std::ostream_iterator<float>(std::cout," "));
+      cout << "\n";
+    #endif
+  } else { // COR OR COV MATRICES ARE HERE
+    _method = "cor";
+    // Calculate covariance matrix
+    MatrixXf eigen_cov; // = MatrixXf::Zero(_ncols, _ncols);
+    VectorXf sds;
+    // (TODO) Should be weighted cov matrix, even if is_center == false
+    eigen_cov = (1.0 /(_nrows/*-1*/)) * _xXf.transpose() * _xXf;
+    sds = eigen_cov.diagonal().array().sqrt();
+    MatrixXf outer_sds = sds * sds.transpose();
+    eigen_cov = eigen_cov.array() / outer_sds.array();
+    outer_sds.resize(0, 0);
+    // ?If data matrix is scaled, covariance matrix is equal to correlation matrix
+    #ifdef DEBUG
+      cout << eigen_cov << endl;
+    #endif
+    EigenSolver<MatrixXf> edc(eigen_cov);
+    VectorXf eigen_eigenvalues = edc.eigenvalues().real();
+    #ifdef DEBUG
+      cout << endl << eigen_eigenvalues.transpose() << endl;
+    #endif
+    MatrixXf eigen_eigenvectors = edc.eigenvectors().real();
+    #ifdef DEBUG
+      cout << endl << eigen_eigenvectors << endl;
+    #endif
+    // The eigenvalues and eigenvectors are not sorted in any particular order.
+    // So, we should sort them
+    typedef pair<float, int> eigen_pair;
+    vector<eigen_pair> ep;
+    for (unsigned int i = 0 ; i < _ncols; ++i) {
+	    ep.push_back(make_pair(eigen_eigenvalues(i), i));
+    }
+    sort(ep.begin(), ep.end()); // Ascending order by default
+    // Sort them all in descending order
+    MatrixXf eigen_eigenvectors_sorted = MatrixXf::Zero(eigen_eigenvectors.rows(), eigen_eigenvectors.cols());
+    VectorXf eigen_eigenvalues_sorted = VectorXf::Zero(_ncols);
+    int colnum = 0;
+    int i = ep.size()-1;
+    for (; i > -1; i--) {
+      eigen_eigenvalues_sorted(colnum) = ep[i].first;
+      eigen_eigenvectors_sorted.col(colnum++) += eigen_eigenvectors.col(ep[i].second);
+    }
+    #ifdef DEBUG
+      cout << endl << eigen_eigenvalues_sorted.transpose() << endl;
+      cout << endl << eigen_eigenvectors_sorted << endl;
+    #endif
+    // We don't need not sorted arrays anymore
+    eigen_eigenvalues.resize(0);
+    eigen_eigenvectors.resize(0, 0);
+
+    _sd.clear(); _prop_of_var.clear(); _kaiser = 0;
+    float tmp_sum = eigen_eigenvalues_sorted.sum();
+    for (unsigned int i = 0; i < _ncols; ++i) {
+      _sd.push_back(sqrt(eigen_eigenvalues_sorted(i)));
+      if (_sd[i] >= 1) {
+        _kaiser = i + 1;
+      }
+      _prop_of_var.push_back(eigen_eigenvalues_sorted(i)/tmp_sum);
+    }
+    #ifdef DEBUG
+      cout << "\nStandard deviations for PCs:\n";
+      copy(_sd.begin(), _sd.end(), std::ostream_iterator<float>(std::cout," "));
+      cout << "\nProportion of variance:\n";
+      copy(_prop_of_var.begin(), _prop_of_var.end(), std::ostream_iterator<float>(std::cout," "));
+      cout << "\nKaiser criterion: PC #" << _kaiser << endl;
+    #endif
+    // PC's cumulative proportion
+    _cum_prop.clear(); _thresh95 = 1;
+    _cum_prop.push_back(_prop_of_var[0]);
+    for (unsigned int i = 1; i < _prop_of_var.size(); ++i) {
+      _cum_prop.push_back(_cum_prop[i-1]+_prop_of_var[i]);
+      if (_cum_prop[i] < 0.95) {
+        _thresh95 = i+1;
+      }
+    }
+    #ifdef DEBUG
+      cout << "\n\nCumulative proportions:\n";
+      copy(_cum_prop.begin(), _cum_prop.end(), std::ostream_iterator<float>(std::cout," "));
+      cout << "\n\n95% threshold: PC #" << _thresh95 << endl;
+    #endif
+    // Scores for PCA with correlation matrix
+    // Scale before calculating new values
+    for (unsigned int i = 0; i < _ncols; ++i) {
+     _xXf.col(i) /= sds(i);
+    }
+    sds.resize(0);
+    MatrixXf eigen_scores = _xXf * eigen_eigenvectors_sorted;
+    #ifdef DEBUG
+      cout << "\n\nRotated values (scores):\n" << eigen_scores;
+    #endif
+    _scores.clear();
+    _scores.reserve(_ncols*_nrows);
+    for (unsigned int i = 0; i < _nrows; ++i) {
+      for (unsigned int j = 0; j < _ncols; ++j) {
+        _scores.push_back(eigen_scores(i, j));
+      }
+    }
+    eigen_scores.resize(0, 0);
+    #ifdef DEBUG
+      cout << "\n\nScores in vector:\n";
+      copy(_scores.begin(), _scores.end(), std::ostream_iterator<float>(std::cout," "));
+      cout << "\n";
+    #endif
+  }
+
+  return 0;
+}
+std::vector<float> Pca::sd(void) { return _sd; };
+std::vector<float> Pca::prop_of_var(void) {return _prop_of_var; };
+std::vector<float> Pca::cum_prop(void) { return _cum_prop; };
+std::vector<float> Pca::scores(void) { return _scores; };
+std::vector<unsigned int> Pca::eliminated_columns(void) { return _eliminated_columns; }
+string Pca::method(void) { return _method; }
+unsigned int Pca::kaiser(void) { return _kaiser; };
+unsigned int Pca::thresh95(void) { return _thresh95; };
+unsigned int Pca::ncols(void) { return _ncols; }
+unsigned int Pca::nrows(void) { return _nrows; }
+bool Pca::is_scale(void) {  return _is_scale; }
+bool Pca::is_center(void) { return _is_center; }
+Pca::Pca(void) {
+  _nrows      = 0;
+  _ncols      = 0;
+  // Variables will be scaled by default
+  _is_center  = true;
+  _is_scale   = true;
+  // By default will be used singular value decomposition
+  _method   = "svd";
+  _is_corr  = false;
+
+  _kaiser   = 0;
+  _thresh95 = 1;
+}
+Pca::~Pca(void) {
+  _xXf.resize(0, 0);
+  _x.clear();
+}
diff --git a/src/PCAUtils.cpp b/src/PCAUtils.cpp
new file mode 100644
index 0000000..f091269
--- /dev/null
+++ b/src/PCAUtils.cpp
@@ -0,0 +1,133 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#include "pca.h"
+#include <stdexcept>
+#include <sstream>
+#include <numeric>
+
+namespace stats {
+namespace utils {
+
+arma::Mat<double> make_covariance_matrix(const arma::Mat<double>& data) {
+	return std::move( (data.t()*data) * (1./(data.n_rows-1)) );
+}
+
+arma::Mat<double> make_shuffled_matrix(const arma::Mat<double>& data) {
+	const long n_rows = data.n_rows;
+	const long n_cols = data.n_cols;
+	arma::Mat<double> shuffle(n_rows, n_cols);
+	for (long j=0; j<n_cols; ++j) {
+	    for (long i=0; i<n_rows; ++i) {
+	    	shuffle(i, j) = data(std::rand()%n_rows, j);
+	    }
+    }
+    return std::move(shuffle);
+}
+
+arma::Col<double> compute_column_means(const arma::Mat<double>& data) {
+	const long n_cols = data.n_cols;
+	arma::Col<double> means(n_cols);
+	for (long i=0; i<n_cols; ++i)
+    	means(i) = arma::mean(data.col(i));
+	return std::move(means);
+}
+
+void remove_column_means(arma::Mat<double>& data, const arma::Col<double>& means) {
+	if (data.n_cols != means.n_elem)
+		throw std::range_error("Number of elements of means is not equal to the number of columns of data");
+    for (long i=0; i<long(data.n_cols); ++i)
+    	data.col(i) -= means(i);
+}
+
+arma::Col<double> compute_column_rms(const arma::Mat<double>& data) {
+	const long n_cols = data.n_cols;
+	arma::Col<double> rms(n_cols);
+    for (long i=0; i<n_cols; ++i) {
+        const double dot = arma::dot(data.col(i), data.col(i));
+        rms(i) = std::sqrt(dot / (data.col(i).n_rows-1));
+    }
+	return std::move(rms);
+}
+
+void normalize_by_column(arma::Mat<double>& data, const arma::Col<double>& rms) {
+	if (data.n_cols != rms.n_elem)
+		throw std::range_error("Number of elements of rms is not equal to the number of columns of data");
+	for (long i=0; i<long(data.n_cols); ++i) {
+        if (rms(i)==0)
+        	throw std::runtime_error("At least one of the entries of rms equals to zero");
+        data.col(i) *= 1./rms(i);
+    }
+}
+
+void enforce_positive_sign_by_column(arma::Mat<double>& data) {
+	for (long i=0; i<long(data.n_cols); ++i) {
+		const double max = arma::max(data.col(i));
+		const double min = arma::min(data.col(i));
+		bool change_sign = false;
+		if (std::abs(max)>=std::abs(min)) {
+			if (max<0) change_sign = true;
+		} else {
+			if (min<0) change_sign = true;
+		}
+		if (change_sign) data.col(i) *= -1;
+	}
+}
+
+std::vector<double> extract_column_vector(const arma::Mat<double>& data, long index) {
+	if (index<0 || index >= long(data.n_cols))
+		throw std::range_error(join("Index out of range: ", index));
+	const long n_rows = data.n_rows;
+	const double* memptr = data.colptr(index);
+	std::vector<double> result(memptr, memptr + n_rows);
+	return std::move(result);
+}
+
+std::vector<double> extract_row_vector(const arma::Mat<double>& data, long index) {
+	if (index<0 || index >= long(data.n_rows))
+		throw std::range_error(join("Index out of range: ", index));
+	const arma::Row<double> row(data.row(index));
+	const double* memptr = row.memptr();
+	std::vector<double> result(memptr, memptr + row.n_elem);
+	return std::move(result);
+}
+
+void assert_file_good(const bool& is_file_good, const std::string& filename) {
+	if (!is_file_good)
+		throw std::ios_base::failure(join("Cannot open file: ", filename));
+}
+
+double get_mean(const std::vector<double>& iter) {
+	const double init = 0;
+	return std::accumulate(iter.begin(), iter.end(), init) / iter.size();
+}
+
+double get_sigma(const std::vector<double>& iter) {
+	const double mean = get_mean(iter);
+	double sum = 0;
+	for (auto v=iter.begin(); v!=iter.end(); ++v)
+		sum += std::pow(*v - mean, 2.);
+	return std::sqrt(sum/(iter.size()-1));
+}
+
+} //utils
+} //stats
diff --git a/src/PerformBiasCorrection.cpp b/src/PerformBiasCorrection.cpp
new file mode 100644
index 0000000..f4c9e63
--- /dev/null
+++ b/src/PerformBiasCorrection.cpp
@@ -0,0 +1,349 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#include <iostream>
+#include <fstream>
+#include <istream>
+#include <vector>
+#include <array>
+#include <unordered_map>
+#include <limits>
+#include <cmath>
+#include <cstdint>
+#include <numeric>
+
+#include <boost/filesystem.hpp>
+#include <boost/range/irange.hpp>
+#include <boost/multiprecision/cpp_dec_float.hpp>
+
+#include "Eigen/Dense"
+#include "PCA.hpp"
+
+#include "tensemble/TypeDef.h"
+#include "tensemble/RandomForestRegressor.h"
+#include "tensemble/RandomForestClassifier.h"
+#include "tensemble/GBMRegressor.h"
+#include "tensemble/GBMClassifier.h"
+#include "tensemble/ReadData.h"
+
+#include "CommonTypes.hpp"
+
+#define DEFAULT_N_TREES 100
+#define DEFAULT_N_JOBS 1
+#define DEFAULT_MAX_FEATURES_RATIO 1.0
+#define DEFAULT_MIN_SAMPLE_LEAF 5
+#define DEFAULT_MAX_DEPTH   4
+#define DEFAULT_SUBSAMPLE   1.0
+#define DEFAULT_SPLIT_CRITERION CRITERION_MSE
+#define DEFAULT_LOSS SQUARE_LOSS
+#define DEFAULT_LEARN_RATE 0.1
+#define DEFAULT_OOB 1
+#define DEFAULT_VERBOSE 0
+#define DEFAULT_BOOTSTRAP 1
+#define DEFAULT_COMPUTE_IMPORTANCE 0
+
+namespace bfs = boost::filesystem;
+using Kmer = ::uint64_t;
+using Sailfish::TranscriptFeatures;
+using mpdec = boost::multiprecision::cpp_dec_float_100;
+
+TranscriptFeatures parseFeature(std::ifstream& ifs) {
+        TranscriptFeatures tf{};
+        ifs >> tf.name;
+        ifs >> tf.length;
+        ifs >> tf.gcContent;
+        for (auto i : boost::irange(size_t{0}, tf.diNucleotides.size())) {
+                ifs >> tf.diNucleotides[i];
+        }
+        // eat the newline
+        char junk;
+        ifs.get(junk);
+        return tf;
+}
+
+std::vector<TranscriptFeatures> parseFeatureFile(const bfs::path& featureFile) {
+        std::ifstream ifile(featureFile.string());
+        std::vector<TranscriptFeatures> feats;
+        while (!ifile.eof()) {
+                feats.emplace_back( parseFeature(ifile) );
+                if (ifile.peek() == EOF) { break; }
+        }
+        ifile.close();
+        return feats;
+}
+
+struct TranscriptResult{
+        size_t length;
+        double tpm;
+        double rpkm;
+        double kpkm;
+        double approxKmerCount;
+        double approxCount;
+};
+
+struct ExpressionResults {
+        std::vector<std::string> comments;
+        std::unordered_map<std::string, TranscriptResult> expressions;
+};
+
+ExpressionResults parseSalmonFile(const bfs::path& expFile, double& numMappedReads) {
+        numMappedReads = 0.0;
+
+        std::ifstream ifile(expFile.string());
+        ExpressionResults res;
+        while(!ifile.eof()) {
+
+                if (ifile.peek() == '#') {
+                        std::string comment;
+                        std::getline(ifile, comment);
+                        res.comments.emplace_back(comment);
+                } else {
+                        std::string tname;
+                        TranscriptResult tr;
+                        ifile >> tname;
+                        ifile >> tr.length;
+                        ifile >> tr.tpm;
+                        ifile >> tr.approxCount;
+                        numMappedReads += tr.approxCount;
+                        res.expressions[tname] = tr;
+                        // eat the newline
+                        char nline; ifile.get(nline);
+                }
+
+                if (ifile.peek() == EOF) { break; }
+        }
+
+        return res;
+}
+
+int performBiasCorrectionSalmon(
+        bfs::path featureFile,
+        bfs::path expressionFile,
+        bfs::path outputFile,
+        size_t numThreads) {
+
+        auto features = parseFeatureFile(featureFile);
+        std::cerr << "parsed " << features.size() << " features\n";
+
+        double numMappedReads = 0.0;
+        auto salmonRes = parseSalmonFile(expressionFile, numMappedReads);
+        std::cerr << "parsed " << salmonRes.expressions.size() << " expression values\n";
+
+        auto numFeatureVectors = features.size();
+        auto numSamples = salmonRes.expressions.size();
+
+        bool skipBiasCorrection{false};
+
+        if (numFeatureVectors != numSamples) {
+            std::cerr << "The size of the feature map didn't match the "
+                      << "number of transcripts.  Bias correction will not "
+                      << "be performed\n";
+            skipBiasCorrection = true;
+        }
+
+        uint32_t minSamples{1000};
+        if (numSamples <= minSamples) {
+            std::cerr << "There are an insufficient number of transcripts "
+                      << "for post-hoc bias correction.  It will not be performed\n";
+            skipBiasCorrection = true;
+        }
+
+        if (skipBiasCorrection) {
+            std::ofstream ofile(outputFile.string());
+            for (auto& c : salmonRes.comments) {
+                ofile << c << "\n";
+            }
+            for (auto& kv : salmonRes.expressions) {
+                auto& name = kv.first;
+                auto& expRecord = kv.second;
+                ofile << name << '\t'
+                    << expRecord.length << '\t'
+                    << expRecord.tpm << '\t'
+                    << expRecord.approxCount << '\n';
+            }
+            ofile.close();
+            return 0;
+        }
+
+
+        std::vector<size_t> retainedRows;
+        std::vector<double> retainedTPMs;
+        std::vector<std::string> retainedNames;
+
+        double minLTPM, maxLTPM;
+        minLTPM = std::numeric_limits<double>::max();
+        maxLTPM = -minLTPM;
+
+        for (auto i : boost::irange(size_t{0}, features.size())) {
+                auto& tname = features[i].name;
+                auto tpm = salmonRes.expressions[tname].tpm; //
+                double v;
+
+                if ( tpm >= 1.0 ) {
+                        retainedRows.emplace_back(i);
+                        retainedNames.push_back(tname);
+                        v = std::log(tpm);
+                        retainedTPMs.push_back(v);
+                        minLTPM = std::min(minLTPM, v);
+                        maxLTPM = std::max(maxLTPM, v);
+                }
+        }
+
+
+        std::vector<float> pcavec;
+        std::vector<double> featData;
+        Eigen::MatrixXd featMat(retainedRows.size(), 17);
+        size_t fnum = 0;
+        size_t linearIndex{0};
+        for (auto r : retainedRows) {
+                auto& f = features[r];
+                pcavec.push_back(f.gcContent);
+                featMat(fnum, 0) = f.gcContent;
+                for (auto i : boost::irange(size_t{0}, f.diNucleotides.size())) {
+                        featMat(fnum, i) = f.diNucleotides[i];
+                        pcavec.push_back(f.diNucleotides[i]);
+                }
+                ++fnum;
+        }
+
+        PCA pca(featMat);
+
+        std::cerr << "Performing PCA decomposition\n";
+        pca.performDecomposition();
+
+        auto encodedXSub = pca.projectedData(0.95, true);
+
+        Data train;
+        size_t numCols = encodedXSub.cols();
+        train.set_size(retainedRows.size(), numCols+1);
+
+        size_t c = 0;
+        for (auto r : retainedRows) {
+                train.X[c][0] = std::log(static_cast<double>(features[r].length));
+
+                for (auto j : boost::irange(size_t{1}, numCols+1)) {
+                        train.X[c][j] = encodedXSub(c, j-1);
+                }
+                train.y[c] = retainedTPMs[c];
+                ++c;
+        }
+
+        /** Random Forest Regression **/
+        size_t minDepth = 5;
+        auto reg = std::unique_ptr<RandomForestRegressor>(new RandomForestRegressor(
+                500,
+                train.n_features,
+                5, // max tree depth
+                1, // min_samples_leaf
+                1.0, // features ratio
+                true, // bootstrap
+                true, //out-of-bag
+                true, // compute importance
+                0, // random seed
+                numThreads, // num jobs
+                true // verbose
+        ));
+
+        std::cerr << "there are " << train.n_samples << " samples\n";
+        std::cerr << "there are " << train.n_features << " features\n";
+        reg->build(train.X, train.y, train.n_samples);
+
+        std::vector<REAL> pred(train.n_samples, 0.0);
+        reg->predict(train.X, &pred[0], train.n_samples, train.n_features);
+
+        REAL trn_rmse=rmse(&pred[0], train.y, train.n_samples);
+        REAL trn_r2=R2(&pred[0], train.y, train.n_samples);
+        std::cerr << "Train RMSE=" << trn_rmse << ", Correlation Coefficient=" << trn_r2 << "\n";
+
+        double grandMean = 0.0;
+        size_t ntrain = train.n_samples;
+        for (auto i : boost::irange(size_t{0}, ntrain)) {
+                grandMean += retainedTPMs[i];
+        }
+        grandMean /= train.n_samples;
+
+        for (auto i : boost::irange(size_t{0}, ntrain)) {
+                pred[i] = grandMean + (retainedTPMs[i] - pred[i]);
+        }
+
+        trn_rmse=rmse(&pred[0], train.y, train.n_samples);
+        trn_r2=R2(&pred[0], train.y, train.n_samples);
+        std::cerr << "Train RMSE=" << trn_rmse << ", Correlation Coefficient=" << trn_r2 << "\n";
+
+        std::ofstream ofile(outputFile.string());
+        for (auto& c : salmonRes.comments) {
+                ofile << c << "\n";
+        }
+
+        size_t retainedCnt = 0;
+        vector<mpdec> tpms(features.size());
+        double tpmSum{0.0};
+        for (auto i : boost::irange(size_t{0}, size_t{features.size()})) {
+          auto& name = features[i].name;
+          auto& r = salmonRes.expressions[name];
+          if (i == retainedRows[retainedCnt]) {
+            double v = std::exp(pred[retainedCnt]);
+            tpms[i] = v;
+            tpmSum += v;
+            ++retainedCnt;
+          } else {
+              tpms[i] = r.tpm;
+              tpmSum += r.tpm;
+          }
+        }
+
+        double tpmNorm = 1000000.0 / tpmSum;
+        for (auto i : boost::irange(size_t{0}, size_t{features.size()})) {
+            tpms[i] *= tpmNorm;
+        }
+
+        vector<mpdec> estNumReads(features.size());
+
+        // use TPM estimates to computed estimated read counts
+        mpdec mpzero = 0;
+        mpdec totalNucDenom = 0;
+        for (auto i : boost::irange(size_t{0},  size_t{features.size()})) {
+            double len = features[i].length;
+            totalNucDenom += tpms[i] * len;
+        }
+
+        for (auto i : boost::irange(size_t{0},  size_t{features.size()})) {
+            mpdec len = features[i].length;
+            estNumReads[i] += ((tpms[i] * len) / totalNucDenom) * numMappedReads;
+        }
+
+        for (auto i : boost::irange(size_t{0}, size_t{features.size()})) {
+          auto& name = features[i].name;
+          auto& r = salmonRes.expressions[name];
+          auto length = r.length;
+          ofile << name << '\t'
+                << r.length << '\t'
+                << tpms[i] << '\t'
+                << estNumReads[i] << '\n';
+        }
+        std::cerr << "retainedCnt = " << retainedCnt << ", nsamps = " << train.n_samples << "\n";
+
+        ofile.close();
+    return 0;
+}
+
+
diff --git a/src/PerformBiasCorrection_old.cpp b/src/PerformBiasCorrection_old.cpp
new file mode 100644
index 0000000..f5d7696
--- /dev/null
+++ b/src/PerformBiasCorrection_old.cpp
@@ -0,0 +1,420 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#include <iostream>
+#include <fstream>
+#include <istream>
+#include <vector>
+#include <array>
+#include <unordered_map>
+#include <limits>
+
+#include <boost/filesystem.hpp>
+#include <boost/range/irange.hpp>
+
+#include "shark/Data/Dataset.h"
+#include "shark/Algorithms/Trainers/PCA.h"
+#include "shark/Algorithms/Trainers/RFTrainer.h"
+#include "shark/Algorithms/Trainers/CARTTrainer.h"
+
+#include "tensemble/TypeDef.h"
+#include "tensemble/RandomForestRegressor.h"
+#include "tensemble/RandomForestClassifier.h"
+#include "tensemble/GBMRegressor.h"
+#include "tensemble/GBMClassifier.h"
+#include "tensemble/ReadData.h"
+
+#define DEFAULT_N_TREES 100
+#define DEFAULT_N_JOBS 1
+#define DEFAULT_MAX_FEATURES_RATIO 1.0
+#define DEFAULT_MIN_SAMPLE_LEAF 10
+#define DEFAULT_MAX_DEPTH   4
+#define DEFAULT_SUBSAMPLE   1.0
+#define DEFAULT_SPLIT_CRITERION CRITERION_MSE
+#define DEFAULT_LOSS SQUARE_LOSS
+#define DEFAULT_LEARN_RATE 0.1
+#define DEFAULT_OOB 1
+#define DEFAULT_VERBOSE 0
+#define DEFAULT_BOOTSTRAP 1
+#define DEFAULT_COMPUTE_IMPORTANCE 0
+
+namespace bfs = boost::filesystem;
+using Kmer = uint64_t;
+
+struct TranscriptFeatures{
+	std::string name;
+	size_t length;
+	double gcContent;
+	std::array<Kmer, 16> diNucleotides;
+};
+
+TranscriptFeatures parseFeature(std::ifstream& ifs) {
+	TranscriptFeatures tf{};
+	ifs >> tf.name;
+	ifs >> tf.length;
+	ifs >> tf.gcContent;
+	for (auto i : boost::irange(size_t{0}, tf.diNucleotides.size())) {
+		ifs >> tf.diNucleotides[i];
+	}
+	// eat the newline
+	char junk;
+	ifs.get(junk);
+	return tf;
+}
+
+std::vector<TranscriptFeatures> parseFeatureFile(const bfs::path& featureFile) {
+	std::ifstream ifile(featureFile.string());
+	std::vector<TranscriptFeatures> feats;
+	while (!ifile.eof()) {
+		feats.emplace_back( parseFeature(ifile) );
+		if (ifile.peek() == EOF) { break; }
+	}
+	ifile.close();
+	return feats;
+}
+
+struct TranscriptResult{
+	size_t length;
+	double tpm;
+	double rpkm;
+};
+
+struct ExpressionResults {
+	std::vector<std::string> comments;
+	std::unordered_map<std::string, TranscriptResult> expressions;
+};
+
+ExpressionResults parseSailfishFile(const bfs::path& expFile) {
+	std::ifstream ifile(expFile.string());
+	ExpressionResults res;
+	while(!ifile.eof()) {
+
+		if (ifile.peek() == '#') {
+			std::string comment;
+			std::getline(ifile, comment);
+			res.comments.emplace_back(comment);
+		} else {
+			std::string tname;
+			TranscriptResult tr;
+			ifile >> tname;
+			ifile >> tr.length;
+			ifile >> tr.tpm;
+			ifile >> tr.rpkm;
+			res.expressions[tname] = tr;
+			// eat the newline
+			char nline; ifile.get(nline);
+		}
+
+		if (ifile.peek() == EOF) { break; }
+	}
+
+	return res;
+}
+
+int main(int argc, char* argv[]) {
+
+	using shark::PCA;
+	//PCA pca(data)
+	bfs::path featureFile = bfs::path(argv[1]);
+	auto features = parseFeatureFile(featureFile);
+	std::cerr << "parsed " << features.size() << " features\n";
+
+	bfs::path expressionFile = bfs::path(argv[2]);
+	auto sfres = parseSailfishFile(expressionFile);
+	std::cerr << "parsed " << sfres.expressions.size() << " expression values\n";
+
+	std::vector<shark::RealVector> RPKMLabels;//(features.size(), 1);
+	std::vector<size_t> retainedRows;
+	std::vector<double> retainedRPKMs;
+	std::vector<std::string> retainedNames;
+
+	double minLRPKM, maxLRPKM;
+	minLRPKM = std::numeric_limits<double>::max();
+	maxLRPKM = -minLRPKM;
+	double avgLRPKM = 0.0;
+
+	for (auto i : boost::irange(size_t{0}, features.size())) {
+		auto& tname = features[i].name;
+		auto rpkm = sfres.expressions[tname].rpkm;
+		shark::RealVector v(1);
+
+		if ( rpkm >= 0.001 ) {
+			retainedRows.emplace_back(i);
+			retainedNames.push_back(tname);
+			v(0) = std::log(rpkm);
+			retainedRPKMs.push_back(v(0));
+
+			if (std::fabs(rpkm - std::exp(v(0))) > 1e-2) {
+				std::cerr << "EXP DOES NOT INVERT LOG!!\n";
+			}
+			minLRPKM = std::min(minLRPKM, v(0));
+			maxLRPKM = std::max(maxLRPKM, v(0));
+			avgLRPKM += v(0);
+			RPKMLabels.emplace_back(v);
+		}
+	}
+	avgLRPKM /= retainedRows.size();
+
+
+	std::vector<shark::RealVector> featMat;
+	size_t fnum = 0;
+	for (auto r : retainedRows) {
+		auto& f = features[r];
+		shark::RealVector v(17);
+		v(0) = f.gcContent;
+		for (auto i : boost::irange(size_t{0}, f.diNucleotides.size())) { v(i+1) = f.diNucleotides[i]; }
+		featMat.emplace_back(v);
+		++fnum;
+	}
+
+	shark::UnlabeledData<shark::RealVector> Xsub = shark::createDataFromRange(featMat);
+	PCA pca(Xsub, true);
+	auto evals = pca.eigenvalues();
+	double totalVariance = 0.0;
+	for ( auto e : evals ) { totalVariance += e; }
+	double varCutoff = 0.95;
+	double varSum = 0.0;
+	size_t dimCutoff = 0;
+	size_t currentDim = 0;
+	for ( auto e : evals ) {
+		++currentDim;
+		varSum += e;
+		if (varSum / totalVariance >= varCutoff) {
+			dimCutoff = currentDim;
+			break;
+		}
+		//std::cerr << "ev: " << e <<  "\n";
+	}
+	std::cerr << varCutoff * 100.0 << "% of the variance is explained by " << dimCutoff << " dimensions\n";
+
+	shark::LinearModel<> enc;
+	pca.encoder(enc, dimCutoff);
+	auto encodedXsub = enc(Xsub);
+
+	std::vector<shark::RealVector> X;
+	std::vector<shark::RealVector> labels;
+
+	Data train;
+	Data train2;
+	train.set_size(retainedRows.size(), dimCutoff+1);
+	train2.set_size(retainedRows.size(), dimCutoff+1);
+
+	size_t c = 0;
+	for (auto r : retainedRows) {
+		shark::RealVector v(dimCutoff + 1);
+		shark::RealVector l(1);
+
+		v(0) = std::log(static_cast<double>(features[r].length));
+		train.X[c][0] = std::log(static_cast<double>(features[r].length));
+		train2.X[c][0] = std::log(static_cast<double>(features[r].length));
+		for (auto j : boost::irange(size_t{1}, dimCutoff+1)) {
+			train.X[c][j] = encodedXsub.element(c)(j-1);
+			train2.X[c][j] = encodedXsub.element(c)(j-1);
+			v(j) = encodedXsub.element(c)(j-1);
+			//std::cerr << "Train [" << c <<"][" << j	 << "] = " << train.X[c][j] << "\n";
+		}
+		train.y[c] = retainedRPKMs[c];// std::log(sfres[features[r].name].expressions.rpkm);
+		l(0) = retainedRPKMs[c];
+
+		X.emplace_back(v);
+		labels.emplace_back(l);
+
+		++c;
+	}
+
+	size_t numRetainedSamples = retainedRows.size();
+
+// #define DEFAULT_N_TREES 100
+// #define DEFAULT_N_JOBS 1
+// #define DEFAULT_MAX_FEATURES_RATIO 1.0
+// #define DEFAULT_MIN_SAMPLE_LEAF 2
+// #define DEFAULT_MAX_DEPTH   4
+// #define DEFAULT_SUBSAMPLE   1.0
+// #define DEFAULT_SPLIT_CRITERION CRITERION_MSE
+// #define DEFAULT_LOSS SQUARE_LOSS
+// #define DEFAULT_LEARN_RATE 0.1
+// #define DEFAULT_OOB 1
+// #define DEFAULT_VERBOSE 0
+// #define DEFAULT_BOOTSTRAP 1
+// #define DEFAULT_COMPUTE_IMPORTANCE 0
+
+	/** Random Forest Regression **/
+	/*
+	auto reg = std::unique_ptr<RandomForestRegressor>(new RandomForestRegressor(
+		1000,
+		train.n_features,
+		DEFAULT_MAX_DEPTH,
+		DEFAULT_MIN_SAMPLE_LEAF,
+		DEFAULT_MAX_FEATURES_RATIO,
+		true, // bootstrap
+		true, //out-of-bag
+		false,
+		0,
+		16,
+		true
+	));
+	*/
+
+	/*
+	auto reg = std::unique_ptr<GBMRegressor>(new GBMRegressor(
+		SQUARE_LOSS,
+		500,
+		train.n_features,
+		DEFAULT_MAX_DEPTH,
+		DEFAULT_MIN_SAMPLE_LEAF,
+		DEFAULT_MAX_FEATURES_RATIO,
+		DEFAULT_SUBSAMPLE, // subsample
+		0.25, //learn rate
+		true, //out-of-bag
+		false, // compute imporance
+		34239, // random seed
+		30, // num jobs
+		true // verbose
+	));*/
+
+	/*
+	std::cerr << "there are " << numRetainedSamples << " samples\n";
+	std::cerr << "there are " << train.n_features << " features\n";
+	reg->build(train.X, train.y, numRetainedSamples);
+
+	REAL* pred = new REAL[numRetainedSamples];
+	for (auto i : boost::irange(size_t{0}, size_t{numRetainedSamples})) { pred[i] = 0.0; }
+	reg->predict(train2.X, pred, numRetainedSamples, train.n_features);
+
+ REAL trn_rmse=rmse(pred, train.y, numRetainedSamples);
+  REAL trn_r2=R2(pred, train.y, numRetainedSamples);
+  std::cerr << "Train RMSE=" << trn_rmse << ", Correlation Coefficient=" << trn_r2 << "\n";
+	*/
+
+
+	shark::Data<shark::RealVector> input = shark::createDataFromRange(X);
+	shark::Data<shark::RealVector> regLabels = shark::createDataFromRange(labels);
+
+	shark::LabeledData<shark::RealVector, shark::RealVector> regressionData(input, regLabels);
+
+	shark::RFTrainer trainer;
+	shark::RFClassifier model;
+
+	trainer.setNodeSize(20);
+	trainer.setNTrees(500);
+	trainer.setOOBratio(0.001);
+
+	//std::cerr << "number of classes = " << shark::numberOfClasses(regressionSubset) << "\n";
+	//std::cerr << "number of classes = " << shark::numberOfClasses(regressionSubset) << "\n";
+	trainer.train(model, regressionData);
+
+	auto predictionData = model(regressionData.inputs());
+	std::cerr << "inputs\n";
+	std::vector<double> pred(retainedRows.size(), 0.0);
+	//std::cerr << regressionSubset << "\n";
+	std::cerr << predictionData << "\n";
+	size_t ctr = 0;
+	std::cerr << "NUM ELEMENTS: " << predictionData.numberOfElements() << "\n";
+	for (auto i : boost::irange(size_t{0}, predictionData.numberOfElements())) {
+		pred[i] = predictionData.element(i)(0);
+	}
+	std::cerr << "cha!\n";
+
+	//auto& inputLabels = regressionSubset.labels();
+	/*
+	auto mm = std::minmax_element(train.y, train.y + numRetainedSamples);
+	double minLRPKM = std::get<0>(mm);
+	double maxLRPKM = std::get<1>(mm);
+	*/
+
+	for (auto i : boost::irange(size_t{0}, size_t{numRetainedSamples})) {
+		pred[i] = RPKMLabels[i](0) - pred[i];
+	}
+
+	auto mmpred = std::minmax_element(pred.begin(), pred.begin() + numRetainedSamples);
+	double minPred = *(mmpred.first);
+	double maxPred = *(mmpred.second);
+
+	double scale = std::fabs(maxLRPKM - minLRPKM) / std::fabs(maxPred - minPred);
+
+	std::cerr << "min,max LRPKM : " << minLRPKM << ", " << maxLRPKM << "\n";
+	std::cerr << "min, max pred : " << minPred << ", " << maxPred  << "\n";
+	std::cerr << "SCALE: " << scale << "\n";
+
+
+	minPred = std::numeric_limits<double>::max();
+	for (auto i : boost::irange(size_t{0}, size_t{numRetainedSamples})) {
+		pred[i] *= scale;
+		minPred = std::min(minPred, pred[i]);
+	}
+
+	double shift{minLRPKM - minPred};
+	minPred = std::numeric_limits<double>::max();
+	maxPred = -minPred;
+	for (auto i : boost::irange(size_t{0}, size_t{numRetainedSamples})) {
+		pred[i] += shift;
+		minPred = std::min(minPred, pred[i]);
+		maxPred = std::max(maxPred, pred[i]);
+	}
+
+	/*
+   trn_rmse=rmse(pred, train.y, numRetainedSamples);
+   trn_r2=R2(pred, train.y, numRetainedSamples);
+  std::cerr << "Train RMSE=" << trn_rmse << ", Correlation Coefficient=" << trn_r2 << "\n";
+	*/
+
+	std::ofstream ofile(argv[3]);
+	for (auto& c : sfres.comments) {
+		ofile << c << "\n";
+	}
+
+	size_t retainedCnt = 0;
+	for (auto i : boost::irange(size_t{0}, size_t{features.size()})) {
+
+		double rpkm = 0.0;
+		auto& name = features[i].name;
+		auto& r = sfres.expressions[name];
+
+		if (i == retainedRows[retainedCnt]) {
+			//rpkm = std::exp(pred[retainedCnt]);
+			rpkm = std::exp(pred[retainedCnt]);
+			++retainedCnt;
+		} else {
+			rpkm = r.rpkm;
+		}
+		/*
+		if (i == retainedRows[retainedCnt]) {
+			if (retainedNames[retainedCnt] != features[i].name) {
+				std::cerr << "AHHH!!!\n";
+			}
+
+			rpkm = std::exp(RPKMLabels[retainedCnt](0));// pred[retainedCnt]);
+
+			++retainedCnt;
+		} else {
+			rpkm = sfres.expressions[features[i].name].rpkm;
+		}
+		*/
+		//std::cerr << "Feature = " << features[i].name << ", rpkm = " << rpkm << "\n";
+
+		ofile << name << '\t' << r.length << '\t' << r.tpm << '\t' << rpkm << '\n';
+	}
+	std::cerr << "retainedCnt = " << retainedCnt << ", nsamps = " << numRetainedSamples << "\n";
+	//for ( auto i : retainedRows ) { std::cerr << i << " "; }
+	ofile.close();
+
+}
diff --git a/src/QSufSort.c b/src/QSufSort.c
new file mode 100644
index 0000000..36c5a51
--- /dev/null
+++ b/src/QSufSort.c
@@ -0,0 +1,402 @@
+/* QSufSort.c
+
+   Original source from qsufsort.c
+
+   Copyright 1999, N. Jesper Larsson, all rights reserved.
+
+   This file contains an implementation of the algorithm presented in "Faster
+   Suffix Sorting" by N. Jesper Larsson (jesper at cs.lth.se) and Kunihiko
+   Sadakane (sada at is.s.u-tokyo.ac.jp).
+
+   This software may be used freely for any purpose. However, when distributed,
+   the original source must be clearly stated, and, when the source code is
+   distributed, the copyright notice must be retained and any alterations in
+   the code must be clearly marked. No warranty is given regarding the quality
+   of this software.
+
+   Modified by Wong Chi-Kwong, 2004
+
+   Changes summary:	- Used long variable and function names
+					- Removed global variables
+					- Replace pointer references with array references
+					- Used insertion sort in place of selection sort and increased insertion sort threshold
+					- Reconstructing suffix array from inverse becomes an option
+					- Add handling where end-of-text symbol is not necessary < all characters
+					- Removed codes for supporting alphabet size > number of characters
+  
+  No warrenty is given regarding the quality of the modifications.
+
+*/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "QSufSort.h"
+
+#define min(value1, value2)						( ((value1) < (value2)) ? (value1) : (value2) )
+#define med3(a, b, c)							( a<b ? (b<c ? b : a<c ? c : a) : (b>c ? b : a>c ? c : a))
+#define swap(a, b, t);							t = a; a = b; b = t;
+
+// Static functions
+static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, 
+							  const qsint_t highestPos, const qsint_t numSortedChar);
+static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, 
+							   const qsint_t highestPos, const qsint_t numSortedChar);
+static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, 
+									const qsint_t highestPos, const qsint_t numSortedChar);
+static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize);
+static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, 
+							 const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated);
+
+/* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
+   n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
+   contents of x[n] is disregarded, the n-th symbol being regarded as
+   end-of-string smaller than all other symbols.*/
+void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, 
+						const qsint_t smallestInputSymbol, const int skipTransform)
+{
+	qsint_t i, j;
+	qsint_t s, negatedSortedGroupLength;
+	qsint_t numSymbolAggregated;
+	qsint_t numSortedPos = 1;
+	qsint_t newAlphabetSize;
+   
+	if (!skipTransform) {
+		/* bucketing possible*/
+		newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol, 
+											numChar, &numSymbolAggregated);
+		QSufSortBucketSort(V, I, numChar, newAlphabetSize);
+		I[0] = -1;
+		V[numChar] = 0;
+		numSortedPos = numSymbolAggregated;
+	}
+
+	while ((qsint_t)(I[0]) >= -(qsint_t)numChar) {
+		i = 0;
+		negatedSortedGroupLength = 0;
+		do {
+			s = I[i];
+			if (s < 0) {
+				i -= s;						/* skip over sorted group.*/
+				negatedSortedGroupLength += s;
+			} else {
+				if (negatedSortedGroupLength) {
+					I[i+negatedSortedGroupLength] = negatedSortedGroupLength;	/* combine preceding sorted groups */
+					negatedSortedGroupLength = 0;
+				}
+				j = V[s] + 1;
+				QSufSortSortSplit(V, I, i, j - 1, numSortedPos);
+				i = j;
+			}
+		} while (i <= numChar);
+		if (negatedSortedGroupLength) {
+			/* array ends with a sorted group.*/
+			I[i+negatedSortedGroupLength] = negatedSortedGroupLength;	/* combine sorted groups at end of I.*/
+		}
+		numSortedPos *= 2;	/* double sorted-depth.*/
+	}
+}
+
+void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar)
+{
+	qsint_t i;
+	for (i=0; i<=numChar; i++)
+		I[V[i]] = i + 1;
+}
+
+/* Sorting routine called for each unsorted group. Sorts the array of integers
+   (suffix numbers) of length n starting at p. The algorithm is a ternary-split
+   quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
+   Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
+   function is based on Program 7.*/
+static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, 
+							  const qsint_t highestPos, const qsint_t numSortedChar) {
+
+	qsint_t a, b, c, d;
+	qsint_t l, m;
+	qsint_t f, v, s, t;
+	qsint_t tmp;
+	qsint_t numItem;
+
+	numItem = highestPos - lowestPos + 1;
+
+	if (numItem <= INSERT_SORT_NUM_ITEM) {
+		QSufSortInsertSortSplit(V, I, lowestPos, highestPos, numSortedChar);
+		return;
+	}
+
+	v = QSufSortChoosePivot(V, I, lowestPos, highestPos, numSortedChar);
+
+	a = b = lowestPos;
+	c = d = highestPos;
+
+	while (1) {
+		while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) {
+			if (f == v) {
+				swap(I[a], I[b], tmp);
+				a++;
+			}
+			b++;
+		}
+		while (c >= b && (f = KEY(V, I, c, numSortedChar)) >= v) {
+			if (f == v) {
+				swap(I[c], I[d], tmp);
+				d--;
+			}
+			c--;
+		}
+		if (b > c)
+			break;
+		swap(I[b], I[c], tmp);
+		b++;
+		c--;
+	}
+
+	s = a - lowestPos;
+	t = b - a;
+	s = min(s, t);
+	for (l = lowestPos, m = b - s; m < b; l++, m++) {
+		swap(I[l], I[m], tmp);
+	}
+
+	s = d - c;
+	t = highestPos - d;
+	s = min(s, t);
+	for (l = b, m = highestPos - s + 1; m <= highestPos; l++, m++) {
+		swap(I[l], I[m], tmp);
+	}
+
+	s = b - a;
+	t = d - c;
+	if (s > 0)
+		QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar);
+
+	// Update group number for equal portion
+	a = lowestPos + s;
+	b = highestPos - t;
+	if (a == b) {
+		// Sorted group
+		V[I[a]] = a;
+		I[a] = -1;
+	} else {
+		// Unsorted group
+		for (c=a; c<=b; c++)
+			V[I[c]] = b;
+	}
+
+	if (t > 0)
+		QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar);
+
+}
+
+/* Algorithm by Bentley & McIlroy.*/
+static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, 
+							   const qsint_t highestPos, const qsint_t numSortedChar) {
+
+	qsint_t m;
+	qsint_t keyl, keym, keyn;
+	qsint_t key1, key2, key3;
+	qsint_t s;
+	qsint_t numItem;
+
+	numItem = highestPos - lowestPos + 1;
+
+	m = lowestPos + numItem / 2;
+
+	s = numItem / 8;
+	key1 = KEY(V, I, lowestPos, numSortedChar);
+	key2 = KEY(V, I, lowestPos+s, numSortedChar);
+	key3 = KEY(V, I, lowestPos+2*s, numSortedChar);
+	keyl = med3(key1, key2, key3);
+	key1 = KEY(V, I, m-s, numSortedChar);
+	key2 = KEY(V, I, m, numSortedChar);
+	key3 = KEY(V, I, m+s, numSortedChar);
+	keym = med3(key1, key2, key3);
+	key1 = KEY(V, I, highestPos-2*s, numSortedChar);
+	key2 = KEY(V, I, highestPos-s, numSortedChar);
+	key3 = KEY(V, I, highestPos, numSortedChar);
+	keyn = med3(key1, key2, key3);
+
+	return med3(keyl, keym, keyn);
+
+
+}
+
+/* Quadratic sorting method to use for small subarrays. */
+static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, 
+									const qsint_t highestPos, const qsint_t numSortedChar)
+{
+	qsint_t i, j;
+	qsint_t tmpKey, tmpPos;
+	qsint_t numItem;
+	qsint_t key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM];
+	qsint_t negativeSortedLength;
+	qsint_t groupNum;
+
+	numItem = highestPos - lowestPos + 1;
+
+	for (i=0; i<numItem; i++) {
+		pos[i] = I[lowestPos + i];
+		key[i] = V[pos[i] + numSortedChar];
+	}
+
+	for (i=1; i<numItem; i++) {
+		tmpKey = key[i];
+		tmpPos = pos[i];
+		for (j=i; j>0 && key[j-1] > tmpKey; j--) {
+			key[j] = key[j-1];
+			pos[j] = pos[j-1];
+		}
+		key[j] = tmpKey;
+		pos[j] = tmpPos;
+	}
+
+	negativeSortedLength = -1;
+
+	i = numItem - 1;
+	groupNum = highestPos;
+	while (i > 0) {
+		I[i+lowestPos] = pos[i];
+		V[I[i+lowestPos]] = groupNum;
+		if (key[i-1] == key[i]) {
+			negativeSortedLength = 0;
+		} else {
+			if (negativeSortedLength < 0)
+				I[i+lowestPos] = negativeSortedLength;
+			groupNum = i + lowestPos - 1;
+			negativeSortedLength--;
+		}
+		i--;
+	}
+
+	I[lowestPos] = pos[0];
+	V[I[lowestPos]] = groupNum;
+	if (negativeSortedLength < 0)
+		I[lowestPos] = negativeSortedLength;
+}
+
+/* Bucketsort for first iteration.
+
+   Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear
+   at least once. x[n] is 0. (This is the corresponding output of transform.) k
+   must be at most n+1. p is array of size n+1 whose contents are disregarded.
+
+   Output: x is V and p is I after the initial sorting stage of the refined
+   suffix sorting algorithm.*/
+      
+static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize)
+{
+	qsint_t i, c;
+	qsint_t d;
+	qsint_t groupNum;
+	qsint_t currentIndex;
+
+	// mark linked list empty
+	for (i=0; i<alphabetSize; i++)
+		I[i] = -1;
+
+	// insert to linked list
+	for (i=0; i<=numChar; i++) {
+		c = V[i];
+		V[i] = (qsint_t)(I[c]);
+		I[c] = i;
+	}
+
+	currentIndex = numChar;
+	for (i=alphabetSize; i>0; i--) {
+		c = I[i-1];
+		d = (qsint_t)(V[c]);
+		groupNum = currentIndex;
+		V[c] = groupNum;
+		if (d >= 0) {
+			I[currentIndex] = c;
+			while (d >= 0) {
+				c = d;
+				d = V[c];
+				V[c] = groupNum;
+				currentIndex--;
+				I[currentIndex] = c;
+			}
+		} else {
+			// sorted group
+			I[currentIndex] = -1;
+		}
+		currentIndex--;
+	}
+}
+
+/* Transforms the alphabet of x by attempting to aggregate several symbols into
+   one, while preserving the suffix order of x. The alphabet may also be
+   compacted, so that x on output comprises all integers of the new alphabet
+   with no skipped numbers.
+
+   Input: x is an array of size n+1 whose first n elements are positive
+   integers in the range l...k-1. p is array of size n+1, used for temporary
+   storage. q controls aggregation and compaction by defining the maximum intue
+   for any symbol during transformation: q must be at least k-l; if q<=n,
+   compaction is guaranteed; if k-l>n, compaction is never done; if q is
+   INT_MAX, the maximum number of symbols are aggregated into one.
+   
+   Output: Returns an integer j in the range 1...q representing the size of the
+   new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
+   set to the number of old symbols grouped into one. Only x[n] is 0.*/
+static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, 
+							 const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated)
+{
+	qsint_t c, i, j;
+	qsint_t a;	// numSymbolAggregated
+	qsint_t mask;
+	qsint_t minSymbolInChunk = 0, maxSymbolInChunk = 0;
+	qsint_t newAlphabetSize;
+	qsint_t maxNumInputSymbol, maxNumBit, maxSymbol;
+
+	maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
+
+	for (maxNumBit = 0, i = maxNumInputSymbol; i; i >>= 1) ++maxNumBit;
+	maxSymbol = QSINT_MAX >> maxNumBit;
+
+	c = maxNumInputSymbol;
+	for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) {
+		minSymbolInChunk = (minSymbolInChunk << maxNumBit) | (V[a] - smallestInputSymbol + 1);
+		maxSymbolInChunk = c;
+		c = (maxSymbolInChunk << maxNumBit) | maxNumInputSymbol;
+	}
+
+	mask = (1 << (a-1) * maxNumBit) - 1;	/* mask masks off top old symbol from chunk.*/
+	V[numChar] = smallestInputSymbol - 1;	/* emulate zero terminator.*/
+
+	/* bucketing possible, compact alphabet.*/
+	for (i=0; i<=maxSymbolInChunk; i++)
+		I[i] = 0;	/* zero transformation table.*/
+	c = minSymbolInChunk;
+	for (i=a; i<=numChar; i++) {
+		I[c] = 1;			/* mark used chunk symbol.*/
+		c = ((c & mask) << maxNumBit) | (V[i] - smallestInputSymbol + 1);	/* shift in next old symbol in chunk.*/
+	}
+	for (i=1; i<a; i++) {	/* handle last r-1 positions.*/
+		I[c] = 1;			/* mark used chunk symbol.*/
+		c = (c & mask) << maxNumBit;	/* shift in next old symbol in chunk.*/
+	}
+	newAlphabetSize = 1;
+	for (i=0; i<=maxSymbolInChunk; i++) {
+		if (I[i]) {
+			I[i] = newAlphabetSize;
+			newAlphabetSize++;
+		}
+	}
+	c = minSymbolInChunk;
+	for (i=0, j=a; j<=numChar; i++, j++) {
+		V[i] = I[c];						/* transform to new alphabet.*/
+		c = ((c & mask) << maxNumBit) | (V[j] - smallestInputSymbol + 1);	/* shift in next old symbol in chunk.*/
+	}
+	for (; i<numChar; i++) {	/* handle last a-1 positions.*/
+		V[i] = I[c];			/* transform to new alphabet.*/
+		c = (c & mask) << maxNumBit;	/* shift right-end zero in chunk.*/
+	}
+
+	V[numChar] = 0;		/* end-of-string symbol is zero.*/
+
+    *numSymbolAggregated = a;
+	return newAlphabetSize;
+}
diff --git a/src/SailfishUtils.cpp b/src/SailfishUtils.cpp
new file mode 100644
index 0000000..f127c69
--- /dev/null
+++ b/src/SailfishUtils.cpp
@@ -0,0 +1,699 @@
+/**
+>HEADER
+    Copyright (c) 2015 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#include <boost/thread/thread.hpp>
+#include <boost/filesystem.hpp>
+#include <algorithm>
+#include <iostream>
+#include <tuple>
+#include <unordered_set>
+#include <unordered_map>
+#include <vector>
+#include <boost/filesystem.hpp>
+#include <boost/range/join.hpp>
+
+#include "gff.h"
+
+#include "jellyfish/stream_manager.hpp"
+#include "jellyfish/whole_sequence_parser.hpp"
+
+#include "jellyfish/mer_dna.hpp"
+#include "TranscriptGeneMap.hpp"
+#include "GenomicFeature.hpp"
+#include "SalmonUtils.hpp"
+
+namespace salmon{
+namespace utils {
+using std::string;
+using NameVector = std::vector<string>;
+using IndexVector = std::vector<size_t>;
+using KmerVector = std::vector<uint64_t>;
+
+/**
+ * This function parses the library format string that specifies the format in which
+ * the reads are to be expected.
+ */
+LibraryFormat parseLibraryFormatStringNew(std::string& fmt) {
+	using std::vector;
+	using std::string;
+	using std::map;
+	using std::stringstream;
+
+    map<string, LibraryFormat> formatMap = {
+        {"IU", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::U)},
+        {"ISF", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::SA)},
+        {"ISR", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::AS)},
+        {"OU", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::U)},
+        {"OSF", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::SA)},
+        {"OSR", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::AS)},
+        {"MU", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::U)},
+        {"MSF", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::S)},
+        {"MSR", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::A)},
+        {"U", LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::U)},
+        {"SF", LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::S)},
+        {"SR", LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::A)}};
+
+	// inspired by http://stackoverflow.com/questions/236129/how-to-split-a-string-in-c
+	// first convert the string to upper-case
+	for (auto& c : fmt) { c = std::toupper(c); }
+
+
+    auto libFmtIt = formatMap.find(fmt);
+
+	if (libFmtIt == formatMap.end()) {
+		stringstream errstr;
+		errstr << "unknown library format string : " << fmt;
+		throw std::invalid_argument(errstr.str());
+	}
+
+    return libFmtIt->second;
+}
+
+/**
+ * Parses a set of __ordered__ command line options and extracts the relevant
+ * read libraries from them.
+ */
+std::vector<ReadLibrary> extractReadLibraries(boost::program_options::parsed_options& orderedOptions) {
+	// The current (default) format for paired end data
+	LibraryFormat peFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::U);
+	// The current (default) format for single end data
+	LibraryFormat seFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::U);
+
+	std::vector<ReadLibrary> peLibs{ReadLibrary(peFormat)};
+	std::vector<ReadLibrary> seLibs{ReadLibrary(seFormat)};
+	for (auto& opt : orderedOptions.options) {
+		// Update the library type
+		if (opt.string_key == "libType") {
+			auto libFmt = parseLibraryFormatStringNew(opt.value[0]);
+			if (libFmt.type == ReadType::PAIRED_END) {
+				peFormat = libFmt;
+				peLibs.emplace_back(libFmt);
+			} else {
+				seFormat = libFmt;
+				seLibs.emplace_back(libFmt);
+			}
+		}
+		if (opt.string_key == "mates1") {
+			peLibs.back().addMates1(opt.value);
+		}
+		if (opt.string_key == "mates2") {
+			peLibs.back().addMates2(opt.value);
+		}
+		if (opt.string_key == "unmatedReads") {
+			seLibs.back().addUnmated(opt.value);
+		}
+	}
+
+	std::vector<ReadLibrary> libs;
+	libs.reserve(peLibs.size() + seLibs.size());
+	for (auto& lib : boost::range::join(seLibs, peLibs)) {
+		if (lib.format().type == ReadType::SINGLE_END) {
+			if (lib.unmated().size() == 0) {
+				// Didn't use default single end library type
+				continue;
+			}
+		} else if (lib.format().type == ReadType::PAIRED_END) {
+			if (lib.mates1().size() == 0 or lib.mates2().size() == 0) {
+                // Didn't use default paired-end library type
+				continue;
+			}
+		}
+		libs.push_back(lib);
+	}
+    size_t numLibs = libs.size();
+	std::cerr << "there " << ((numLibs > 1) ? "are " : "is ") << libs.size() << ((numLibs > 1) ? " libs\n" : " lib\n");
+	return libs;
+}
+
+
+
+/**
+ * This function parses the library format string that specifies the format in which
+ * the reads are to be expected.
+ */
+LibraryFormat parseLibraryFormatString(std::string& fmt) {
+    using std::vector;
+    using std::string;
+    using std::map;
+    using std::stringstream;
+
+    // inspired by http://stackoverflow.com/questions/236129/how-to-split-a-string-in-c
+
+    // first convert the string to upper-case
+    for (auto& c : fmt) { c = std::toupper(c); }
+    // split on the delimiter ':', and put the key, value (k=v) pairs into a map
+    stringstream ss(fmt);
+    string item;
+    map<string, string> kvmap;
+    while (std::getline(ss, item, ':')) {
+        auto splitPos = item.find('=', 0);
+        string key{item.substr(0, splitPos)};
+        string value{item.substr(splitPos+1)};
+        kvmap[key] = value;
+    }
+
+    map<string, ReadType> readType = {{"SE", ReadType::SINGLE_END}, {"PE", ReadType::PAIRED_END}};
+    map<string, ReadOrientation> orientationType = {{">>", ReadOrientation::SAME},
+                                           {"<>", ReadOrientation::AWAY},
+                                           {"><", ReadOrientation::TOWARD},
+                                           {"*", ReadOrientation::NONE}};
+    map<string, ReadStrandedness> strandType = {{"SA", ReadStrandedness::SA},
+                                    {"AS", ReadStrandedness::AS},
+                                    {"A", ReadStrandedness::A},
+                                    {"S", ReadStrandedness::S},
+                                    {"U", ReadStrandedness::U}};
+    auto it = kvmap.find("T");
+    string typeStr = "";
+    if (it != kvmap.end()) {
+        typeStr = it->second;
+    } else {
+        it = kvmap.find("TYPE");
+        if (it != kvmap.end()) {
+            typeStr = it->second;
+        }
+    }
+
+    if (typeStr != "SE" and typeStr != "PE") {
+        string e = typeStr + " is not a valid read type; must be one of {SE, PE}";
+        throw std::invalid_argument(e);
+    }
+
+    ReadType type = (typeStr == "SE") ? ReadType::SINGLE_END : ReadType::PAIRED_END;
+    ReadOrientation orientation = (type == ReadType::SINGLE_END) ? ReadOrientation::NONE : ReadOrientation::TOWARD;
+    ReadStrandedness strandedness{ReadStrandedness::U};
+    // Construct the LibraryFormat class from the key, value map
+    for (auto& kv : kvmap) {
+        auto& k = kv.first; auto& v = kv.second;
+        if (k == "O" or k == "ORIENTATION") {
+            auto it = orientationType.find(v);
+            if (it != orientationType.end()) { orientation = orientationType[it->first]; } else {
+                string e = v + " is not a valid orientation type; must be one of {>>, <>, ><}";
+                throw std::invalid_argument(e);
+            }
+
+        }
+        if (k == "S" or k == "STRAND") {
+            auto it = strandType.find(v);
+            if (it != strandType.end()) { strandedness = strandType[it->first]; } else {
+                string e = v + " is not a valid strand type; must be one of {SA, AS, S, A, U}";
+                throw std::invalid_argument(e);
+            }
+        }
+
+    }
+    LibraryFormat lf(type, orientation, strandedness);
+    return lf;
+}
+
+
+
+uint64_t encode(uint64_t tid, uint64_t offset) {
+    uint64_t res = (((tid & 0xFFFFFFFF) << 32) | (offset & 0xFFFFFFFF));
+    return res;
+}
+
+uint32_t transcript(uint64_t enc) {
+    uint32_t t = (enc & 0xFFFFFFFF00000000) >> 32;
+    return t;
+}
+
+uint32_t offset(uint64_t enc) {
+    uint32_t o = enc & 0xFFFFFFFF;
+    return o;
+}
+
+size_t numberOfReadsInFastaFile(const std::string& fname) {
+    constexpr size_t bufferSize = 16184;
+    char buffer[bufferSize];
+    std::ifstream ifile(fname, std::ifstream::in);
+    ifile.rdbuf()->pubsetbuf(buffer, bufferSize);
+
+    size_t numReads = 0;
+    std::string s;
+    while (ifile >> s) { if (s.front() == '>') { ++numReads; } }
+
+    ifile.close();
+
+    return numReads;
+}
+
+bool readKmerOrder( const std::string& fname, std::vector<uint64_t>& kmers ) {
+
+  std::ifstream mlist(fname, std::ios::in | std::ios::binary);
+  // Get the number of kmers from file
+  size_t numKmers{0};
+  mlist.read( reinterpret_cast<char*>( &numKmers ), sizeof( size_t ) );
+
+  // Resize the array that will hold the sorted kmers
+  kmers.resize(numKmers, 0);
+  mlist.read( reinterpret_cast<char*>( &kmers[0] ), sizeof( uint64_t) * kmers.size() );
+
+  mlist.close();
+
+  return true;
+}
+
+template <template<typename> class S, typename T>
+bool overlap( const S<T> &a, const S<T> &b ) {
+    // Query from the smaller set to the larger set
+    if ( a.size() <= b.size() ) {
+        for ( auto & ae : a ) {
+            if (b.find(ae) != b.end()) {
+                return true;
+            }
+        }
+    } else {
+        for ( auto & be : b ) {
+            if (a.find(be) != b.end()) {
+                return true;
+            }
+        }
+    }
+    // If nothing from the smaller set is in the larger set, then they don't overlap
+    return false;
+}
+
+
+TranscriptGeneMap transcriptGeneMapFromGTF(const std::string& fname, std::string key) {
+
+    using std::unordered_set;
+    using std::unordered_map;
+    using std::vector;
+    using std::tuple;
+    using std::string;
+    using std::get;
+
+    // Use GffReader to read the file
+    GffReader reader(const_cast<char*>(fname.c_str()));
+    // Remember the optional attributes
+    reader.readAll(true);
+
+    struct TranscriptKeyPair {
+        const char* transcript_id;
+        const char* key;
+        TranscriptKeyPair(const char* t, const char* k) :
+            transcript_id(t), key(k) {}
+    };
+
+    // The user can group transcripts by gene_id, gene_name, or
+    // an optinal attribute that they provide as a string.
+    enum class TranscriptKey { GENE_ID, GENE_NAME, DYNAMIC };
+
+    // Select the proper attribute by which to group
+    TranscriptKey tkey = TranscriptKey::GENE_ID;
+
+    if (key == "gene_id") {
+    } else if (key == "gene_name") {
+        tkey = TranscriptKey::GENE_NAME;
+    } else {
+        tkey = TranscriptKey::DYNAMIC;
+    }
+
+    // Iterate over all transcript features and build the
+    // transcript <-> key vector.
+    auto nfeat = reader.gflst.Count();
+    std::vector<TranscriptKeyPair> feats;
+    for (int i=0; i < nfeat; ++i) {
+        auto f = reader.gflst[i];
+        if (f->isTranscript()) {
+            const char* keyStr;
+            switch (tkey) {
+                case TranscriptKey::GENE_ID:
+                    keyStr = f->getGeneID();
+                    break;
+                case TranscriptKey::GENE_NAME:
+                    keyStr = f->getGeneName();
+                    break;
+                case TranscriptKey::DYNAMIC:
+                    keyStr = f->getAttr(key.c_str());
+                    break;
+            }
+            feats.emplace_back(f->getID(), keyStr);
+        }
+    }
+
+    // Given the transcript <-> key vector, build the
+    // TranscriptGeneMap.
+
+    IndexVector t2g;
+    NameVector transcriptNames;
+    NameVector geneNames;
+
+    // holds the mapping from transcript ID to gene ID
+    IndexVector t2gUnordered;
+    // holds the set of gene IDs
+    unordered_map<string, size_t> geneNameToID;
+
+    // To read the input and assign ids
+    size_t transcriptCounter = 0;
+    size_t geneCounter = 0;
+    string transcript;
+    string gene;
+
+    std::sort( feats.begin(), feats.end(),
+    []( const TranscriptKeyPair & a, const TranscriptKeyPair & b) -> bool {
+        return std::strcmp(a.transcript_id, b.transcript_id) < 0;
+    } );
+
+    std::string currentTranscript = "";
+    for ( auto & feat : feats ) {
+
+        std::string gene(feat.key);
+        std::string transcript(feat.transcript_id);
+
+        if ( transcript != currentTranscript ) {
+            auto geneIt = geneNameToID.find(gene);
+            size_t geneID = 0;
+
+            if ( geneIt == geneNameToID.end() ) {
+                // If we haven't seen this gene yet, give it a new ID
+                geneNameToID[gene] = geneCounter;
+                geneID = geneCounter;
+                geneNames.push_back(gene);
+                ++geneCounter;
+            } else {
+                // Otherwise lookup the ID
+                geneID = geneIt->second;
+            }
+
+            transcriptNames.push_back(transcript);
+            t2g.push_back(geneID);
+
+            //++transcriptID;
+            currentTranscript = transcript;
+        }
+
+    }
+
+    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
+
+}
+
+
+TranscriptGeneMap readTranscriptToGeneMap( std::ifstream &ifile ) {
+
+    using std::unordered_set;
+    using std::unordered_map;
+    using std::vector;
+    using std::tuple;
+    using std::string;
+    using std::get;
+
+    using NameID = tuple<string, size_t>;
+
+    IndexVector t2g;
+    NameVector transcriptNames;
+    NameVector geneNames;
+
+    // holds the transcript name ID mapping
+    vector<NameID> transcripts;
+    // holds the mapping from transcript ID to gene ID
+    IndexVector t2gUnordered;
+    // holds the set of gene IDs
+    unordered_map<string, size_t> geneNameToID;
+
+    // To read the input and assign ids
+    size_t transcriptCounter = 0;
+    size_t geneCounter = 0;
+    string transcript;
+    string gene;
+
+    while ( ifile >> transcript >> gene ) {
+        // The transcript and it's ID
+        transcripts.push_back( make_tuple(transcript, transcriptCounter) );
+
+        auto geneIt = geneNameToID.find(gene);
+        size_t geneID = 0;
+
+        if ( geneIt == geneNameToID.end() ) {
+            // If we haven't seen this gene yet, give it a new ID
+            geneNameToID[gene] = geneCounter;
+            geneID = geneCounter;
+            geneNames.push_back(gene);
+            ++geneCounter;
+        } else {
+            // Otherwise lookup the ID
+            geneID = geneIt->second;
+        }
+
+        // Map the transcript to the gene in terms of their IDs
+        t2gUnordered.push_back(geneID);
+
+        ++transcriptCounter;
+    }
+
+    std::sort( transcripts.begin(), transcripts.end(),
+               []( const NameID & a, const NameID & b) -> bool { return get<0>(a) < get<0>(b); } );
+
+    // Resize these vectors for fast access
+    transcriptNames.resize(t2gUnordered.size());
+    t2g.resize(t2gUnordered.size());
+
+    for ( size_t newID = 0; newID < transcripts.size(); ++newID ) {
+        // For each transcript, map it to the appropriate gene
+        string oldName; size_t oldID;
+        std::tie(oldName, oldID) = transcripts[newID];
+        t2g[newID] = t2gUnordered[oldID];
+        transcriptNames[newID] = oldName;
+    }
+
+    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
+}
+
+
+TranscriptGeneMap transcriptToGeneMapFromFasta( const std::string& transcriptsFile ) {
+    using std::vector;
+    using stream_manager = jellyfish::stream_manager<char**>;
+    using sequence_parser = jellyfish::whole_sequence_parser<stream_manager>;
+    namespace bfs = boost::filesystem;
+
+    NameVector transcriptNames;
+    NameVector geneNames {"gene"};
+
+    vector<bfs::path> paths{transcriptsFile};
+
+    // Create a jellyfish parser
+    const int concurrentFile{1};
+    char** fnames = new char*[1];
+    fnames[0] = const_cast<char*>(transcriptsFile.c_str());
+    stream_manager streams(fnames, fnames + 1, concurrentFile);
+
+    size_t maxReadGroupSize{100};
+    sequence_parser parser(4, maxReadGroupSize, concurrentFile, streams);
+
+    // while there are transcripts left to process
+    while (true) {
+        sequence_parser::job j(parser);
+        // If this job is empty, then we're done
+        if (j.is_empty()) { break; }
+
+        for (size_t i=0; i < j->nb_filled; ++i) {
+            // The transcript name
+            std::string fullHeader(j->data[i].header);
+            std::string header = fullHeader.substr(0, fullHeader.find(' '));
+            transcriptNames.emplace_back(header);
+        }
+    }
+
+    // Sort the transcript names
+    std::sort(transcriptNames.begin(), transcriptNames.end());
+
+    // Since we have no real gene groupings, the t2g vector is trivial,
+    // everything maps to gene 0.
+    IndexVector t2g(transcriptNames.size(), 0);
+
+    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
+}
+
+class ExpressionRecord {
+    public:
+        ExpressionRecord(const std::string& targetIn, uint32_t lengthIn,
+                         std::vector<double>& expValsIn) :
+            target(targetIn), length(lengthIn), expVals(expValsIn) {}
+
+        ExpressionRecord( ExpressionRecord&& other ) {
+            std::swap(target, other.target);
+            length = other.length;
+            std::swap(expVals, other.expVals);
+        }
+
+        ExpressionRecord(std::vector<std::string>& inputLine) {
+            if (inputLine.size() < 3) {
+                std::string err ("Any expression line must contain at least 3 tokens");
+                throw std::invalid_argument(err);
+            } else {
+                auto it = inputLine.begin();
+                target = *it; ++it;
+                length = std::stoi(*it); ++it;
+                for (; it != inputLine.end(); ++it) {
+                    expVals.push_back(std::stod(*it));
+                }
+            }
+        }
+
+        std::string target;
+        uint32_t length;
+        std::vector<double> expVals;
+};
+
+// From : http://stackoverflow.com/questions/9435385/split-a-string-using-c11
+std::vector<std::string> split(const std::string& str, int delimiter(int) = ::isspace){
+    using namespace std;
+    vector<string> result;
+    auto e=str.end();
+    auto i=str.begin();
+    while (i != e) {
+        i = find_if_not(i,e, delimiter);
+        if (i == e) break;
+        auto j = find_if(i,e, delimiter);
+        result.push_back(string(i,j));
+        i = j;
+    }
+    return result;
+}
+
+void aggregateEstimatesToGeneLevel(TranscriptGeneMap& tgm, boost::filesystem::path& inputPath) {
+
+    using std::vector;
+    using std::string;
+    using std::ofstream;
+    using std::unordered_map;
+    using std::move;
+    using std::cerr;
+    using std::max;
+
+    std::ifstream expFile(inputPath.string());
+
+    if (!expFile.is_open()) {
+        perror("Error reading file");
+    }
+
+    //====================== From GeneSum ====================
+    vector<string> comments;
+    unordered_map<string, vector<ExpressionRecord>> geneExps;
+    string l;
+    size_t ln{0};
+
+    while (getline(expFile, l)) {
+        if (++ln % 1000 == 0) {
+            cerr << "\r\rParsed " << ln << " expression lines";
+        }
+        auto it = find_if(l.begin(), l.end(),
+                    [](char c) -> bool {return !isspace(c);});
+        if (it != l.end()) {
+            if (*it == '#') {
+                comments.push_back(l);
+            } else {
+                vector<string> toks = split(l);
+                ExpressionRecord er(toks);
+                auto gn = tgm.geneName(er.target);
+                geneExps[gn].push_back(move(er));
+            }
+        }
+    }
+    cerr << "\ndone\n";
+    expFile.close();
+
+    cerr << "Aggregating expressions to gene level . . .";
+    boost::filesystem::path outputFilePath(inputPath);
+    outputFilePath.replace_extension(".genes.sf");
+    ofstream outFile(outputFilePath.string());
+
+    // preserve any comments in the output
+    for (auto& c : comments) {
+        outFile << c << '\n';
+    }
+
+    for (auto& kv : geneExps) {
+        auto& gn = kv.first;
+
+        uint32_t geneLength{kv.second.front().length};
+        vector<double> expVals(kv.second.front().expVals.size(), 0);
+        const size_t NE{expVals.size()};
+
+        for (auto& tranExp : kv.second) {
+            geneLength = max(geneLength, tranExp.length);
+            for (size_t i = 0; i < NE; ++i) { expVals[i] += tranExp.expVals[i]; }
+        }
+
+        outFile << gn << '\t' << geneLength;
+        for (size_t i = 0; i < NE; ++i) {
+            outFile << '\t' << expVals[i];
+        }
+        outFile << '\n';
+    }
+
+    outFile.close();
+    cerr << " done\n";
+    //====================== From GeneSum =====================
+}
+
+void generateGeneLevelEstimates(boost::filesystem::path& geneMapPath,
+                                boost::filesystem::path& estDir,
+                                bool haveBiasCorrectedFile) {
+    namespace bfs = boost::filesystem;
+    std::cerr << "Computing gene-level abundance estimates\n";
+    bfs::path gtfExtension(".gtf");
+    auto extension = geneMapPath.extension();
+
+    TranscriptGeneMap tranGeneMap;
+    // parse the map as a GTF file
+    if (extension == gtfExtension) {
+        // Using libgff
+        tranGeneMap = salmon::utils::transcriptGeneMapFromGTF(geneMapPath.string(), "gene_id");
+    } else { // parse the map as a simple format files
+        std::ifstream tgfile(geneMapPath.string());
+        tranGeneMap = salmon::utils::readTranscriptToGeneMap(tgfile);
+        tgfile.close();
+    }
+
+    std::cerr << "There were " << tranGeneMap.numTranscripts() << " transcripts mapping to "
+        << tranGeneMap.numGenes() << " genes\n";
+
+    bfs::path estFilePath = estDir / "quant.sf";
+    if (!bfs::exists(estFilePath)) {
+        std::stringstream errstr;
+        errstr << "Attempting to compute gene-level esimtates, but could not \n"
+            << "find isoform-level file " << estFilePath;
+        throw std::invalid_argument(errstr.str());
+    } else {
+        salmon::utils::aggregateEstimatesToGeneLevel(tranGeneMap, estFilePath);
+    }
+
+    /** Create a gene-level summary of the bias-corrected estimates as well if these exist **/
+    if (haveBiasCorrectedFile) {
+        bfs::path biasCorrectEstFilePath = estDir / "quant_bias_corrected.sf";
+        if (!bfs::exists(biasCorrectEstFilePath)) {
+            std::stringstream errstr;
+            errstr << "Attempting to compute gene-level esimtates, but could not \n"
+                << "find bias-corrected isoform-level file " << biasCorrectEstFilePath;
+            throw std::invalid_argument(errstr.str());
+        } else {
+            salmon::utils::aggregateEstimatesToGeneLevel(tranGeneMap, biasCorrectEstFilePath);
+        }
+    }
+}
+
+
+
+}
+}
diff --git a/src/Salmon.cpp b/src/Salmon.cpp
new file mode 100644
index 0000000..b7ba2d6
--- /dev/null
+++ b/src/Salmon.cpp
@@ -0,0 +1,237 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#include <boost/thread/thread.hpp>
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cstdint>
+#include <cstring>
+#include <cstdio>
+#include <sstream>
+#include <string>
+#include <memory>
+#include <functional>
+#include <unordered_map>
+#include <thread>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/parsers.hpp>
+#include <boost/range/irange.hpp>
+#include <boost/filesystem.hpp>
+
+#include "BiasIndex.hpp"
+#include "SailfishUtils.hpp"
+#include "GenomicFeature.hpp"
+#include "SalmonConfig.hpp"
+#include "VersionChecker.hpp"
+
+int help(int argc, char* argv[]) {
+    auto helpmsg = R"(
+    ===============
+
+    Please invoke salmon with one of the following commands {index, quant, swim}.
+    For more inforation on the options for theses particular methods, use the -h
+    flag along with the method name.  For example:
+
+    salmon index -h
+
+    will give you detailed help information about the index command.
+    )";
+
+    std::cerr << "    Salmon v" << salmon::version << helpmsg << "\n";
+    return 1;
+}
+
+
+int dualModeMessage() {
+    auto helpmsg = R"(
+    ===============
+
+    salmon quant has two modes --- one quantifies expression using raw reads
+    and the other makes use of already-aligned reads (in BAM/SAM format).
+    which algorithm is used depends on the arguments passed to salmon quant.
+    If you provide salmon with alignments '-a [ --alignments ]' then the
+    alignment-based algorithm will be used, otherwise the algorithm for
+    quantifying from raw reads will be used.
+
+    To view the help for salmon's alignment-based mode, use the command
+
+    salmon quant --help-alignment
+
+    to view the help for salmon's read-based mode, use the command
+
+    salmon quant --help-reads
+    )";
+    std::cerr << "    Salmon v" << salmon::version << helpmsg << "\n";
+    return 1;
+}
+
+
+/**
+ * Bonus!
+ */
+int salmonSwim(int argc, char* argv[]) {
+
+  std::cerr << R"(
+    _____       __
+   / ___/____ _/ /___ ___  ____  ____
+   \__ \/ __ `/ / __ `__ \/ __ \/ __ \
+  ___/ / /_/ / / / / / / / /_/ / / / /
+ /____/\__,_/_/_/ /_/ /_/\____/_/ /_/
+
+
+)";
+
+  return 0;
+
+}
+
+int salmonIndex(int argc, char* argv[]);
+int salmonQuantify(int argc, char* argv[]);
+int salmonAlignmentQuantify(int argc, char* argv[]);
+
+bool verbose = false;
+
+int main( int argc, char* argv[] ) {
+  using std::string;
+  namespace po = boost::program_options;
+
+  try {
+
+    po::options_description hidden("hidden");
+    hidden.add_options()
+    ("command", po::value<string>(), "command to run {index, quant, sf}");
+
+    po::options_description sfopts("Allowed Options");
+    sfopts.add_options()
+    ("version,v", "print version string")
+    ("no-version-check", "don't check with the server to see if this is the latest version")
+    ("help,h", "produce help message")
+    ;
+
+    po::options_description all("Allowed Options");
+    all.add(sfopts).add(hidden);
+
+    po::positional_options_description pd;
+    pd.add("command", 1);
+
+    size_t topLevelArgc = argc;
+    for (size_t i : boost::irange(size_t{1}, static_cast<size_t>(argc))) {
+      if (argv[i][0] != '-') {
+        topLevelArgc = i+1;
+        break;
+      }
+    }
+
+    po::variables_map vm;
+    po::parsed_options parsed = po::command_line_parser(topLevelArgc, argv).options(all).positional(pd).allow_unregistered().run();
+    po::store(parsed, vm);
+
+    if (vm.count("version")) {
+      std::cerr << "version : " << salmon::version << "\n";
+      std::exit(0);
+    }
+
+    if (vm.count("help") and !vm.count("command")) {
+        std::cout << sfopts << std::endl;
+        help(argc, argv);
+        std::exit(0);
+    }
+
+    if (!vm.count("no-version-check")){
+      std::string versionMessage = getVersionMessage();
+      std::cerr << versionMessage;
+    }
+
+    po::notify(vm);
+
+    std::unordered_map<string, std::function<int(int, char*[])>> cmds({
+      {"index", salmonIndex},
+      {"quant", salmonQuantify},
+      {"swim", salmonSwim}
+    });
+
+    string cmd = vm["command"].as<string>();
+
+    int subCommandArgc = argc - topLevelArgc + 1;
+    char** argv2 = new char*[subCommandArgc];
+    argv2[0] = argv[0];
+    std::copy_n( &argv[topLevelArgc], argc-topLevelArgc, &argv2[1] );
+
+    auto cmdMain = cmds.find(cmd);
+    if (cmdMain == cmds.end()) {
+      help(subCommandArgc, argv2);
+    } else {
+      // If the command is quant; determine whether
+      // we're quantifying with raw sequences or alignemnts
+      if (cmdMain->first == "quant") {
+
+        // detect mode-specific help request
+        if (strncmp(argv2[1], "--help-alignment", 16) == 0) {
+            std::vector<char> helpStr{'-','-','h','e','l','p','\0'};
+            char* helpArgv[] = {argv[0], &helpStr[0]};
+            salmonAlignmentQuantify(2, helpArgv);
+        } else if (strncmp(argv2[1], "--help-reads", 12) == 0) {
+            std::vector<char> helpStr{'-','-','h','e','l','p','\0'};
+            char* helpArgv[] = {argv[0], &helpStr[0]};
+            salmonQuantify(2, helpArgv);
+        }
+
+        // detect general help request
+        if (strncmp(argv2[1], "--help", 6) == 0 or
+            strncmp(argv2[1], "-h", 2) == 0) {
+            dualModeMessage();
+            std::exit(0);
+        }
+
+        // otherwise, detect and dispatch the correct mode
+        bool useSalmonAlign{false};
+        for (size_t i = 0; i < subCommandArgc; ++i) {
+            if (strncmp(argv2[i], "-a", 2) == 0 or
+                strncmp(argv2[i], "--alignments", 12) == 0) {
+                useSalmonAlign = true;
+                break;
+            }
+        }
+        if (useSalmonAlign) {
+            salmonAlignmentQuantify(subCommandArgc, argv2);
+        } else {
+            salmonQuantify(subCommandArgc, argv2);
+        }
+      } else {
+        cmdMain->second(subCommandArgc, argv2);
+      }
+    }
+    delete[] argv2;
+
+  } catch (po::error &e) {
+    std::cerr << "Program Option Error (main) : [" << e.what() << "].\n Exiting.\n";
+    std::exit(1);
+  } catch (...) {
+    std::cerr << argv[0] << " was invoked improperly.\n";
+    std::cerr << "For usage information, try " << argv[0] << " --help\nExiting.\n";
+  }
+
+  return 0;
+}
diff --git a/src/SalmonQuantify.cpp b/src/SalmonQuantify.cpp
new file mode 100644
index 0000000..add7f11
--- /dev/null
+++ b/src/SalmonQuantify.cpp
@@ -0,0 +1,3209 @@
+/**
+>HEADER
+    Copyright (c) 2013, 2014, 2015 Rob Patro rob.patro at cs.stonybrook.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <unordered_map>
+#include <map>
+#include <vector>
+#include <unordered_set>
+#include <mutex>
+#include <thread>
+#include <sstream>
+#include <exception>
+#include <random>
+#include <queue>
+#include <unordered_map>
+#include "btree_map.h"
+#include "btree_set.h"
+
+// C++ string formatting library
+#include "format.h"
+
+// C Includes for BWA
+#include <cstdio>
+#include <unistd.h>
+#include <cstdlib>
+#include <cstring>
+#include <cctype>
+
+extern "C" {
+#include "bwa.h"
+#include "bwamem.h"
+#include "ksort.h"
+#include "kvec.h"
+#include "utils.h"
+}
+
+// Jellyfish 2 include
+#include "jellyfish/mer_dna.hpp"
+#include "jellyfish/stream_manager.hpp"
+#include "jellyfish/whole_sequence_parser.hpp"
+
+// Boost Includes
+#include <boost/filesystem.hpp>
+#include <boost/container/flat_map.hpp>
+#include <boost/dynamic_bitset/dynamic_bitset.hpp>
+#include <boost/range/irange.hpp>
+#include <boost/program_options.hpp>
+#include <boost/lockfree/queue.hpp>
+#include <boost/thread/thread.hpp>
+
+// TBB Includes
+#include "tbb/concurrent_unordered_set.h"
+#include "tbb/concurrent_vector.h"
+#include "tbb/concurrent_unordered_map.h"
+#include "tbb/concurrent_queue.h"
+#include "tbb/parallel_for.h"
+#include "tbb/parallel_for_each.h"
+#include "tbb/parallel_reduce.h"
+#include "tbb/blocked_range.h"
+#include "tbb/task_scheduler_init.h"
+#include "tbb/partitioner.h"
+
+// logger includes
+#include "spdlog/spdlog.h"
+
+// Cereal includes
+#include "cereal/types/vector.hpp"
+#include "cereal/archives/binary.hpp"
+
+#include "concurrentqueue.h"
+
+// salmon / Salmon includes
+#include "ClusterForest.hpp"
+#include "SalmonMath.hpp"
+#include "Transcript.hpp"
+#include "LibraryFormat.hpp"
+#include "SalmonUtils.hpp"
+#include "ReadLibrary.hpp"
+#include "SalmonConfig.hpp"
+#include "IOUtils.hpp"
+
+#include "AlignmentGroup.hpp"
+#include "PairSequenceParser.hpp"
+#include "ForgettingMassCalculator.hpp"
+#include "FragmentLengthDistribution.hpp"
+#include "ReadExperiment.hpp"
+#include "SalmonOpts.hpp"
+#include "EquivalenceClassBuilder.hpp"
+#include "CollapsedEMOptimizer.hpp"
+#include "CollapsedGibbsSampler.hpp"
+
+/* This allows us to use CLASP for optimal MEM
+ * chaining.  However, this seems to be neither
+ * computationally efficient, nor significantly
+ * better than the greedy chaining, so I'm temporarily
+ * removing this un-necessary dependency.  If you
+ * (other dev or future Rob) re-instate this in the future
+ * remember to re-enable the CLASP fetch and build
+ * steps in the CMakeLists.txt files
+ *
+ *#include "FragmentList.hpp"
+ */
+
+extern unsigned char nst_nt4_table[256];
+char const* bwa_pg = "cha";
+
+
+/******* STUFF THAT IS STATIC IN BWAMEM THAT WE NEED HERE --- Just re-define it *************/
+#define intv_lt(a, b) ((a).info < (b).info)
+KSORT_INIT(mem_intv, bwtintv_t, intv_lt)
+
+typedef struct {
+    bwtintv_v mem, mem1, *tmpv[2];
+} smem_aux_t;
+
+static smem_aux_t *smem_aux_init()
+{
+    smem_aux_t *a;
+    a = static_cast<smem_aux_t*>(calloc(1, sizeof(smem_aux_t)));
+    a->tmpv[0] = static_cast<bwtintv_v*>(calloc(1, sizeof(bwtintv_v)));
+    a->tmpv[1] = static_cast<bwtintv_v*>(calloc(1, sizeof(bwtintv_v)));
+    return a;
+}
+
+static void smem_aux_destroy(smem_aux_t *a)
+{
+    free(a->tmpv[0]->a); free(a->tmpv[0]);
+    free(a->tmpv[1]->a); free(a->tmpv[1]);
+    free(a->mem.a); free(a->mem1.a);
+    free(a);
+}
+
+static void mem_collect_intv(const SalmonOpts& sopt, const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a)
+{
+    int i, k, x = 0, old_n;
+    int start_width = (opt->flag & MEM_F_SELF_OVLP)? 2 : 1;
+    int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
+    a->mem.n = 0;
+    // first pass: find all SMEMs
+    while (x < len) {
+        if (seq[x] < 4) {
+            x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
+            // EDIT
+            // x += 8;
+            for (i = 0; i < a->mem1.n; ++i) {
+                bwtintv_t *p = &a->mem1.a[i];
+                int slen = (uint32_t)p->info - (p->info>>32); // seed length
+                if (slen >= opt->min_seed_len)
+                    kv_push(bwtintv_t, a->mem, *p);
+            }
+        } else ++x;
+    }
+    // second pass: find MEMs inside a long SMEM
+    old_n = a->mem.n;
+    for (k = 0; k < old_n; ++k) {
+        bwtintv_t *p = &a->mem.a[k];
+        int start = p->info>>32, end = (int32_t)p->info;
+        if (end - start < split_len || p->x[2] > opt->split_width) continue;
+        bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);
+        for (i = 0; i < a->mem1.n; ++i)
+            if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len)
+                kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
+    }
+    // third pass: LAST-like
+    if (sopt.extraSeedPass and opt->max_mem_intv > 0) {
+        x = 0;
+        while (x < len) {
+            if (seq[x] < 4) {
+                if (1) {
+                    bwtintv_t m;
+                    x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
+                    if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
+                } else { // for now, we never come to this block which is slower
+                    x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
+                    for (i = 0; i < a->mem1.n; ++i)
+                        kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
+                }
+            } else ++x;
+        }
+    }
+    // sort
+    // ks_introsort(mem_intv, a->mem.n, a->mem.a);
+}
+
+
+/******* END OF STUFF THAT IS STATIC IN BWAMEM THAT WE NEED HERE --- Just re-define it *************/
+
+using paired_parser = pair_sequence_parser<char**>;
+using stream_manager = jellyfish::stream_manager<std::vector<std::string>::const_iterator>;
+using single_parser = jellyfish::whole_sequence_parser<stream_manager>;
+
+using TranscriptID = uint32_t;
+using TranscriptIDVector = std::vector<TranscriptID>;
+using KmerIDMap = std::vector<TranscriptIDVector>;
+using my_mer = jellyfish::mer_dna_ns::mer_base_static<uint64_t, 1>;
+
+constexpr uint32_t miniBatchSize{1000};
+
+class SMEMAlignment {
+    public:
+        SMEMAlignment() :
+            transcriptID_(std::numeric_limits<TranscriptID>::max()),
+            format_(LibraryFormat::formatFromID(0)),
+            score_(0.0),
+            hitPos_(0),
+            fragLength_(0),
+            logProb(salmon::math::LOG_0),
+            logBias(salmon::math::LOG_0){}
+
+        SMEMAlignment(TranscriptID transcriptIDIn, LibraryFormat format,
+                  double scoreIn = 0.0,
+                  int32_t hitPosIn = 0,
+                  uint32_t fragLengthIn= 0,
+                  double logProbIn = salmon::math::LOG_0) :
+            transcriptID_(transcriptIDIn), format_(format), score_(scoreIn),
+            hitPos_(hitPosIn), fragLength_(fragLengthIn), logProb(logProbIn) {}
+
+        SMEMAlignment(const SMEMAlignment& o) = default;
+        SMEMAlignment(SMEMAlignment&& o) = default;
+        SMEMAlignment& operator=(SMEMAlignment& o) = default;
+        SMEMAlignment& operator=(SMEMAlignment&& o) = default;
+
+
+        inline TranscriptID transcriptID() const { return transcriptID_; }
+        inline uint32_t fragLength() const { return fragLength_; }
+        inline LibraryFormat libFormat() const { return format_; }
+        inline double score() const { return score_; }
+        inline int32_t hitPos() const { return hitPos_; }
+        // inline double coverage() {  return static_cast<double>(kmerCount) / fragLength_; };
+        uint32_t kmerCount;
+        double logProb;
+        double logBias;
+        template <typename Archive>
+        void save(Archive& archive) const {
+            archive(transcriptID_, format_.formatID(), score_, hitPos_, fragLength_);
+        }
+
+        template <typename Archive>
+        void load(Archive& archive) {
+            uint8_t formatID;
+            archive(transcriptID_, formatID, score_, hitPos_, fragLength_);
+            format_ = LibraryFormat::formatFromID(formatID);
+        }
+
+    private:
+        TranscriptID transcriptID_;
+        LibraryFormat format_;
+        double score_;
+        int32_t hitPos_;
+        uint32_t fragLength_;
+};
+
+#define __MOODYCAMEL__
+#if defined(__MOODYCAMEL__)
+ using AlnGroupQueue = moodycamel::ConcurrentQueue<AlignmentGroup<SMEMAlignment>*>;
+#else
+ using AlnGroupQueue = tbb::concurrent_queue<AlignmentGroup<SMEMAlignment>*>;
+#endif
+
+void processMiniBatch(
+        ReadExperiment& readExp,
+        ForgettingMassCalculator& fmCalc,
+        uint64_t firstTimestepOfRound,
+        ReadLibrary& readLib,
+        const SalmonOpts& salmonOpts,
+        std::vector<AlignmentGroup<SMEMAlignment>*>& batchHits,
+        std::vector<Transcript>& transcripts,
+        ClusterForest& clusterForest,
+        FragmentLengthDistribution& fragLengthDist,
+        std::atomic<uint64_t>& numAssignedFragments,
+        std::default_random_engine& randEng,
+        bool initialRound,
+        bool& burnedIn
+        ) {
+
+    using salmon::math::LOG_0;
+    using salmon::math::LOG_1;
+    using salmon::math::LOG_ONEHALF;
+    using salmon::math::logAdd;
+    using salmon::math::logSub;
+
+    const uint64_t numBurninFrags = salmonOpts.numBurninFrags;
+    bool useMassBanking = (!initialRound and salmonOpts.useMassBanking);
+
+    auto log = spdlog::get("jointLog");
+    size_t numTranscripts{transcripts.size()};
+    size_t localNumAssignedFragments{0};
+    size_t priorNumAssignedFragments{numAssignedFragments};
+    std::uniform_real_distribution<> uni(0.0, 1.0 + std::numeric_limits<double>::min());
+    std::vector<uint64_t> libTypeCounts(LibraryFormat::maxLibTypeID() + 1);
+
+    std::vector<FragmentStartPositionDistribution>& fragStartDists =
+        readExp.fragmentStartPositionDistributions();
+    auto& biasModel = readExp.sequenceBiasModel();
+
+    bool updateCounts = initialRound;
+    bool useReadCompat = salmonOpts.incompatPrior != salmon::math::LOG_1;
+    bool useFSPD{!salmonOpts.noFragStartPosDist};
+    bool useFragLengthDist{!salmonOpts.noFragLengthDist};
+    bool useSequenceBiasModel{!salmonOpts.noSeqBiasModel};
+    bool noFragLenFactor{salmonOpts.noFragLenFactor};
+
+    const auto expectedLibraryFormat = readLib.format();
+    uint64_t zeroProbFrags{0};
+
+    //EQClass
+    EquivalenceClassBuilder& eqBuilder = readExp.equivalenceClassBuilder();
+
+    // Build reverse map from transcriptID => hit id
+    using HitID = uint32_t;
+    /* This isn't used anymore!!!
+    btree::btree_map<TranscriptID, std::vector<SMEMAlignment*>> hitsForTranscript;
+    size_t hitID{0};
+    for (auto& hv : batchHits) {
+        for (auto& tid : hv->alignments()) {
+            hitsForTranscript[tid.transcriptID()].push_back(&tid);
+        }
+        ++hitID;
+    }
+    double clustTotal = std::log(batchHits.size()) + logForgettingMass;
+    */
+
+    double logForgettingMass{0.0};
+    uint64_t currentMinibatchTimestep{0};
+    fmCalc.getLogMassAndTimestep(logForgettingMass, currentMinibatchTimestep);
+
+    double startingCumulativeMass = fmCalc.cumulativeLogMassAt(firstTimestepOfRound);
+    // BEGIN: DOUBLY-COLLAPSED TESTING
+    struct HitInfo {
+        uint32_t numHits = 0;
+        bool observed = false;
+        double newUniqueMass = LOG_0;
+    };
+
+    std::unordered_map<TranscriptID, HitInfo> hitInfo;
+    // We only need to fill this in if it's not the first round
+    if (useMassBanking) {
+        for (auto& alnGroup : batchHits) {
+            for (auto a : alnGroup->alignments()) {
+                auto transcriptID = a.transcriptID();
+                if (transcriptID < 0 or transcriptID >= transcripts.size()) {
+                    salmonOpts.jointLog->warn("Invalid Transcript ID [{}] encountered", transcriptID);
+                }
+                auto& info = hitInfo[transcriptID];
+                auto& txp = transcripts[transcriptID];
+                if(!info.observed) {
+                    info.observed = true;
+
+                    if (txp.uniqueCount() > 0) {
+                        double dormantInterval = static_cast<double>(currentMinibatchTimestep -
+                                    firstTimestepOfRound + 1);
+                        // The cumulative mass last time this was updated
+                        double prevUpdateMass = startingCumulativeMass;//fmCalc.cumulativeLogMassAt(startTime);
+                        double currentUpdateMass = fmCalc.cumulativeLogMassAt(currentMinibatchTimestep);
+                        double updateFraction = std::log(txp.uniqueUpdateFraction());
+
+                        // The new unique mass to be added to this transcript
+                        double newUniqueMass = salmon::math::logSub(currentUpdateMass, prevUpdateMass) +
+                            updateFraction - std::log(dormantInterval);
+                        info.newUniqueMass = newUniqueMass;
+                    }
+                }
+                info.numHits++;
+            } // end alignments in group
+        } // end batch hits
+    } // end initial round
+    // END: DOUBLY-COLLAPSED TESTING
+
+    int i{0};
+    {
+        // Iterate over each group of alignments (a group consists of all alignments reported
+        // for a single read).  Distribute the read's mass to the transcripts
+        // where it potentially aligns.
+        for (auto& alnGroup : batchHits) {
+            if (alnGroup->size() == 0) { continue; }
+
+            // We start out with probability 0
+            double sumOfAlignProbs{LOG_0};
+            // Record whether or not this read is unique to a single transcript.
+            bool transcriptUnique{true};
+
+            auto firstTranscriptID = alnGroup->alignments().front().transcriptID();
+            std::unordered_set<size_t> observedTranscripts;
+
+            // EQCLASS
+            std::vector<uint32_t> txpIDs;
+            std::vector<double> auxProbs;
+            double auxDenom = salmon::math::LOG_0;
+	        size_t txpIDsHash{0};
+
+            double avgLogBias = salmon::math::LOG_0;
+            uint32_t numInGroup{0};
+            // For each alignment of this read
+            for (auto& aln : alnGroup->alignments()) {
+                auto transcriptID = aln.transcriptID();
+                auto& transcript = transcripts[transcriptID];
+                transcriptUnique = transcriptUnique and (transcriptID == firstTranscriptID);
+
+                double refLength = transcript.RefLength > 0 ? transcript.RefLength : 1.0;
+                double coverage = aln.score();
+                double logFragCov = (coverage > 0) ? std::log(coverage) : LOG_0;
+
+                // The alignment probability is the product of a
+                // transcript-level term (based on abundance and) an
+                // alignment-level term.
+                double logRefLength;
+                if (salmonOpts.noEffectiveLengthCorrection or !burnedIn) {
+                    logRefLength = std::log(transcript.RefLength);
+                } else {
+                    logRefLength = transcript.getCachedEffectiveLength();
+                }
+
+                double transcriptLogCount = transcript.mass(initialRound);
+
+                // BEGIN: DOUBLY-COLLAPSED TESTING
+                // If this is not the initial round, then add the
+                // appropriate proportion of unique read mass for
+                // every ambiguous alignment we encounter. We do
+                // this before the line (below) where we
+                // retrieve this transcript's mass.
+                if (useMassBanking and transcript.uniqueCount() > 0) {
+                    auto txpHitInfo = hitInfo[transcriptID];
+                    transcriptLogCount = salmon::math::logAdd(
+                            transcriptLogCount,
+                            txpHitInfo.newUniqueMass);
+                }
+                // END: DOUBLY-COLLAPSED TESTING
+
+
+                if ( transcriptLogCount != LOG_0 ) {
+                    double errLike = salmon::math::LOG_1;
+                    if (burnedIn) {
+                        // TODO: Make error model for smem-based quantification
+                        //errLike = errMod.logLikelihood(aln, transcript);
+                    }
+
+                    double logFragProb = (useFragLengthDist) ?
+                        ((aln.fragLength() > 0) ?
+                         fragLengthDist.pmf(static_cast<size_t>(aln.fragLength())) :
+                         LOG_1) :
+                         LOG_1;
+
+                    // TESTING
+                    if (noFragLenFactor) { logFragProb = LOG_1; }
+
+
+                    // TODO: Take the fragment length distribution into account
+                    // for single-end fragments as in the alignment-based code below
+                    /*
+                    if (!salmonOpts.noFragLengthDist) {
+                        if(aln->fragLen() == 0) {
+                            if (aln->isLeft() and transcript.RefLength - aln->left() < fragLengthDist.maxVal()) {
+                                logFragProb = fragLengthDist.cmf(transcript.RefLength - aln->left());
+                            } else if (aln->isRight() and aln->right() < fragLengthDist.maxVal()) {
+                                logFragProb = fragLengthDist.cmf(aln->right());
+                            }
+                        } else {
+                            logFragProb = fragLengthDist.pmf(static_cast<size_t>(aln->fragLen()));
+                        }
+                    }
+                    */
+
+                    // The probability that the fragments align to the given strands in the
+                    // given orientations.
+                    double logAlignCompatProb = (useReadCompat) ?
+                                                (salmon::utils::logAlignFormatProb(aln.libFormat(), expectedLibraryFormat, salmonOpts.incompatPrior)) :
+                                                LOG_1;
+
+                    // Allow for a non-uniform fragment start position distribution
+                    double startPosProb = -logRefLength;
+                    auto hitPos = aln.hitPos();
+                    if (useFSPD and burnedIn and hitPos < refLength) {
+                        auto& fragStartDist =
+                            fragStartDists[transcript.lengthClassIndex()];
+                        startPosProb = fragStartDist(hitPos, refLength, logRefLength);
+                    }
+
+                    double logBiasProb = salmon::math::LOG_1;
+                    if (useSequenceBiasModel and burnedIn) {
+                        double fragLength = aln.fragLength();
+                        if (fragLength > 0) {
+                            int32_t leftHitPos = hitPos;
+                            int32_t rightHitPos = hitPos + fragLength;
+                            logBiasProb = biasModel.biasFactor(transcript,
+                                                               leftHitPos,
+                                                               rightHitPos,
+                                                               aln.libFormat());
+                        } else {
+                            logBiasProb = biasModel.biasFactor(transcript,
+                                                               hitPos,
+                                                               aln.libFormat());
+                        }
+
+                    }
+
+                    // Increment the count of this type of read that we've seen
+                    ++libTypeCounts[aln.libFormat().formatID()];
+
+                    double auxProb = startPosProb + logFragProb + logFragCov +
+                                     logAlignCompatProb + logBiasProb;
+
+                    aln.logProb = transcriptLogCount + auxProb;
+
+                    if (std::abs(aln.logProb) == LOG_0) { continue; }
+
+                    if (useSequenceBiasModel and burnedIn) {
+                        avgLogBias = salmon::math::logAdd(avgLogBias, logBiasProb);
+                        numInGroup++;
+                        aln.logBias = logBiasProb;
+                    } else {
+                        avgLogBias = salmon::math::logAdd(avgLogBias, logBiasProb);
+                        numInGroup++;
+                        aln.logBias = salmon::math::LOG_1;
+                    }
+
+                    sumOfAlignProbs = logAdd(sumOfAlignProbs, aln.logProb);
+
+                    if (updateCounts and
+                        observedTranscripts.find(transcriptID) == observedTranscripts.end()) {
+                        transcripts[transcriptID].addTotalCount(1);
+                        observedTranscripts.insert(transcriptID);
+                    }
+                    // EQCLASS
+                    txpIDs.push_back(transcriptID);
+                    auxProbs.push_back(auxProb);
+                    auxDenom = salmon::math::logAdd(auxDenom, auxProb);
+    	            boost::hash_combine(txpIDsHash, transcriptID);
+                } else {
+                    aln.logProb = LOG_0;
+                }
+            }
+
+            // If this fragment has a zero probability,
+            // go to the next one
+            if (sumOfAlignProbs == LOG_0) {
+                ++zeroProbFrags;
+                continue;
+            } else { // otherwise, count it as assigned
+                ++localNumAssignedFragments;
+            }
+
+            if (numInGroup > 0){
+                avgLogBias = avgLogBias - std::log(numInGroup);
+            }
+
+            // EQCLASS
+            TranscriptGroup tg(txpIDs, txpIDsHash);
+            double auxProbSum{0.0};
+            for (auto& p : auxProbs) {
+                //p -= auxDenom;
+                p = std::exp(p - auxDenom);
+                auxProbSum += p;
+            }
+            if (std::abs(auxProbSum - 1.0) > 0.01) {
+                std::cerr << "weights had sum of " << auxProbSum
+                          << " but it should be 1!!\n\n";
+            }
+            eqBuilder.addGroup(std::move(tg), auxProbs);
+
+
+            // normalize the hits
+            for (auto& aln : alnGroup->alignments()) {
+                if (std::abs(aln.logProb) == LOG_0) { continue; }
+                // Normalize the log-probability of this alignment
+                aln.logProb -= sumOfAlignProbs;
+                // Get the transcript referenced in this alignment
+                auto transcriptID = aln.transcriptID();
+                auto& transcript = transcripts[transcriptID];
+
+                // Add the new mass to this transcript
+                double newMass = logForgettingMass + aln.logProb;
+
+                // If this is not the initial round, and we need to
+                // add "banked" mass for this hit, do it now.
+                if (useMassBanking and transcript.uniqueCount() > 0) {
+                    newMass = salmon::math::logAdd(newMass, hitInfo[transcriptID].newUniqueMass);
+                }
+                transcript.addMass( newMass );
+                transcript.addBias( aln.logBias );
+
+                double r = uni(randEng);
+                if (!burnedIn and r < std::exp(aln.logProb)) {
+                    //errMod.update(aln, transcript, aln.logProb, logForgettingMass);
+                    double fragLength = aln.fragLength();
+                    if (useFragLengthDist and fragLength > 0.0) {
+                        //if (aln.fragType() == ReadType::PAIRED_END) {
+                        fragLengthDist.addVal(fragLength, logForgettingMass);
+                    }
+                    if (useFSPD) {
+                        auto hitPos = aln.hitPos();
+                        auto& fragStartDist =
+                            fragStartDists[transcript.lengthClassIndex()];
+                        fragStartDist.addVal(hitPos,
+                                             transcript.RefLength,
+                                             logForgettingMass);
+                    }
+                    if (useSequenceBiasModel) {
+                        if (fragLength > 0.0) {
+                            int32_t leftPos = aln.hitPos();
+                            int32_t rightPos = leftPos + fragLength;
+                            biasModel.update(transcript, leftPos, rightPos,
+                                             aln.libFormat(), logForgettingMass, LOG_1);
+                        } else {
+                            int32_t hitPos = aln.hitPos();
+                            biasModel.update(transcript, hitPos,
+                                             aln.libFormat(),
+                                             logForgettingMass, LOG_1);
+                        }
+                    }
+                }
+            } // end normalize
+
+            double avgBias = std::exp(avgLogBias);
+            // update the single target transcript
+            if (transcriptUnique) {
+                if (updateCounts) {
+                    transcripts[firstTranscriptID].addUniqueCount(1);
+                }
+                clusterForest.updateCluster(
+                        firstTranscriptID,
+                        1.0,
+                        logForgettingMass, updateCounts);
+            } else { // or the appropriate clusters
+                clusterForest.mergeClusters<SMEMAlignment>(alnGroup->alignments().begin(), alnGroup->alignments().end());
+                clusterForest.updateCluster(
+                        alnGroup->alignments().front().transcriptID(),
+                        1.0,
+                        logForgettingMass, updateCounts);
+            }
+
+            } // end read group
+        }// end timer
+
+        double individualTotal = LOG_0;
+        {
+            /*
+            // M-step
+            double totalMass{0.0};
+            for (auto kv = hitsForTranscript.begin(); kv != hitsForTranscript.end(); ++kv) {
+                auto transcriptID = kv->first;
+                // The target must be a valid transcript
+                if (transcriptID >= numTranscripts or transcriptID < 0) {std::cerr << "index " << transcriptID << " out of bounds\n"; }
+
+                auto& transcript = transcripts[transcriptID];
+
+                // The prior probability
+                double hitMass{LOG_0};
+
+                // The set of alignments that match transcriptID
+                auto& hits = kv->second;
+                std::for_each(hits.begin(), hits.end(), [&](SMEMAlignment* aln) -> void {
+                        if (!std::isfinite(aln->logProb)) { std::cerr << "hitMass = " << aln->logProb << "\n"; }
+                        hitMass = logAdd(hitMass, aln->logProb);
+                        });
+
+                double updateMass = logForgettingMass + hitMass;
+                individualTotal = logAdd(individualTotal, updateMass);
+                totalMass = logAdd(totalMass, updateMass);
+                transcript.addMass(updateMass);
+            } // end for
+            */
+        } // end timer
+
+        if (zeroProbFrags > 0) {
+            log->warn("Minibatch contained {} "
+                      "0 probability fragments", zeroProbFrags);
+        }
+
+        numAssignedFragments += localNumAssignedFragments;
+        if (numAssignedFragments >= numBurninFrags and !burnedIn) {
+            burnedIn = true;
+            for (auto& t : transcripts) {  t.updateEffectiveLength(fragLengthDist); }
+            if (useFSPD) {
+                // update all of the fragment start position
+                // distributions
+                for (auto& fspd : fragStartDists) {
+                    fspd.update();
+                }
+            }
+        }
+        if (initialRound) {
+            readLib.updateLibTypeCounts(libTypeCounts);
+        }
+}
+
+uint32_t basesCovered(std::vector<uint32_t>& kmerHits) {
+    std::sort(kmerHits.begin(), kmerHits.end());
+    uint32_t covered{0};
+    uint32_t lastHit{0};
+    uint32_t kl{20};
+    for (auto h : kmerHits) {
+        covered += std::min(h - lastHit, kl);
+        lastHit = h;
+    }
+    return covered;
+}
+
+uint32_t basesCovered(std::vector<uint32_t>& posLeft, std::vector<uint32_t>& posRight) {
+    return basesCovered(posLeft) + basesCovered(posRight);
+}
+
+class KmerVote {
+    public:
+        KmerVote(int32_t vp, uint32_t rp, uint32_t vl) : votePos(vp), readPos(rp), voteLen(vl) {}
+        int32_t votePos{0};
+        uint32_t readPos{0};
+        uint32_t voteLen{0};
+        /*
+        std::string str(){
+            return "<" + votePos  + ", "  + readPos  + ", "  + voteLen + ">";
+        }
+        */
+};
+class MatchFragment {
+    public:
+        MatchFragment(uint32_t refStart_, uint32_t queryStart_, uint32_t length_) :
+            refStart(refStart_), queryStart(queryStart_), length(length_) {}
+
+        uint32_t refStart, queryStart, length;
+        uint32_t weight;
+        double score;
+};
+
+bool precedes(const MatchFragment& a, const MatchFragment& b) {
+    return (a.refStart + a.length) < b.refStart and
+           (a.queryStart + a.length) < b.queryStart;
+}
+
+
+class TranscriptHitList {
+    public:
+        int32_t bestHitPos{0};
+        uint32_t bestHitCount{0};
+        double bestHitScore{0.0};
+
+        std::vector<KmerVote> votes;
+        std::vector<KmerVote> rcVotes;
+
+        uint32_t targetID;
+
+        bool isForward_{true};
+
+        void addFragMatch(uint32_t tpos, uint32_t readPos, uint32_t voteLen) {
+            int32_t votePos = static_cast<int32_t>(tpos) - static_cast<int32_t>(readPos);
+            votes.emplace_back(votePos, readPos, voteLen);
+        }
+
+        void addFragMatchRC(uint32_t tpos, uint32_t readPos, uint32_t voteLen, uint32_t readLen) {
+            //int32_t votePos = static_cast<int32_t>(tpos) - (readPos) + voteLen;
+            int32_t votePos = static_cast<int32_t>(tpos) - (readLen - readPos);
+            rcVotes.emplace_back(votePos, readPos, voteLen);
+        }
+
+        uint32_t totalNumHits() { return std::max(votes.size(), rcVotes.size()); }
+
+        bool computeBestLoc_(std::vector<KmerVote>& sVotes, Transcript& transcript,
+                             std::string& read, bool isRC,
+                             int32_t& maxClusterPos, uint32_t& maxClusterCount, double& maxClusterScore) {
+            // Did we update the highest-scoring cluster? This will be set to
+            // true iff we have a cluster of a higher score than the score
+            // currently given in maxClusterCount.
+            bool updatedMaxScore{false};
+
+            if (sVotes.size() == 0) { return updatedMaxScore; }
+
+            struct VoteInfo {
+                uint32_t coverage = 0;
+                int32_t rightmostBase = 0;
+            };
+
+            uint32_t readLen = read.length();
+
+            boost::container::flat_map<uint32_t, VoteInfo> hitMap;
+            int32_t currClust{static_cast<int32_t>(sVotes.front().votePos)};
+
+            for (size_t j = 0; j < sVotes.size(); ++j) {
+
+                int32_t votePos = sVotes[j].votePos;
+                uint32_t readPos = sVotes[j].readPos;
+                uint32_t voteLen = sVotes[j].voteLen;
+
+                if (votePos >= currClust) {
+                    if (votePos - currClust > 10) {
+                        currClust = votePos;
+                    }
+                    auto& hmEntry = hitMap[currClust];
+
+                    hmEntry.coverage += std::min(voteLen, (votePos + readPos + voteLen) - hmEntry.rightmostBase);
+                    hmEntry.rightmostBase = votePos + readPos + voteLen;
+                } else if (votePos < currClust) {
+                    std::cerr << "Should not have votePos = " << votePos << " <  currClust = " << currClust << "\n";
+                    std::exit(1);
+                }
+
+                if (hitMap[currClust].coverage > maxClusterCount) {
+                    maxClusterCount = hitMap[currClust].coverage;
+                    maxClusterPos = currClust;
+                    maxClusterScore = maxClusterCount / static_cast<double>(readLen);
+                    updatedMaxScore = true;
+                }
+
+            }
+            return updatedMaxScore;
+        }
+
+        bool computeBestLoc2_(std::vector<KmerVote>& sVotes, uint32_t tlen,
+                              int32_t& maxClusterPos, uint32_t& maxClusterCount, double& maxClusterScore) {
+
+            bool updatedMaxScore{false};
+
+            if (sVotes.size() == 0) { return updatedMaxScore; }
+
+            double weights[] = { 1.0, 0.983471453822, 0.935506985032,
+                0.860707976425, 0.765928338365, 0.6592406302, 0.548811636094,
+                0.441902209585, 0.344153786865, 0.259240260646,
+                0.188875602838};
+
+            uint32_t maxGap = 4;
+            uint32_t leftmost = (sVotes.front().votePos > maxGap) ? (sVotes.front().votePos - maxGap) : 0;
+            uint32_t rightmost = std::min(sVotes.back().votePos + maxGap, tlen);
+
+            uint32_t span = (rightmost - leftmost);
+            std::vector<double> probAln(span, 0.0);
+            double kwidth = 1.0 / (2.0 * maxGap);
+
+            size_t nvotes = sVotes.size();
+            for (size_t j = 0; j < nvotes; ++j) {
+                uint32_t votePos = sVotes[j].votePos;
+                uint32_t voteLen = sVotes[j].voteLen;
+
+                auto x = j + 1;
+                while (x < nvotes and sVotes[x].votePos == votePos) {
+                    voteLen += sVotes[x].voteLen;
+                    j += 1;
+                    x += 1;
+                }
+
+
+                uint32_t dist{0};
+                size_t start = (votePos >= maxGap) ? (votePos - maxGap - leftmost) : (votePos - leftmost);
+                size_t mid = votePos - leftmost;
+                size_t end = std::min(votePos + maxGap - leftmost, rightmost - leftmost);
+                for (size_t k = start; k < end; k += 1) {
+                    dist = (mid > k) ? mid - k : k - mid;
+                    probAln[k] += weights[dist] * voteLen;
+                    if (probAln[k] > maxClusterScore) {
+                        maxClusterScore = probAln[k];
+                        maxClusterPos = k + leftmost;
+                        updatedMaxScore = true;
+                    }
+                }
+            }
+
+            return updatedMaxScore;
+        }
+
+
+        inline uint32_t numSampledHits_(Transcript& transcript, std::string& readIn,
+                                        int32_t votePos, int32_t posInRead, int32_t voteLen, bool isRC, uint32_t numTries) {
+
+
+            // The read starts at this position in the transcript (may be negative!)
+            int32_t readStart = votePos;
+            // The (uncorrected) length of the read
+            int32_t readLen = readIn.length();
+            // Pointer to the sequence of the read
+            const char* read = readIn.c_str();
+            // Don't mess around with unsigned arithmetic here
+            int32_t tlen = transcript.RefLength;
+
+            // If the read starts before the first base of the transcript,
+            // trim off the initial overhang  and correct the other variables
+            if (readStart < 0) {
+                if (isRC) {
+                    uint32_t correction = -readStart;
+                    //std::cerr << "readLen = " << readLen << ", posInRead = " << posInRead << ", voteLen = " << voteLen << ", correction = " << correction << "\n";
+                    //std::cerr << "tlen = " << tlen << ", votePos = " << votePos << "\n";
+                    read += correction;
+                    readLen -= correction;
+                    posInRead -= correction;
+                    readStart = 0;
+                } else {
+                    uint32_t correction = -readStart;
+                    read += correction;
+                    readLen -= correction;
+                    posInRead -= correction;
+                    readStart = 0;
+                }
+            }
+            // If the read hangs off the end of the transcript,
+            // shorten its effective length.
+            if (readStart + readLen >= tlen) {
+                if (isRC) {
+                    uint32_t correction = (readStart + readLen) - transcript.RefLength + 1;
+                    //std::cerr << "Trimming RC hit: correction = " << correction << "\n";
+                    //std::cerr << "untrimmed read : "  << read << "\n";
+                    read += correction;
+                    readLen -= correction;
+                    if (voteLen > readLen) { voteLen = readLen; }
+                    posInRead = 0;
+                } else {
+                    readLen = tlen - (readStart + 1);
+                    voteLen = std::max(voteLen, readLen - (posInRead + voteLen));
+                }
+            }
+            // Finally, clip any reverse complement reads starting at 0
+            if (isRC) {
+
+                if (voteLen > readStart) {
+                    readLen -= (readLen - (posInRead + voteLen));
+                }
+
+            }
+
+            // If the read is too short, it's not useful
+            if (readLen <= 15) { return 0; }
+            // The step between sample centers (given the number of samples we're going to take)
+            double step = (readLen - 1) / static_cast<double>(numTries-1);
+            // The strand of the transcript from which we'll extract sequence
+            auto dir = (isRC) ? salmon::stringtools::strand::reverse :
+                                salmon::stringtools::strand::forward;
+
+            bool superVerbose{false};
+
+            if (superVerbose) {
+                std::stringstream ss;
+                ss << "Supposed hit " << (isRC ? "RC" : "") << "\n";
+                ss << "info: votePos = " << votePos << ", posInRead = " << posInRead
+                    << ", voteLen = " << voteLen << ", readLen = " << readLen
+                    << ", tran len = " << tlen << ", step = " << step << "\n";
+                if (readStart + readLen > tlen ) {
+                    ss << "ERROR!!!\n";
+                    std::cerr << "[[" << ss.str() << "]]";
+                    std::exit(1);
+                }
+                ss << "Transcript name = " << transcript.RefName << "\n";
+                ss << "T : ";
+                try {
+                    for ( size_t j = 0; j < readLen; ++j) {
+                        if (isRC) {
+                            if (j == posInRead) {
+                                char red[] = "\x1b[30m";
+                                red[3] = '0' + static_cast<char>(fmt::RED);
+                                ss << red;
+                            }
+
+                            if (j == posInRead + voteLen) {
+                                const char RESET_COLOR[] = "\x1b[0m";
+                                ss << RESET_COLOR;
+                            }
+                            ss << transcript.charBaseAt(readStart+readLen-j,dir);
+                        } else {
+                            if (j == posInRead ) {
+                                char red[] = "\x1b[30m";
+                                red[3] = '0' + static_cast<char>(fmt::RED);
+                                ss << red;
+                            }
+
+                            if (j == posInRead + voteLen) {
+                                const char RESET_COLOR[] = "\x1b[0m";
+                                ss << RESET_COLOR;
+                            }
+
+                            ss << transcript.charBaseAt(readStart+j);
+                        }
+                    }
+                    ss << "\n";
+                    char red[] = "\x1b[30m";
+                    red[3] = '0' + static_cast<char>(fmt::RED);
+                    const char RESET_COLOR[] = "\x1b[0m";
+
+                    ss << "R : " << std::string(read, posInRead) << red << std::string(read + posInRead, voteLen) << RESET_COLOR;
+                    if (readLen > posInRead + voteLen) { ss << std::string(read + posInRead + voteLen); }
+                    ss << "\n\n";
+                } catch (std::exception& e) {
+                    std::cerr << "EXCEPTION !!!!!! " << e.what() << "\n";
+                }
+                std::cerr << ss.str() << "\n";
+                ss.clear();
+            }
+
+            // The index of the current sample within the read
+            int32_t readIndex = 0;
+
+            // The number of loci in the subvotes and their
+            // offset patternns
+            size_t lpos = 3;
+            int leftPattern[] = {-4, -2, 0};
+            int rightPattern[] = {0, 2, 4};
+            int centerPattern[] = {-4, 0, 4};
+
+            // The number of subvote hits we've had
+            uint32_t numHits = 0;
+            // Take the samples
+            for (size_t i  = 0; i < numTries; ++i) {
+                // The sample will be centered around this point
+                readIndex = static_cast<uint32_t>(std::round(readStart + i * step)) - readStart;
+
+                // The number of successful sub-ovtes we have
+                uint32_t subHit = 0;
+                // Select the center sub-vote pattern, unless we're near the end of a read
+                int* pattern = &centerPattern[0];
+                if (readIndex + pattern[0] < 0) {
+                    pattern = &rightPattern[0];
+                } else if (readIndex + pattern[lpos-1] >= readLen) {
+                    pattern = &leftPattern[0];
+                }
+
+                // collect the subvotes
+                for (size_t j = 0; j < lpos; ++j) {
+                    // the pattern offset
+                    int offset = pattern[j];
+                    // and sample position it implies within the read
+                    int readPos = readIndex + offset;
+
+                    if (readStart + readPos >= tlen) {
+                        std::cerr  << "offset = " << offset << ", readPos = " << readPos << ", readStart = " << readStart << ", readStart + readPos = " << readStart + readPos << ", tlen = " << transcript.RefLength << "\n";
+                    }
+
+                    subHit += (isRC) ?
+                        (transcript.charBaseAt(readStart + readLen - readPos, dir) == salmon::stringtools::charCanon[read[readPos]]) :
+                        (transcript.charBaseAt(readStart + readPos               ) == salmon::stringtools::charCanon[read[readPos]]);
+                }
+                // if the entire subvote was successful, this is a hit
+                numHits += (subHit == lpos);
+            }
+            // return the number of hits we had
+            return numHits;
+        }
+
+
+
+        bool computeBestLoc3_(std::vector<KmerVote>& sVotes, Transcript& transcript,
+                              std::string& read, bool isRC,
+                              int32_t& maxClusterPos, uint32_t& maxClusterCount, double& maxClusterScore) {
+
+            bool updatedMaxScore{false};
+
+            if (sVotes.size() == 0) { return updatedMaxScore; }
+
+            struct LocHitCount {
+                int32_t loc;
+                uint32_t nhits;
+            };
+
+            uint32_t numSamp = 15;
+            std::vector<LocHitCount> hitCounts;
+            size_t nvotes = sVotes.size();
+            int32_t prevPos = -std::numeric_limits<int32_t>::max();
+            for (size_t j = 0; j < nvotes; ++j) {
+                int32_t votePos = sVotes[j].votePos;
+                int32_t posInRead = sVotes[j].readPos;
+                int32_t voteLen = sVotes[j].voteLen;
+                if (prevPos == votePos) { continue; }
+                auto numHits = numSampledHits_(transcript, read, votePos, posInRead, voteLen, isRC, numSamp);
+                hitCounts.push_back({votePos, numHits});
+                prevPos = votePos;
+            }
+
+            uint32_t maxGap = 8;
+            uint32_t hitIdx = 0;
+            uint32_t accumHits = 0;
+            int32_t hitLoc = hitCounts[hitIdx].loc;
+            while (hitIdx < hitCounts.size()) {
+                uint32_t idx2 = hitIdx;
+                while (idx2 < hitCounts.size() and std::abs(hitCounts[idx2].loc - hitLoc) <= maxGap) {
+                    accumHits += hitCounts[idx2].nhits;
+                    ++idx2;
+                }
+
+                double score = static_cast<double>(accumHits) / numSamp;
+                if (score > maxClusterScore) {
+                    maxClusterCount = accumHits;
+                    maxClusterScore = score;
+                    maxClusterPos = hitCounts[hitIdx].loc;
+                    updatedMaxScore = true;
+                }
+                accumHits = 0;
+                ++hitIdx;
+                hitLoc = hitCounts[hitIdx].loc;
+            }
+
+            return updatedMaxScore;
+        }
+
+
+        bool computeBestChain(Transcript& transcript, std::string& read) {
+            std::sort(votes.begin(), votes.end(),
+                    [](const KmerVote& v1, const KmerVote& v2) -> bool {
+                        if (v1.votePos == v2.votePos) {
+                            return v1.readPos < v2.readPos;
+                        }
+                        return v1.votePos < v2.votePos;
+                    });
+
+            std::sort(rcVotes.begin(), rcVotes.end(),
+                    [](const KmerVote& v1, const KmerVote& v2) -> bool {
+                        if (v1.votePos == v2.votePos) {
+                            return v1.readPos < v2.readPos;
+                        }
+                        return v1.votePos < v2.votePos;
+                    });
+
+            int32_t maxClusterPos{0};
+            uint32_t maxClusterCount{0};
+            double maxClusterScore{0.0};
+
+            // we don't need the return value from the first call
+            static_cast<void>(computeBestLoc_(votes, transcript, read, false, maxClusterPos, maxClusterCount, maxClusterScore));
+            bool revIsBest = computeBestLoc_(rcVotes, transcript, read, true, maxClusterPos, maxClusterCount, maxClusterScore);
+            isForward_ = not revIsBest;
+
+            bestHitPos = maxClusterPos;
+            bestHitCount = maxClusterCount;
+            bestHitScore = maxClusterScore;
+            return true;
+        }
+
+        bool isForward() { return isForward_; }
+
+};
+
+template <typename CoverageCalculator>
+inline void collectHitsForRead(const bwaidx_t *idx, const bwtintv_v* a, smem_aux_t* auxHits,
+                        mem_opt_t* memOptions, const SalmonOpts& salmonOpts, const uint8_t* read, uint32_t readLen,
+                        std::unordered_map<uint64_t, CoverageCalculator>& hits) {
+
+    mem_collect_intv(salmonOpts, memOptions, idx->bwt, readLen, read, auxHits);
+
+    // For each MEM
+    for (int i = 0; i < auxHits->mem.n; ++i ) {
+        // A pointer to the interval of the MEMs occurences
+        bwtintv_t* p = &auxHits->mem.a[i];
+        // The start and end positions in the query string (i.e. read) of the MEM
+        int qstart = p->info>>32;
+        uint32_t qend = static_cast<uint32_t>(p->info);
+        int step, count, slen = (qend - qstart); // seed length
+
+        int64_t k;
+        step = p->x[2] > memOptions->max_occ? p->x[2] / memOptions->max_occ : 1;
+        // For every occurrence of the MEM
+        for (k = count = 0; k < p->x[2] && count < memOptions->max_occ; k += step, ++count) {
+            bwtint_t pos;
+            bwtint_t startPos, endPos;
+            int len, isRev, isRevStart, isRevEnd, refID, refIDStart, refIDEnd;
+            int queryStart = qstart;
+            len = slen;
+            uint32_t rlen = readLen;
+
+            // Get the position in the reference index of this MEM occurrence
+            int64_t refStart = bwt_sa(idx->bwt, p->x[0] + k);
+
+            pos = startPos = bns_depos(idx->bns, refStart, &isRevStart);
+            endPos = bns_depos(idx->bns, refStart + slen - 1, &isRevEnd);
+            // If we span the forward/reverse boundary, discard the hit
+            if (isRevStart != isRevEnd) {
+                continue;
+            }
+            // Otherwise, isRevStart = isRevEnd so just assign isRev = isRevStart
+            isRev = isRevStart;
+
+            // If the hit is reversed --- swap the start and end
+            if (isRev) {
+                if (endPos > startPos) {
+                    salmonOpts.jointLog->warn("Hit is supposedly reversed, "
+                                              "but startPos = {} < endPos = {}",
+                                              startPos, endPos);
+                }
+                auto temp = startPos;
+                startPos = endPos;
+                endPos = temp;
+            }
+            // Get the ID of the reference sequence in which it occurs
+            refID = refIDStart = bns_pos2rid(idx->bns, startPos);
+            refIDEnd = bns_pos2rid(idx->bns, endPos);
+
+            if (refID < 0) { continue; } // bridging multiple reference sequences or the forward-reverse boundary;
+
+            auto tlen = idx->bns->anns[refID].len;
+
+            // The refence sequence-relative (e.g. transcript-relative) position of the MEM
+            long hitLoc = static_cast<long>(isRev ? endPos : startPos) - idx->bns->anns[refID].offset;
+
+            if ((refIDStart != refIDEnd)) {
+                // If a seed spans two transcripts
+
+                // If we're not considering splitting such seeds, then
+                // just discard this seed and continue.
+                if (not salmonOpts.splitSpanningSeeds) { continue; }
+
+                //std::cerr << "Seed spans two transcripts! --- attempting to split: \n";
+                if (!isRev) {
+                    // If it's going forward, we have a situation like this
+                    // packed transcripts: t1 ===========|t2|==========>
+                    // hit:                          |==========>
+
+                    // length of hit in t1
+                    auto len1 = tlen - hitLoc;
+                    // length of hit in t2
+                    auto len2 = slen - len1;
+                    if (std::max(len1, len2) < memOptions->min_seed_len) { continue; }
+
+                    /** Keeping this here for now in case I need to debug splitting seeds again
+                    std::cerr << "\t hit is in the forward direction: ";
+                    std::cerr << "t1 part has length " << len1 << ", t2 part has length " << len2 << "\n";
+                    */
+
+                    // If the part in t1 is larger then just cut off the rest
+                    if (len1 >= len2) {
+                        slen = len1;
+                        int32_t votePos = static_cast<int32_t>(hitLoc) - queryStart;
+                        //std::cerr << "\t\t t1 (of length " << tlen << ") has larger hit --- new hit length = " << len1 << "; starts at pos " << queryStart << " in the read (votePos will be " << votePos << ")\n";
+                    } else {
+                        // Otherwise, make the hit be in t2.
+                        // Because the hit spans the boundary where t2 begins,
+                        // the new seed begins matching at position 0 of
+                        // transcript t2
+                        hitLoc = 0;
+                        slen = len2;
+                        // The seed originally started at position q, now it starts  len1 characters to the  right of that
+                        queryStart += len1;
+                        refID = refIDEnd;
+                        int32_t votePos = static_cast<int32_t>(hitLoc) - queryStart;
+                        tlen = idx->bns->anns[refID].len;
+                        //std::cerr << "\t\t t2 (of length " << tlen << ") has larger hit --- new hit length = " << len2 << "; starts at pos " << queryStart << " in the read (votePos will be " << votePos << ")\n";
+                    }
+                } else {
+
+                    // If it's going in the reverse direction, we have a situation like this
+                    // packed transcripts: t1 <===========|t2|<==========
+                    // hit:                          X======Y>======Z>
+                    // Which means we have
+                    // packed transcripts: t1 <===========|t2|<==========
+                    // hit:                          <Z=====Y<======X
+                    // length of hit in t1
+
+                    auto len2 = endPos - idx->bns->anns[refIDEnd].offset;
+                    auto len1 = slen - len2;
+                    if (std::max(len1, len2) < memOptions->min_seed_len) { continue; }
+
+                    /** Keeping this here for now in case I need to debug splitting seeds again
+                    std::cerr << "\t hit is in the reverse direction: ";
+                    std::cerr << "\n\n";
+                    std::cerr << "startPos = " << startPos << ", endPos = " << endPos << ", offset[refIDStart] = "
+                              <<  idx->bns->anns[refIDStart].offset << ", offset[refIDEnd] = " << idx->bns->anns[refIDEnd].offset << "\n";
+                    std::cerr << "\n\n";
+                    std::cerr << "t1 part has length " << len1 << ", t2 part has length " << len2 << "\n\n";
+                    */
+
+                    if (len1 >= len2) {
+                        slen = len1;
+                        hitLoc = tlen - len2;
+                        queryStart += len2;
+                        rlen -= len2;
+                        int32_t votePos = static_cast<int32_t>(hitLoc) - (rlen - queryStart);
+                        //std::cerr << "\t\t t1 (hitLoc: " << hitLoc << ") (of length " << tlen << ") has larger hit --- new hit length = " << len1 << "; starts at pos " << queryStart << " in the read (votePos will be " << votePos << ")\n";
+                    } else {
+                        slen = len2;
+                        refID = bns_pos2rid(idx->bns, endPos);
+                        tlen = idx->bns->anns[refID].len;
+                        hitLoc = len2;
+                        rlen = hitLoc + queryStart;
+                        int32_t votePos = static_cast<int32_t>(hitLoc) - (rlen - queryStart);
+                        //std::cerr << "\t\t t2 (of length " << tlen << ") (hitLoc: " << hitLoc << ") has larger hit --- new hit length = " << len2 << "; starts at pos " << queryStart << " in the read (votePos will be " << votePos << ")\n";
+                    }
+                }
+
+            }
+
+            if (isRev) {
+                hits[refID].addFragMatchRC(hitLoc, queryStart , slen, rlen);
+            } else {
+                hits[refID].addFragMatch(hitLoc, queryStart, slen);
+            }
+        } // for k
+    }
+}
+
+inline bool consistentNames(header_sequence_qual& r) {
+    return true;
+}
+
+bool consistentNames(std::pair<header_sequence_qual, header_sequence_qual>& rp) {
+        auto l1 = rp.first.header.length();
+        auto l2 = rp.second.header.length();
+        char* sptr = static_cast<char*>(memchr(&rp.first.header[0], ' ', l1));
+
+        bool compat = false;
+        // If we didn't find a space in the name of read1
+        if (sptr == NULL) {
+            if (l1 > 1) {
+                compat = (l1 == l2);
+                compat = compat and (memcmp(&rp.first.header[0], &rp.second.header[0], l1-1) == 0);
+                compat = compat and ((rp.first.header[l1-1] == '1' and rp.second.header[l2-1] == '2')
+                                or   (rp.first.header[l1-1] == rp.second.header[l2-1]));
+            } else {
+                compat = (l1 == l2);
+                compat = compat and (rp.first.header[0] == rp.second.header[0]);
+            }
+        } else {
+            size_t offset = sptr - (&rp.first.header[0]);
+
+            // If read2 matches read1 up to and including the space
+            if (offset + 1 < l2) {
+                compat = memcmp(&rp.first.header[0], &rp.second.header[0], offset) == 0;
+                // and after the space, read1 and read2 have an identical character or
+                // read1 has a '1' and read2 has a '2', then this is a consistent pair.
+                compat = compat and ((rp.first.header[offset+1] == rp.second.header[offset+1])
+                                or   (rp.first.header[offset+1] == '1' and rp.second.header[offset+1] == '2'));
+            } else {
+                compat = false;
+            }
+        }
+        return compat;
+}
+
+/**
+ *  Returns true if the @hit is within @cutoff bases of the end of
+ *  transcript @txp and false otherwise.
+ */
+template <typename CoverageCalculator>
+inline bool nearEndOfTranscript(
+            CoverageCalculator& hit,
+            Transcript& txp,
+            int32_t cutoff=std::numeric_limits<int32_t>::max()) {
+	// check if hit appears close to the end of the given transcript
+	auto hitPos = hit.bestHitPos;
+	return (hitPos < cutoff or
+            std::abs(static_cast<int32_t>(txp.RefLength) - hitPos) < cutoff);
+
+}
+
+template <typename CoverageCalculator>
+void getHitsForFragment(std::pair<header_sequence_qual, header_sequence_qual>& frag,
+                        bwaidx_t *idx,
+                        smem_i *itr,
+                        const bwtintv_v *a,
+                        smem_aux_t* auxHits,
+                        mem_opt_t* memOptions,
+                        const SalmonOpts& salmonOpts,
+                        double coverageThresh,
+                        uint64_t& upperBoundHits,
+                        AlignmentGroup<SMEMAlignment>& hitList,
+                        uint64_t& hitListCount,
+                        std::vector<Transcript>& transcripts) {
+
+    std::unordered_map<uint64_t, CoverageCalculator> leftHits;
+
+    std::unordered_map<uint64_t, CoverageCalculator> rightHits;
+
+    uint32_t leftReadLength{0};
+    uint32_t rightReadLength{0};
+
+    /**
+    * As soon as we can decide on an acceptable way to validate read names,
+    * we'll inform the user and quit if we see something inconsistent.  However,
+    * we first need a reasonable way to verify potential naming formats from
+    * many different sources.
+    */
+    /*
+    if (!consistentNames(frag)) {
+        fmt::MemoryWriter errstream;
+
+        errstream << "Inconsistent paired-end reads!\n";
+        errstream << "mate1 : " << frag.first.header << "\n";
+        errstream << "mate2 : " << frag.second.header << "\n";
+        errstream << "Paired-end reads should appear consistently in their respective files.\n";
+        errstream << "Please fix the paire-end input before quantifying with salmon; exiting.\n";
+
+        std::cerr << errstream.str();
+        std::exit(-1);
+    }
+    */
+
+    //---------- End 1 ----------------------//
+    {
+        std::string readStr   = frag.first.seq;
+        uint32_t readLen      = readStr.size();
+
+        leftReadLength = readLen;
+
+        for (int p = 0; p < readLen; ++p) {
+            readStr[p] = nst_nt4_table[static_cast<int>(readStr[p])];
+        }
+
+        collectHitsForRead(idx, a, auxHits,
+                            memOptions,
+                            salmonOpts,
+                            reinterpret_cast<const uint8_t*>(readStr.c_str()),
+                            readLen,
+                            leftHits);
+    }
+
+    //---------- End 2 ----------------------//
+    {
+        std::string readStr   = frag.second.seq;
+        uint32_t readLen      = readStr.size();
+
+        rightReadLength = readLen;
+
+        for (int p = 0; p < readLen; ++p) {
+            readStr[p] = nst_nt4_table[static_cast<int>(readStr[p])];
+        }
+
+        collectHitsForRead(idx, a, auxHits,
+                            memOptions,
+                            salmonOpts,
+                            reinterpret_cast<const uint8_t*>(readStr.c_str()),
+                            readLen,
+                            rightHits);
+     } // end right
+
+    upperBoundHits += (leftHits.size() + rightHits.size() > 0) ? 1 : 0;
+    size_t readHits{0};
+    auto& alnList = hitList.alignments();
+    hitList.isUniquelyMapped() = true;
+    alnList.clear();
+
+    //std::cerr << "leftHits.size() = " << leftHits.size() << ", leftHitsOld.size() = " << leftHitsOld.size() <<
+    //             "rightHits.size() = " << rightHits.size() << ", rightHitsOld.size() = "<< rightHitsOld.size() << "\n";
+
+    double cutoffLeft{ coverageThresh };//* leftReadLength};
+    double cutoffRight{ coverageThresh };//* rightReadLength};
+
+    uint64_t leftHitCount{0};
+
+    // Fraction of the optimal coverage that a lightweight alignment
+    // must obtain in order to be retained.
+    float fOpt{0.9};
+
+    // First, see if there are transcripts where both ends of the
+    // fragments map
+    auto& minHitList = (leftHits.size() < rightHits.size()) ? leftHits : rightHits;
+    auto& maxHitList = (leftHits.size() < rightHits.size()) ? rightHits : leftHits;
+
+    std::vector<uint64_t> jointHits; // haha (variable name)!
+    jointHits.reserve(minHitList.size());
+
+    {
+        auto notFound = maxHitList.end();
+        for (auto& kv : minHitList) {
+            uint64_t refID = kv.first;
+            if (maxHitList.find(refID) != notFound) {
+                jointHits.emplace_back(refID);
+            }
+        }
+    }
+
+    // Check if the fragment generated orphaned
+    // lightweight alignments.
+    bool isOrphan = (jointHits.size() == 0);
+
+    uint32_t firstTranscriptID = std::numeric_limits<uint32_t>::max();
+    double bestScore = -std::numeric_limits<double>::max();
+    bool sortedByTranscript = true;
+    int32_t lastTranscriptId = std::numeric_limits<int32_t>::min();
+
+    if (BOOST_UNLIKELY(isOrphan)) {
+        return;
+        bool foundValidHit{false};
+        // search for a hit on the left
+        for (auto& tHitList : leftHits) {
+            auto transcriptID = tHitList.first;
+            Transcript& t = transcripts[transcriptID];
+            tHitList.second.computeBestChain(t, frag.first.seq);
+            double score = tHitList.second.bestHitScore;
+
+            if (score >= fOpt * bestScore and score >= cutoffLeft) {
+    	    	// make sure orphaned fragment is near the end of the transcript
+	    	    if (!nearEndOfTranscript(tHitList.second, t, 200)) { continue; }
+
+                foundValidHit = true;
+
+        		if (score > bestScore) { bestScore = score; }
+                bool isForward = tHitList.second.isForward();
+                int32_t hitPos = tHitList.second.bestHitPos;
+                auto fmt = salmon::utils::hitType(hitPos, isForward);
+
+                if (leftHitCount == 0) {
+                    firstTranscriptID = transcriptID;
+                } else if (hitList.isUniquelyMapped() and transcriptID != firstTranscriptID) {
+                    hitList.isUniquelyMapped() = false;
+                }
+
+                if (transcriptID  < lastTranscriptId) {
+                    sortedByTranscript = false;
+                }
+
+                alnList.emplace_back(transcriptID, fmt, score, hitPos);
+                readHits += score;
+                ++hitListCount;
+                ++leftHitCount;
+            }
+        }
+
+        //if (!foundValidHit) {
+            // search for a hit on the right
+            for (auto& tHitList : rightHits) {
+                auto transcriptID = tHitList.first;
+                Transcript& t = transcripts[transcriptID];
+                tHitList.second.computeBestChain(t, frag.second.seq);
+                double score = tHitList.second.bestHitScore;
+
+                if (score >= fOpt * bestScore and score >= cutoffRight) {
+        		    // make sure orphaned fragment is near the end of the transcript
+	        	    if (!nearEndOfTranscript(tHitList.second, t, 200)) { continue; }
+
+                    if (score > bestScore) { bestScore = score; }
+                    foundValidHit = true;
+                    bool isForward = tHitList.second.isForward();
+                    int32_t hitPos = tHitList.second.bestHitPos;
+                    auto fmt = salmon::utils::hitType(hitPos, isForward);
+                    if (leftHitCount == 0) {
+                        firstTranscriptID = tHitList.first;
+                    } else if (hitList.isUniquelyMapped() and transcriptID != firstTranscriptID) {
+                        hitList.isUniquelyMapped() = false;
+                    }
+
+                    alnList.emplace_back(transcriptID, fmt, score, hitPos);
+                    readHits += score;
+                    ++hitListCount;
+                    ++leftHitCount;
+                }
+            }
+        //}
+
+        if (alnList.size() > 0) {
+            auto newEnd = std::stable_partition(alnList.begin(), alnList.end(),
+                           [bestScore, fOpt](SMEMAlignment& aln) -> bool {
+                                return aln.score() >= fOpt * bestScore;
+                           });
+            alnList.resize(std::distance(alnList.begin(), newEnd));
+            if (!sortedByTranscript) {
+
+                std::sort(alnList.begin(), alnList.end(),
+                          [](const SMEMAlignment& x, const SMEMAlignment& y) -> bool {
+                           return x.transcriptID() < y.transcriptID();
+                          });
+            }
+        }
+
+    } else {
+        for (auto transcriptID : jointHits) {
+            Transcript& t = transcripts[transcriptID];
+            auto& leftHitList = leftHits[transcriptID];
+            leftHitList.computeBestChain(t, frag.first.seq);
+            if (leftHitList.bestHitScore >= cutoffLeft) {
+                auto& rightHitList = rightHits[transcriptID];
+                rightHitList.computeBestChain(t, frag.second.seq);
+                if (rightHitList.bestHitScore < cutoffRight) { continue; }
+
+                auto end1Start = leftHitList.bestHitPos;
+                auto end2Start = rightHitList.bestHitPos;
+
+                double score = (leftHitList.bestHitScore + rightHitList.bestHitScore) * 0.5;
+                if (score < fOpt * bestScore) { continue; }
+
+                if (score > bestScore) {
+                    bestScore = score;
+                }
+
+                uint32_t fragLength = std::abs(static_cast<int32_t>(end1Start) -
+                                               static_cast<int32_t>(end2Start)) + rightReadLength;
+
+                bool end1IsForward = leftHitList.isForward();
+                bool end2IsForward = rightHitList.isForward();
+
+                uint32_t end1Pos = (end1IsForward) ? leftHitList.bestHitPos : leftHitList.bestHitPos + leftReadLength;
+                uint32_t end2Pos = (end2IsForward) ? rightHitList.bestHitPos : rightHitList.bestHitPos + rightReadLength;
+        		bool canDovetail = false;
+                auto fmt = salmon::utils::hitType(end1Pos, end1IsForward, leftReadLength, end2Pos, end2IsForward, rightReadLength, canDovetail);
+
+                if (readHits == 0) {
+                    firstTranscriptID = transcriptID;
+                } else if (hitList.isUniquelyMapped() and transcriptID != firstTranscriptID) {
+                     hitList.isUniquelyMapped() = false;
+                }
+
+                int32_t minHitPos = std::min(end1Pos, end2Pos);
+                if (transcriptID  < lastTranscriptId) {
+                    sortedByTranscript = false;
+                }
+                alnList.emplace_back(transcriptID, fmt, score, minHitPos, fragLength);
+                ++readHits;
+                ++hitListCount;
+            }
+        } // end for jointHits
+        if (alnList.size() > 0) {
+            auto newEnd = std::stable_partition(alnList.begin(), alnList.end(),
+                           [bestScore, fOpt](SMEMAlignment& aln) -> bool {
+                                return aln.score() >= fOpt * bestScore;
+                           });
+            alnList.resize(std::distance(alnList.begin(), newEnd));
+            if (!sortedByTranscript) {
+                std::sort(alnList.begin(), alnList.end(),
+                          [](const SMEMAlignment& x, const SMEMAlignment& y) -> bool {
+                           return x.transcriptID() < y.transcriptID();
+                          });
+            }
+        }
+    } // end else
+
+    /*
+    for (auto& tHitList : leftHits) {
+        // Coverage score
+        Transcript& t = transcripts[tHitList.first];
+        tHitList.second.computeBestChain(t, frag.first.seq);
+        ++leftHitCount;
+    }
+
+    uint32_t firstTranscriptID = std::numeric_limits<uint32_t>::max();
+
+    for (auto& tHitList : rightHits) {
+
+        auto it = leftHits.find(tHitList.first);
+        // Coverage score
+        if (it != leftHits.end() and it->second.bestHitScore >= cutoffLeft) {
+            Transcript& t = transcripts[tHitList.first];
+            tHitList.second.computeBestChain(t, frag.second.seq);
+            if (tHitList.second.bestHitScore < cutoffRight) { continue; }
+
+            auto end1Start = it->second.bestHitPos;
+            auto end2Start = tHitList.second.bestHitPos;
+
+            double score = (it->second.bestHitScore + tHitList.second.bestHitScore) * 0.5;
+            uint32_t fragLength = std::abs(static_cast<int32_t>(end1Start) -
+                                           static_cast<int32_t>(end2Start)) + rightReadLength;
+
+            bool end1IsForward = it->second.isForward();
+            bool end2IsForward = tHitList.second.isForward();
+
+            uint32_t end1Pos = (end1IsForward) ? it->second.bestHitPos : it->second.bestHitPos + leftReadLength;
+            uint32_t end2Pos = (end2IsForward) ? tHitList.second.bestHitPos : tHitList.second.bestHitPos + rightReadLength;
+            auto fmt = salmon::utils::hitType(end1Pos, end1IsForward, end2Pos, end2IsForward);
+
+            if (readHits == 0) {
+                firstTranscriptID = tHitList.first;
+            } else if (hitList.isUniquelyMapped() and tHitList.first != firstTranscriptID) {
+                hitList.isUniquelyMapped() = false;
+            }
+
+            alnList.emplace_back(tHitList.first, fmt, score, fragLength);
+            ++readHits;
+            ++hitListCount;
+        }
+    }
+    */
+}
+
+/**
+  *   Get hits for single-end fragment
+  *
+  *
+  */
+template <typename CoverageCalculator>
+void getHitsForFragment(jellyfish::header_sequence_qual& frag,
+                        bwaidx_t *idx,
+                        smem_i *itr,
+                        const bwtintv_v *a,
+                        smem_aux_t* auxHits,
+                        mem_opt_t* memOptions,
+                        const SalmonOpts& salmonOpts,
+                        double coverageThresh,
+                        uint64_t& upperBoundHits,
+                        AlignmentGroup<SMEMAlignment>& hitList,
+                        uint64_t& hitListCount,
+                        std::vector<Transcript>& transcripts) {
+
+    uint64_t leftHitCount{0};
+
+    //std::unordered_map<uint64_t, TranscriptHitList> hits;
+    std::unordered_map<uint64_t, CoverageCalculator> hits;
+
+    uint32_t readLength{0};
+
+    //---------- get hits ----------------------//
+    {
+        std::string readStr   = frag.seq;
+        uint32_t readLen      = frag.seq.size();
+
+        readLength = readLen;
+
+        for (int p = 0; p < readLen; ++p) {
+            readStr[p] = nst_nt4_table[static_cast<int>(readStr[p])];
+        }
+
+        char* readPtr = const_cast<char*>(readStr.c_str());
+
+        collectHitsForRead(idx, a, auxHits,
+                            memOptions,
+                            salmonOpts,
+                            reinterpret_cast<const uint8_t*>(readStr.c_str()),
+                            readLen,
+                            hits);
+
+    }
+
+    upperBoundHits += (hits.size() > 0) ? 1 : 0;
+
+    int32_t lastTranscriptId = std::numeric_limits<int32_t>::min();
+    bool sortedByTranscript{true};
+    double fOpt{0.9};
+    double bestScore = -std::numeric_limits<double>::max();
+
+    size_t readHits{0};
+    auto& alnList = hitList.alignments();
+    hitList.isUniquelyMapped() = true;
+    alnList.clear();
+
+    uint32_t firstTranscriptID = std::numeric_limits<uint32_t>::max();
+    double cutoff{ coverageThresh };//* readLength};
+    for (auto& tHitList : hits) {
+        // Coverage score
+        Transcript& t = transcripts[tHitList.first];
+        tHitList.second.computeBestChain(t, frag.seq);
+        double score = tHitList.second.bestHitScore;
+        // DEBUG -- process ALL HITS
+        //if (true) {
+        if (score >= fOpt * bestScore and tHitList.second.bestHitScore >= cutoff) {
+
+            bool isForward = tHitList.second.isForward();
+            if (score < fOpt * bestScore) { continue; }
+
+        	if (score > bestScore) { bestScore = score; }
+
+            auto hitPos = tHitList.second.bestHitPos;
+            auto fmt = salmon::utils::hitType(hitPos, isForward);
+
+            if (leftHitCount == 0) {
+                firstTranscriptID = tHitList.first;
+            } else if (hitList.isUniquelyMapped() and tHitList.first != firstTranscriptID) {
+                hitList.isUniquelyMapped() = false;
+            }
+
+            auto transcriptID = tHitList.first;
+
+            if (transcriptID  < lastTranscriptId) {
+                sortedByTranscript = false;
+            }
+
+            alnList.emplace_back(transcriptID, fmt, score, hitPos);
+            readHits += score;
+            ++hitListCount;
+            ++leftHitCount;
+        }
+    }
+    if (alnList.size() > 0) {
+        auto newEnd = std::stable_partition(alnList.begin(), alnList.end(),
+                [bestScore, fOpt](SMEMAlignment& aln) -> bool {
+                return aln.score() >= fOpt * bestScore;
+                });
+        alnList.resize(std::distance(alnList.begin(), newEnd));
+        if (!sortedByTranscript) {
+            std::sort(alnList.begin(), alnList.end(),
+                    [](const SMEMAlignment& x, const SMEMAlignment& y) -> bool {
+                     return x.transcriptID() < y.transcriptID();
+                    });
+        }
+    }
+
+
+
+}
+
+// To use the parser in the following, we get "jobs" until none is
+// available. A job behaves like a pointer to the type
+// jellyfish::sequence_list (see whole_sequence_parser.hpp).
+template <typename ParserT, typename CoverageCalculator>
+void processReadsMEM(ParserT* parser,
+               ReadExperiment& readExp,
+               ReadLibrary& rl,
+               AlnGroupQueue& structureCache,
+               AlnGroupQueue& outputGroups,
+               std::atomic<uint64_t>& numObservedFragments,
+               std::atomic<uint64_t>& numAssignedFragments,
+               std::atomic<uint64_t>& validHits,
+               std::atomic<uint64_t>& upperBoundHits,
+               bwaidx_t *idx,
+               std::vector<Transcript>& transcripts,
+               ForgettingMassCalculator& fmCalc,
+               ClusterForest& clusterForest,
+               FragmentLengthDistribution& fragLengthDist,
+               mem_opt_t* memOptions,
+               const SalmonOpts& salmonOpts,
+               double coverageThresh,
+	           std::mutex& iomutex,
+               bool initialRound,
+               bool& burnedIn,
+               volatile bool& writeToCache) {
+  uint64_t count_fwd = 0, count_bwd = 0;
+  // Seed with a real random value, if available
+  std::random_device rd;
+
+  // Create a random uniform distribution
+  std::default_random_engine eng(rd());
+
+  std::vector<AlignmentGroup<SMEMAlignment>*> hitLists;
+  //std::vector<std::vector<Alignment>> hitLists;
+  uint64_t prevObservedFrags{1};
+  hitLists.resize(miniBatchSize);
+
+  uint64_t leftHitCount{0};
+  uint64_t hitListCount{0};
+
+  // Super-MEM iterator
+  smem_i *itr = smem_itr_init(idx->bwt);
+  const bwtintv_v *a = nullptr;
+  smem_aux_t* auxHits = smem_aux_init();
+
+  auto expectedLibType = rl.format();
+
+  uint64_t firstTimestepOfRound = fmCalc.getCurrentTimestep();
+
+  size_t locRead{0};
+  uint64_t localUpperBoundHits{0};
+  while(true) {
+    typename ParserT::job j(*parser); // Get a job from the parser: a bunch of read (at most max_read_group)
+    if(j.is_empty()) break;           // If got nothing, quit
+
+    hitLists.resize(j->nb_filled);
+    //structureCache.try_dequeue_bulk(hitLists.begin() , j->nb_filled);
+
+    for(size_t i = 0; i < j->nb_filled; ++i) { // For all the read in this batch
+        localUpperBoundHits = 0;
+        //hitLists[i].setRead(&j->data[i]);
+
+#if defined(__MOODYCAMEL__)
+        // Moody camel
+        while (!structureCache.try_dequeue(hitLists[i])) {}
+#else
+        // TBB
+        while (!structureCache.try_pop(hitLists[i])) {}
+#endif
+        auto& hitList = *(hitLists[i]);
+
+        getHitsForFragment<CoverageCalculator>(j->data[i], idx, itr, a,
+                                               auxHits,
+                                               memOptions,
+                                               salmonOpts,
+                                               coverageThresh,
+                                               localUpperBoundHits,
+                                               hitList, hitListCount,
+                                               transcripts);
+        if (initialRound) {
+            upperBoundHits += localUpperBoundHits;
+        }
+
+        // If the read mapped to > maxReadOccs places, discard it
+        if (hitList.size() > salmonOpts.maxReadOccs ) { hitList.alignments().clear(); }
+        validHits += hitList.size();
+        locRead++;
+        ++numObservedFragments;
+        if (numObservedFragments % 50000 == 0) {
+    	    iomutex.lock();
+            const char RESET_COLOR[] = "\x1b[0m";
+            char green[] = "\x1b[30m";
+            green[3] = '0' + static_cast<char>(fmt::GREEN);
+            char red[] = "\x1b[30m";
+            red[3] = '0' + static_cast<char>(fmt::RED);
+            if (initialRound) {
+                fmt::print(stderr, "\033[A\r\r{}processed{} {} {}fragments{}\n", green, red, numObservedFragments, green, RESET_COLOR);
+                fmt::print(stderr, "hits per frag:  {}; hit upper bound: {}",
+                           validHits / static_cast<float>(prevObservedFrags),
+                           upperBoundHits.load());
+            } else {
+                fmt::print(stderr, "\r\r{}processed{} {} {}fragments{}", green, red, numObservedFragments, green, RESET_COLOR);
+            }
+    	    iomutex.unlock();
+        }
+
+
+    } // end for i < j->nb_filled
+
+
+    // NOT DOUBLY-COLLAPSED
+    // double logForgettingMass = fmCalc();
+
+    // BEGIN: DOUBLY-COLLAPSED TESTING
+   // double logForgettingMass{0.0};
+   // uint64_t currentMinibatchTimestep{0};
+   // fmCalc.getLogMassAndTimestep(logForgettingMass, currentMinibatchTimestep);
+    // END: DOUBLE-COLLAPSED TESTING
+
+    prevObservedFrags = numObservedFragments;
+
+   processMiniBatch(readExp, fmCalc,firstTimestepOfRound, rl, salmonOpts, hitLists, transcripts, clusterForest,
+                     fragLengthDist, numAssignedFragments, eng, initialRound, burnedIn);
+    if (writeToCache) {
+
+#if defined(__MOODYCAMEL__)
+        // Moody camel
+        if (!outputGroups.enqueue_bulk(hitLists.begin(), hitLists.size())) {
+            salmonOpts.jointLog->critical("Could not enqueue items in "
+                                          "outputGroups queue\n");
+            std::exit(1);
+        }
+#else
+        // TBB
+        for (auto hl : hitLists) { outputGroups.push(hl); }
+#endif
+    } else {
+
+#if defined(__MOODYCAMEL__)
+        // Moody camel
+        if (!structureCache.enqueue_bulk(hitLists.begin(), hitLists.size())) {
+            salmonOpts.jointLog->critical("Could not enqueue items in "
+                                          "structureCache queue\n");
+            std::exit(1);
+        }
+#else
+        // TBB
+        for (auto hl : hitLists) { structureCache.push(hl); }
+#endif
+    }
+    // At this point, the parser can re-claim the strings
+  }
+  smem_aux_destroy(auxHits);
+  smem_itr_destroy(itr);
+}
+
+// To use the parser in the following, we get "jobs" until none is
+// available. A job behaves like a pointer to the type
+// jellyfish::sequence_list (see whole_sequence_parser.hpp).
+void processCachedAlignmentsHelper(
+        ReadExperiment& readExp,
+        ReadLibrary& rl,
+        AlnGroupQueue& structureCache,
+        AlnGroupQueue& alignmentCache,
+        std::atomic<uint64_t>& numObservedFragments,
+        std::atomic<uint64_t>& numAssignedFragments,
+        std::atomic<uint64_t>& validHits,
+        std::vector<Transcript>& transcripts,
+        ForgettingMassCalculator& fmCalc,
+        ClusterForest& clusterForest,
+        FragmentLengthDistribution& fragLengthDist,
+        const SalmonOpts& salmonOpts,
+        std::mutex& iomutex,
+        bool initialRound,
+        volatile bool& cacheExhausted,
+        bool& burnedIn) {
+
+    // Seed with a real random value, if available
+    std::random_device rd;
+
+    // Create a random uniform distribution
+    std::default_random_engine eng(rd());
+
+    std::vector<AlignmentGroup<SMEMAlignment>*> hitLists;
+
+    uint64_t prevObservedFrags{1};
+    auto expectedLibType = rl.format();
+
+    uint32_t batchCount{miniBatchSize};
+    uint64_t locRead{0};
+    uint64_t locValidHits{0};
+    uint32_t numConsumed{0};
+#if defined(__MOODYCAMEL__)
+    uint32_t obtained{0};
+#else
+    bool obtained{false};
+#endif
+
+    uint64_t firstTimestepOfRound = fmCalc.getCurrentTimestep();
+    hitLists.resize(batchCount);
+    auto it = hitLists.begin();
+
+    while(!cacheExhausted or
+#if defined(__MOODYCAMEL__)
+    // Moody camel
+          (obtained = alignmentCache.try_dequeue_bulk(it, batchCount - numConsumed)) > 0) {
+      numConsumed += obtained;
+#else
+    // TBB
+            (obtained = alignmentCache.try_pop(*it))) {
+        numConsumed += obtained ? 1 : 0;
+#endif
+
+        /** Get alignment groups from the queue while they still exist
+         * once the cacheExhausted variable is true, there will be
+         * no more alignment groups written in this round.  If cacheExhausted
+         * is true and the alignment cache is empty, then there are no
+         * more alignments to process (am I certain about this in concurrent
+         * crazy multi-threaded land?).
+         */
+        while (numConsumed < batchCount) {
+#if defined(__MOODYCAMEL__)
+            // Moody camel
+            it += obtained;
+            obtained = alignmentCache.try_dequeue_bulk(it, batchCount - numConsumed);
+            numConsumed += obtained;
+            if (cacheExhausted and obtained == 0) {
+#else
+            // TBB
+            it += obtained ? 1 : 0;
+            obtained = alignmentCache.try_pop(*it);
+            numConsumed += obtained ? 1 : 0;
+            if (cacheExhausted and !obtained) {
+#endif
+                break;
+            }
+        }
+        // At this point, we either have the requested # of alignemnts, or
+        // have exhausted the alignment queue.
+
+        hitLists.resize(numConsumed);
+        for (auto hitList : hitLists) {
+            locValidHits += hitList->size();
+        }
+        validHits += locValidHits;
+        locRead += numConsumed;
+
+        uint32_t updateRate = 500000;
+        uint64_t prevMod = numObservedFragments % updateRate;
+        numObservedFragments += numConsumed;
+        uint64_t newMod = numObservedFragments % updateRate;
+        if (newMod < prevMod) {
+            iomutex.lock();
+            const char RESET_COLOR[] = "\x1b[0m";
+            char green[] = "\x1b[30m";
+            green[3] = '0' + static_cast<char>(fmt::GREEN);
+            char red[] = "\x1b[30m";
+            red[3] = '0' + static_cast<char>(fmt::RED);
+            if (initialRound) {
+                fmt::print(stderr, "\033[A\r\r{}processed{} {} {}fragments{}\n", green, red, numObservedFragments, green, RESET_COLOR);
+                fmt::print(stderr, "hits per frag:  {}", validHits / static_cast<float>(prevObservedFrags));
+            } else {
+                fmt::print(stderr, "\r\r{}processed{} {} {}fragments{}", green, red, numObservedFragments, green, RESET_COLOR);
+            }
+            salmonOpts.fileLog->info("processed {} fragments\n", numObservedFragments);
+            iomutex.unlock();
+        }
+
+        // NOT DOUBLY-COLLAPSED
+        // double logForgettingMass = fmCalc();
+
+        processMiniBatch(readExp, fmCalc,firstTimestepOfRound, rl, salmonOpts, hitLists, transcripts, clusterForest,
+                fragLengthDist, numAssignedFragments, eng, initialRound, burnedIn);
+#if defined(__MOODYCAMEL__)
+        if (!structureCache.enqueue_bulk(std::make_move_iterator(hitLists.begin()), hitLists.size())) {
+            salmonOpts.jointLog->error("Could not enqueue structures in "
+                                       "structureCache; exiting\n\n");
+            std::exit(1);
+        }
+#else
+        // TBB
+        for (auto& hl : hitLists) { structureCache.push(hl); }
+#endif
+        numConsumed = 0;
+        obtained = 0;
+        hitLists.clear();
+        hitLists.resize(batchCount);
+        it = hitLists.begin();
+        // At this point, the parser can re-claim the strings
+    }
+
+}
+
+
+
+
+int performBiasCorrection(boost::filesystem::path featPath,
+                          boost::filesystem::path expPath,
+                          double estimatedReadLength,
+                          double kmersPerRead,
+                          uint64_t mappedKmers,
+                          uint32_t merLen,
+                          boost::filesystem::path outPath,
+                          size_t numThreads);
+
+void processCachedAlignments(
+        ReadExperiment& readExp,
+        ReadLibrary& rl,
+        AlnGroupQueue& structureCache,
+        AlnGroupQueue& alignmentCache,
+        std::atomic<uint64_t>& numObservedFragments,
+        std::atomic<uint64_t>& numAssignedFragments,
+        std::vector<Transcript>& transcripts,
+        ForgettingMassCalculator& fmCalc,
+        ClusterForest& clusterForest,
+        FragmentLengthDistribution& fragLengthDist,
+        const SalmonOpts& salmonOpts,
+        std::mutex& ioMutex,
+        bool initialRound,
+        volatile bool& cacheExhausted,
+        bool& burnedIn,
+        size_t numQuantThreads) {
+
+        std::atomic<uint64_t> numValidHits{0};
+        std::vector<std::thread> quantThreads;
+        for (size_t i = 0; i < numQuantThreads; ++i) {
+                quantThreads.emplace_back(processCachedAlignmentsHelper,
+                        std::ref(readExp),
+                        std::ref(rl),
+                        std::ref(structureCache),
+                        std::ref(alignmentCache),
+                        std::ref(numObservedFragments),
+                        std::ref(numAssignedFragments),
+                        std::ref(numValidHits),
+                        std::ref(transcripts),
+                        std::ref(fmCalc),
+                        std::ref(clusterForest),
+                        std::ref(fragLengthDist),
+                        std::ref(salmonOpts),
+                        std::ref(ioMutex),
+                        initialRound,
+                        std::ref(cacheExhausted),
+                        std::ref(burnedIn));
+
+        }
+        for (auto& t : quantThreads) { t.join(); }
+}
+
+void processReadLibrary(
+        ReadExperiment& readExp,
+        ReadLibrary& rl,
+        bwaidx_t* idx,
+        std::vector<Transcript>& transcripts,
+        ClusterForest& clusterForest,
+        std::atomic<uint64_t>& numObservedFragments, // total number of reads we've looked at
+        std::atomic<uint64_t>& numAssignedFragments, // total number of assigned reads
+        std::atomic<uint64_t>& upperBoundHits, // upper bound on # of mapped frags
+        bool initialRound,
+        bool& burnedIn,
+        ForgettingMassCalculator& fmCalc,
+        FragmentLengthDistribution& fragLengthDist,
+        mem_opt_t* memOptions,
+        const SalmonOpts& salmonOpts,
+        double coverageThresh,
+        bool greedyChain,
+        std::mutex& iomutex,
+        size_t numThreads,
+        AlnGroupQueue& structureCache,
+        AlnGroupQueue& outputGroups,
+        volatile bool& writeToCache){
+
+            std::vector<std::thread> threads;
+
+            std::atomic<uint64_t> numValidHits{0};
+            rl.checkValid();
+
+            std::unique_ptr<paired_parser> pairedParserPtr{nullptr};
+            std::unique_ptr<single_parser> singleParserPtr{nullptr};
+            // If the read library is paired-end
+            // ------ Paired-end --------
+            if (rl.format().type == ReadType::PAIRED_END) {
+
+                char* readFiles[] = { const_cast<char*>(rl.mates1().front().c_str()),
+                    const_cast<char*>(rl.mates2().front().c_str()) };
+
+                size_t maxReadGroup{miniBatchSize}; // Number of reads in each "job"
+                size_t concurrentFile{2}; // Number of files to read simultaneously
+                pairedParserPtr.reset(new
+                        paired_parser(4 * numThreads, maxReadGroup,
+                                      concurrentFile, readFiles, readFiles + 2));
+
+
+                for(int i = 0; i < numThreads; ++i)  {
+                    if (greedyChain) {
+                        auto threadFun = [&]() -> void {
+                                    processReadsMEM<paired_parser, TranscriptHitList>(
+                                    pairedParserPtr.get(),
+                                    readExp,
+                                    rl,
+                                    structureCache,
+                                    outputGroups,
+                                    numObservedFragments,
+                                    numAssignedFragments,
+                                    numValidHits,
+                                    upperBoundHits,
+                                    idx,
+                                    transcripts,
+                                    fmCalc,
+                                    clusterForest,
+                                    fragLengthDist,
+                                    memOptions,
+                                    salmonOpts,
+                                    coverageThresh,
+                                    iomutex,
+                                    initialRound,
+                                    burnedIn,
+                                    writeToCache);
+                        };
+                        threads.emplace_back(threadFun);
+                    } else {
+                        /*
+                        auto threadFun = [&]() -> void {
+                                    processReadsMEM<paired_parser, FragmentList>(
+                                    &parser,
+                                    rl,
+                                    numObservedFragments,
+                                    numAssignedFragments,
+                                    idx,
+                                    transcripts,
+                                    batchNum,
+                                    logForgettingMass,
+                                    ffMutex,
+                                    clusterForest,
+                                    fragLengthDist,
+                                    memOptions,
+                                    salmonOpts,
+                                    coverageThresh,
+                                    iomutex,
+                                    initialRound,
+                                    burnedIn);
+                        };
+                        threads.emplace_back(threadFun);
+                        */
+                    }
+                }
+
+                for(int i = 0; i < numThreads; ++i)
+                    threads[i].join();
+
+            } // ------ Single-end --------
+            else if (rl.format().type == ReadType::SINGLE_END) {
+
+                char* readFiles[] = { const_cast<char*>(rl.unmated().front().c_str()) };
+                size_t maxReadGroup{miniBatchSize}; // Number of files to read simultaneously
+                size_t concurrentFile{1}; // Number of reads in each "job"
+                stream_manager streams( rl.unmated().begin(),
+                        rl.unmated().end(), concurrentFile);
+
+                singleParserPtr.reset(new single_parser(4 * numThreads,
+                                      maxReadGroup,
+                                      concurrentFile,
+                                      streams));
+
+                for(int i = 0; i < numThreads; ++i)  {
+                    if (greedyChain) {
+                        auto threadFun = [&]() -> void {
+                                    processReadsMEM<single_parser, TranscriptHitList>(
+                                    singleParserPtr.get(),
+                                    readExp,
+                                    rl,
+                                    structureCache,
+                                    outputGroups,
+                                    numObservedFragments,
+                                    numAssignedFragments,
+                                    numValidHits,
+                                    upperBoundHits,
+                                    idx,
+                                    transcripts,
+                                    fmCalc,
+                                    clusterForest,
+                                    fragLengthDist,
+                                    memOptions,
+                                    salmonOpts,
+                                    coverageThresh,
+                                    iomutex,
+                                    initialRound,
+                                    burnedIn,
+                                    writeToCache);
+                        };
+                        threads.emplace_back(threadFun);
+                    } else {
+                        /*
+                        auto threadFun = [&]() -> void {
+                                    processReadsMEM<single_parser, FragmentList>( &parser,
+                                    rl,
+                                    numObservedFragments,
+                                    numAssignedFragments,
+                                    idx,
+                                    transcripts,
+                                    batchNum,
+                                    logForgettingMass,
+                                    ffMutex,
+                                    clusterForest,
+                                    fragLengthDist,
+                                    coverageThresh,
+                                    iomutex,
+                                    initialRound,
+                                    burnedIn);
+                        };
+                        threads.emplace_back(threadFun);
+                        */
+                    }
+                }
+                for(int i = 0; i < numThreads; ++i)
+                    threads[i].join();
+            } // ------ END Single-end --------
+}
+
+bool writeAlignmentCacheToFile(
+        AlnGroupQueue& outputGroups,
+        AlnGroupQueue& structureCache,
+        uint64_t& numWritten,
+        std::atomic<uint64_t>& numObservedFragments,
+        uint64_t numRequiredFragments,
+        SalmonOpts& salmonOpts,
+        volatile bool& writeToCache,
+        cereal::BinaryOutputArchive& outputStream ) {
+
+        size_t blockSize{miniBatchSize};
+        size_t numDequed{0};
+        bool cacheUniqueReads = !salmonOpts.useMassBanking;
+#if defined(__MOODYCAMEL__)
+        // Moody camel
+        AlignmentGroup<SMEMAlignment>* alnGroups[blockSize];
+#else
+        // TBB
+        AlignmentGroup<SMEMAlignment>* alnGroups[1];
+#endif
+
+        while (writeToCache) {
+#if defined(__MOODYCAMEL__)
+            // MOODY CAMEL QUEUE
+            while ( (numDequed = outputGroups.try_dequeue_bulk(alnGroups, blockSize)) > 0) {
+                for (size_t i = 0; i < numDequed; ++i) {
+                    // only write ambigously mapped fragments to the cache
+                    // for processing in subsequent rounds
+                    if (cacheUniqueReads or !alnGroups[i]->isUniquelyMapped()) {
+                        outputStream((*alnGroups[i]));
+                        ++numWritten;
+                    }
+                }
+
+                structureCache.enqueue_bulk(alnGroups, numDequed);
+
+                // If, at any point, we've seen the required number of
+                // fragments, then we don't need the cache any longer.
+                if (numObservedFragments > numRequiredFragments) {
+                    writeToCache = false;
+                }
+            }
+#else
+            // TBB QUEUE
+            while (outputGroups.try_pop(alnGroups[0])) {
+                if (cacheUniqueReads or !alnGroups[0]->isUniquelyMapped()) {
+                    outputStream(*alnGroups[0]);
+                    ++numWritten;
+                }
+
+                structureCache.push(alnGroups[0]);
+
+                // If, at any point, we've seen the required number of
+                // fragments, then we don't need the cache any longer.
+                if (numObservedFragments > numRequiredFragments) {
+                    writeToCache = false;
+                }
+            }
+#endif
+        }
+
+#if defined(__MOODYCAMEL__)
+        // Moody camel
+        while (outputGroups.try_dequeue(alnGroups[0])) {
+            if (cacheUniqueReads or !alnGroups[0]->isUniquelyMapped()) {
+                outputStream((*alnGroups[0]));
+                ++numWritten;
+            }
+            structureCache.enqueue(alnGroups[0]);
+        }
+#else
+        // TBB
+        while (outputGroups.try_pop(alnGroups[0])) {
+            outputStream((*alnGroups[0]));
+            ++numWritten;
+            structureCache.push(alnGroups[0]);
+        }
+#endif
+        return true;
+}
+
+bool readAlignmentCache(
+        AlnGroupQueue& alnGroupQueue,
+        AlnGroupQueue& structureCache,
+        uint64_t numWritten,
+        volatile bool& finishedParsing,
+        boost::filesystem::path& cacheFilePath) {
+
+        std::ifstream alnCacheFile(cacheFilePath.c_str(), std::ios::binary);
+        cereal::BinaryInputArchive alnCacheArchive(alnCacheFile);
+
+        uint64_t numRead{0};
+        AlignmentGroup<SMEMAlignment>* alnGroup;
+        while (numRead < numWritten) {
+#if defined(__MOODYCAMEL__)
+            // Moody camel
+            while (!structureCache.try_dequeue(alnGroup)) {}
+#else
+            // TBB
+            while (!structureCache.try_pop(alnGroup)) {}
+#endif
+
+            alnCacheArchive((*alnGroup));
+#if defined(__MOODYCAMEL__)
+            // Moody camel
+            alnGroupQueue.enqueue(alnGroup);
+#else
+            // TBB
+            alnGroupQueue.push(alnGroup);
+#endif
+            ++numRead;
+        }
+        finishedParsing = true;
+
+        alnCacheFile.close();
+        return true;
+}
+
+struct CacheFile {
+    CacheFile(boost::filesystem::path& pathIn, uint64_t numWrittenIn) :
+        filePath(pathIn), numWritten(numWrittenIn), inMemory(false){}
+
+    bool populateCache(volatile bool& finishedParsing,
+                       uint32_t buffQueueSize) {
+
+        if (inMemory) {
+            // If the queue already exists, then just
+            // swap the processed and unprocessed structs
+            // and return
+             if (toProcess) {
+                std::swap(toProcess, processed);
+                finishedParsing = true;
+                return true;
+             } else {
+                // otherwise, create the queues and fill them as we
+                // normally would (i.e. if we weren't holding every thing
+                // in memory).
+#if defined(__MOODYCAMEL__)
+                 // Moody camel
+                toProcess.reset(new AlnGroupQueue(numWritten));
+                initCache.reset(new AlnGroupQueue(numWritten));
+#else
+                 // TBB
+                toProcess.reset(new AlnGroupQueue);
+                initCache.reset(new AlnGroupQueue);
+#endif
+                for (size_t i = 0; i < numWritten; ++i) {
+#if defined(__MOODYCAMEL__)
+                    // Moody camel
+                    initCache->enqueue( new AlignmentGroup<SMEMAlignment>() );
+#else
+                    // TBB
+                    initCache->push( new AlignmentGroup<SMEMAlignment>() );
+#endif
+                }
+                processed.reset(new AlnGroupQueue);
+             }
+        } else {
+            // If we won't be keeping everything in memory
+            // determine whether or not we need to create "working space"
+            // queues (this is only necessary the first time).
+            if (!toProcess) {
+#if defined(__MOODYCAMEL__)
+                // Moody camel
+                toProcess.reset(new AlnGroupQueue(buffQueueSize));
+                processed.reset(new AlnGroupQueue(buffQueueSize));
+#else
+                // TBB
+                toProcess.reset(new AlnGroupQueue);
+                processed.reset(new AlnGroupQueue);
+#endif
+                for (size_t i = 0; i < buffQueueSize; ++i) {
+#if defined(__MOODYCAMEL__)
+                    // Moody camel
+                    processed->enqueue( new AlignmentGroup<SMEMAlignment>() );
+#else
+                    // TBB
+                    processed->push( new AlignmentGroup<SMEMAlignment>() );
+#endif
+                }
+            }
+        }
+
+        // At this point, the queues exist, and we're either reading the
+        // information from file and using the queue as a buffer, or we're
+        // making our first "in memory" pass and we have to fill the buffers
+        // anyway
+        if (inMemory) {
+            cacheReaderThread_.reset(new std::thread (readAlignmentCache,
+                        std::ref(*toProcess),
+                        std::ref(*initCache),
+                        numWritten,
+                        std::ref(finishedParsing),
+                        std::ref(filePath)));
+        } else {
+            cacheReaderThread_.reset(new std::thread (readAlignmentCache,
+                        std::ref(*toProcess),
+                        std::ref(*processed),
+                        numWritten,
+                        std::ref(finishedParsing),
+                        std::ref(filePath)));
+        }
+        return true;
+    }
+
+    bool flushCache() {
+        if (cacheReaderThread_) {
+            cacheReaderThread_->join();
+            cacheReaderThread_.reset(nullptr);
+        }
+        return true;
+    }
+
+    void clearQueues() {
+        AlignmentGroup<SMEMAlignment>* ag;
+
+        if (toProcess) {
+#if defined(__MOODYCAMEL__)
+            // Moody camel
+            while (toProcess->try_dequeue(ag)) { delete ag; }
+#else
+            // TBB
+            while (toProcess->try_pop(ag)) { delete ag; }
+#endif
+        }
+        if (processed) {
+#if defined(__MOODYCAMEL__)
+            // Moody camel
+            while (processed->try_dequeue(ag)) { delete ag; }
+#else
+            // TBB
+            while (processed->try_pop(ag)) { delete ag; }
+#endif
+        }
+        if (initCache) {
+#if defined(__MOODYCAMEL__)
+            // Moody camel
+            while (initCache->try_dequeue(ag)) { delete ag; }
+#else
+            // TBB
+            while (initCache->try_pop(ag)) { delete ag; }
+#endif
+        }
+    }
+
+
+    boost::filesystem::path filePath;
+    uint64_t numWritten{0};
+    bool inMemory;
+
+    std::unique_ptr<AlnGroupQueue> toProcess{nullptr};
+    std::unique_ptr<AlnGroupQueue> processed{nullptr};
+    std::unique_ptr<AlnGroupQueue> initCache{nullptr};
+
+    // If the file is small enough, we'll make the mapping cache reside "in memory"
+    // that's what this guy is for.
+    // std::vector<char> inMemoryMappingCache;
+
+    private:
+        // The thread that will read the mapping cache
+        std::unique_ptr<std::thread> cacheReaderThread_{nullptr};
+};
+
+/**
+  *  Quantify the targets given in the file `transcriptFile` using the
+  *  reads in the given set of `readLibraries`, and write the results
+  *  to the file `outputFile`.  The reads are assumed to be in the format
+  *  specified by `libFmt`.
+  *
+  */
+void quantifyLibrary(
+        ReadExperiment& experiment,
+        bool greedyChain,
+        mem_opt_t* memOptions,
+        SalmonOpts& salmonOpts,
+        double coverageThresh,
+        size_t numRequiredFragments,
+        uint32_t numQuantThreads) {
+
+    bool burnedIn{false};
+    std::atomic<uint64_t> upperBoundHits{0};
+    //ErrorModel errMod(1.00);
+    auto& refs = experiment.transcripts();
+    size_t numTranscripts = refs.size();
+    // The *total* number of fragments observed so far (over all passes through the data).
+    std::atomic<uint64_t> numObservedFragments{0};
+    uint64_t prevNumObservedFragments{0};
+    // The *total* number of fragments assigned so far (over all passes through the data).
+    std::atomic<uint64_t> totalAssignedFragments{0};
+    uint64_t prevNumAssignedFragments{0};
+
+    auto jointLog = spdlog::get("jointLog");
+
+    ForgettingMassCalculator fmCalc(salmonOpts.forgettingFactor);
+    size_t prefillSize = 1000000000 / miniBatchSize;
+    fmCalc.prefill(prefillSize);
+
+    bool initialRound{true};
+    uint32_t roundNum{0};
+
+    std::mutex ffMutex;
+    std::mutex ioMutex;
+
+    size_t numPrevObservedFragments = 0;
+    std::vector<CacheFile> cacheFiles;
+
+    size_t maxReadGroup{miniBatchSize};
+    uint32_t structCacheSize = numQuantThreads * maxReadGroup * 10;
+
+    // EQCLASS
+    bool terminate{false};
+
+    while (numObservedFragments < numRequiredFragments and !terminate) {
+        prevNumObservedFragments = numObservedFragments;
+        if (!initialRound) {
+            bool didReset = (salmonOpts.disableMappingCache) ?
+                            (experiment.reset()) :
+                            (experiment.softReset());
+
+            if (!didReset) {
+                std::string errmsg = fmt::sprintf(
+                  "\n\n======== WARNING ========\n"
+                  "One of the provided read files: [{}] "
+                  "is not a regular file and therefore can't be read from "
+                  "more than once.\n\n"
+                  "We observed only {} mapping fragments when we wanted at least {}.\n\n"
+                  "Please consider re-running Salmon with these reads "
+                  "as a regular file!\n"
+                  "NOTE: If you received this warning from salmon but did not "
+                  "disable the mapping cache (--disableMappingCache), then there \n"
+                  "was some other problem. Please make sure, e.g., that you have not "
+                  "run out of disk space.\n"
+                  "==========================\n\n",
+                  experiment.readFilesAsString(), numObservedFragments, numRequiredFragments);
+                jointLog->warn() << errmsg;
+                break;
+            }
+
+            if (numObservedFragments - numPrevObservedFragments <= salmonOpts.mappingCacheMemoryLimit
+                and roundNum < 2) {
+                for (auto& cf : cacheFiles) { cf.inMemory = true; }
+            }
+            numPrevObservedFragments = numObservedFragments;
+        }
+
+        if (initialRound or salmonOpts.disableMappingCache) {
+#if defined(__MOODYCAMEL__)
+            // Moody camel
+            AlnGroupQueue outputGroups(structCacheSize);
+            AlnGroupQueue groupCache(structCacheSize);
+#else
+            // TBB
+            AlnGroupQueue outputGroups;
+            AlnGroupQueue groupCache;
+#endif
+
+            for (size_t i = 0; i < structCacheSize; ++i) {
+ #if defined(__MOODYCAMEL__)
+                // Moody camel
+                groupCache.enqueue( new AlignmentGroup<SMEMAlignment>() );
+#else
+                // TBB
+                groupCache.push( new AlignmentGroup<SMEMAlignment>() );
+#endif
+            }
+
+            volatile bool writeToCache = !salmonOpts.disableMappingCache;
+            auto processReadLibraryCallback =  [&](
+                    ReadLibrary& rl, bwaidx_t* idx,
+                    std::vector<Transcript>& transcripts, ClusterForest& clusterForest,
+                    FragmentLengthDistribution& fragLengthDist,
+                    std::atomic<uint64_t>& numAssignedFragments,
+                    size_t numQuantThreads, bool& burnedIn) -> void  {
+
+                // The file where the alignment cache was / will be written
+                fmt::MemoryWriter fname;
+                fname << "alnCache_" << cacheFiles.size() << ".bin";
+                boost::filesystem::path alnCacheFilename = salmonOpts.outputDirectory / fname.str();
+                cacheFiles.emplace_back(alnCacheFilename, uint64_t(0));
+
+                std::unique_ptr<std::ofstream> alnCacheFile{nullptr};
+                std::unique_ptr<std::thread> cacheWriterThread{nullptr};
+                if (writeToCache) {
+                    alnCacheFile.reset(new std::ofstream(alnCacheFilename.c_str(), std::ios::binary));
+                    cereal::BinaryOutputArchive alnCacheArchive(*alnCacheFile);
+                    cacheWriterThread.reset(new std::thread(writeAlignmentCacheToFile,
+                        std::ref(outputGroups),
+                        std::ref(groupCache),
+                        std::ref(cacheFiles.back().numWritten),
+                        std::ref(numObservedFragments),
+                        numRequiredFragments,
+                        std::ref(salmonOpts),
+                        std::ref(writeToCache),
+                        std::ref(alnCacheArchive)));
+                }
+
+                processReadLibrary(experiment, rl, idx, transcripts, clusterForest,
+                        numObservedFragments, totalAssignedFragments, upperBoundHits,
+                        initialRound, burnedIn, fmCalc, fragLengthDist,
+                        memOptions, salmonOpts, coverageThresh, greedyChain,
+                        ioMutex, numQuantThreads,
+                        groupCache, outputGroups, writeToCache);
+
+                numAssignedFragments = totalAssignedFragments - prevNumAssignedFragments;
+                prevNumAssignedFragments = totalAssignedFragments;
+
+                // join the thread the writes the file
+                writeToCache = false;
+                if (cacheWriterThread) { cacheWriterThread->join(); }
+                if (alnCacheFile) { alnCacheFile->close(); }
+            };
+
+            // Process all of the reads
+            experiment.processReads(numQuantThreads, processReadLibraryCallback);
+            experiment.setNumObservedFragments(numObservedFragments);
+
+            // Empty the structure cache here
+            AlignmentGroup<SMEMAlignment>* ag;
+#if defined(__MOODYCAMEL__)
+            // Moody camel
+            while (groupCache.try_dequeue(ag)) { delete ag; }
+#else
+            // TBB
+            while (groupCache.try_pop(ag)) { delete ag; }
+#endif
+            //EQCLASS
+            bool done = experiment.equivalenceClassBuilder().finish();
+            // skip the extra online rounds
+            terminate = true;
+            // END EQCLASS
+        } else {
+            uint32_t libNum{0};
+            auto processReadLibraryCallback =  [&](
+                    ReadLibrary& rl, bwaidx_t* idx,
+                    std::vector<Transcript>& transcripts, ClusterForest& clusterForest,
+                    FragmentLengthDistribution& fragLengthDist,
+                    std::atomic<uint64_t>& numAssignedFragments,
+                    size_t numQuantThreads, bool& burnedIn) -> void  {
+
+                volatile bool finishedParsing{false};
+
+                // The file where the alignment cache was / will be written
+                auto& cf = cacheFiles[libNum];
+                ++libNum;
+
+                cf.populateCache(finishedParsing, structCacheSize);
+
+                uint64_t priorTotalAssignedFragments = totalAssignedFragments;
+                uint64_t priorTotalObservedFragments = numObservedFragments;
+                processCachedAlignments(
+                        experiment,
+                        rl,
+                        //groupCache, alnGroupQueue,
+                        *(cf.processed.get()),
+                        *(cf.toProcess.get()),
+                        numObservedFragments, totalAssignedFragments,
+                        transcripts, fmCalc, clusterForest, fragLengthDist,
+                        salmonOpts, ioMutex, initialRound, finishedParsing,
+                        burnedIn, numQuantThreads);
+
+                cf.flushCache();
+
+               if (salmonOpts.useMassBanking) {
+                   // If we're using mass banking
+                   // Regardless of what we count, we see the same total number
+                   // of fragments we did in the first round
+                   totalAssignedFragments = priorTotalAssignedFragments + experiment.numAssignedFragsInFirstPass();
+                   numObservedFragments = priorTotalObservedFragments + experiment.numAssignedFragsInFirstPass();
+               }
+                // Before mass banking
+                numAssignedFragments = totalAssignedFragments - prevNumAssignedFragments;
+                prevNumAssignedFragments = totalAssignedFragments;
+            };
+
+            // Process all of the reads
+            experiment.processReads(numQuantThreads, processReadLibraryCallback);
+        }
+
+        initialRound = false;
+        ++roundNum;
+        fmt::print(stderr, "\n# observed = {} / # required = {}\n",
+                   numObservedFragments, numRequiredFragments);
+        fmt::print(stderr, "hard # assigned = {} / # observed (this round) = {} : "
+                           "upper bound assigned = {} \033[A\033[A",
+                   experiment.numAssignedFragments(),
+                   numObservedFragments - numPrevObservedFragments,
+                   upperBoundHits);
+        salmonOpts.fileLog->info("\nAt end of round {}\n"
+                                   "==================\n"
+                                   "Observed {} total fragments ({} in most recent round)\n",
+                                   roundNum - 1,
+                                   numObservedFragments,
+                                   numObservedFragments - numPrevObservedFragments);
+    }
+    fmt::print(stderr, "\n\n\n\n");
+
+    // delete any temporary alignment cache files
+    for (auto& cf : cacheFiles) {
+        if (boost::filesystem::exists(cf.filePath)) {
+            boost::filesystem::remove(cf.filePath);
+        }
+        // TODO: clear any queues allocated by
+        // the cache files.
+        cf.clearQueues();
+    }
+
+    if (numObservedFragments <= prevNumObservedFragments) {
+        jointLog->warn() << "Something seems to be wrong with the calculation "
+            "of the mapping rate.  The recorded ratio is likely wrong.  Please "
+            "file this as a bug report.\n";
+    } else {
+        double upperBoundMappingRate =
+            upperBoundHits.load() /
+            static_cast<double>(numObservedFragments.load());
+        experiment.setNumObservedFragments(numObservedFragments - prevNumObservedFragments);
+        experiment.setUpperBoundHits(upperBoundHits.load());
+        experiment.setEffetiveMappingRate(upperBoundMappingRate);
+    }
+
+        jointLog->info("Overall mapping rate = {}\%; "
+                   "Effective mapping rate = {}\%\n",
+                   experiment.mappingRate() * 100.0,
+                   experiment.effectiveMappingRate() * 100.0);
+    jointLog->info("finished quantifyLibrary()");
+}
+
+int performBiasCorrectionSalmon(
+        boost::filesystem::path featureFile,
+        boost::filesystem::path expressionFile,
+        boost::filesystem::path outputFile,
+        size_t numThreads);
+
+int salmonQuantify(int argc, char *argv[]) {
+    using std::cerr;
+    using std::vector;
+    using std::string;
+    namespace bfs = boost::filesystem;
+    namespace po = boost::program_options;
+
+    bool biasCorrect{false};
+    bool optChain{false};
+    size_t requiredObservations;
+
+    SalmonOpts sopt;
+    mem_opt_t* memOptions = mem_opt_init();
+    memOptions->split_factor = 1.5;
+
+    sopt.numThreads = std::thread::hardware_concurrency();
+
+    double coverageThresh;
+    vector<string> unmatedReadFiles;
+    vector<string> mate1ReadFiles;
+    vector<string> mate2ReadFiles;
+
+    po::options_description generic("\n"
+		    		    "basic options");
+    generic.add_options()
+    ("version,v", "print version string")
+    ("help,h", "produce help message")
+    ("index,i", po::value<string>()->required(), "Salmon index")
+    ("libType,l", po::value<std::string>()->required(), "Format string describing the library type")
+    ("unmatedReads,r", po::value<vector<string>>(&unmatedReadFiles)->multitoken(),
+     "List of files containing unmated reads of (e.g. single-end reads)")
+    ("mates1,1", po::value<vector<string>>(&mate1ReadFiles)->multitoken(),
+        "File containing the #1 mates")
+    ("mates2,2", po::value<vector<string>>(&mate2ReadFiles)->multitoken(),
+        "File containing the #2 mates")
+    ("threads,p", po::value<uint32_t>(&(sopt.numThreads))->default_value(sopt.numThreads), "The number of threads to use concurrently.")
+    ("incompatPrior", po::value<double>(&(sopt.incompatPrior))->default_value(1e-20), "This option "
+                        "sets the prior probability that an alignment that disagrees with the specified "
+                        "library type (--libType) results from the true fragment origin.  Setting this to 0 "
+                        "specifies that alignments that disagree with the library type should be \"impossible\", "
+                        "while setting it to 1 says that alignments that disagree with the library type are no "
+                        "less likely than those that do")
+    ("numRequiredObs,n", po::value(&requiredObservations)->default_value(50000000),
+                                        "[Deprecated]: The minimum number of observations (mapped reads) that must be observed before "
+                                        "the inference procedure will terminate.  If fewer mapped reads exist in the "
+                                        "input file, then it will be read through multiple times.")
+    ("minLen,k", po::value<int>(&(memOptions->min_seed_len))->default_value(19), "(S)MEMs smaller than this size won't be considered.")
+    ("extraSensitive", po::bool_switch(&(sopt.extraSeedPass))->default_value(false), "Setting this option enables an extra pass of \"seed\" search. "
+                                        "Enabling this option may improve sensitivity (the number of reads having sufficient coverage), but will "
+                                        "typically slow down quantification by ~40%.  Consider enabling this option if you find the mapping rate to "
+                                        "be significantly lower than expected.")
+    ("coverage,c", po::value<double>(&coverageThresh)->default_value(0.70), "required coverage of read by union of SMEMs to consider it a \"hit\".")
+    ("output,o", po::value<std::string>()->required(), "Output quantification file.")
+    ("biasCorrect", po::value(&biasCorrect)->zero_tokens(), "[Experimental: Output both bias-corrected and non-bias-corrected "
+                                                               "qunatification estimates.")
+    ("geneMap,g", po::value<string>(), "File containing a mapping of transcripts to genes.  If this file is provided "
+                                        "Salmon will output both quant.sf and quant.genes.sf files, where the latter "
+                                        "contains aggregated gene-level abundance estimates.  The transcript to gene mapping "
+                                        "should be provided as either a GTF file, or a in a simple tab-delimited format "
+                                        "where each line contains the name of a transcript and the gene to which it belongs "
+                                        "separated by a tab.  The extension of the file is used to determine how the file "
+                                        "should be parsed.  Files ending in \'.gtf\' or \'.gff\' are assumed to be in GTF "
+                                        "format; files with any other extension are assumed to be in the simple format.");
+    //("optChain", po::bool_switch(&optChain)->default_value(false), "Chain MEMs optimally rather than greedily")
+
+    // no sequence bias for now
+    sopt.noSeqBiasModel = true;
+    sopt.noRichEqClasses = false;
+    // mapping cache has been deprecated
+    sopt.disableMappingCache = true;
+
+    po::options_description advanced("\n"
+		    		     "advanced options");
+    advanced.add_options()
+    /*
+    ("disableMappingCache", po::bool_switch(&(sopt.disableMappingCache))->default_value(false), "Setting this option disables the creation and use "
+                                        "of the \"mapping cache\" file.  The mapping cache can speed up quantification significantly for smaller read "
+                                        "libraries (i.e. where the number of mapped fragments is less than the required number of observations). However, "
+                                        "for very large read libraries, the mapping cache is unnecessary, and disabling it may allow salmon to more effectively "
+                                        "make use of a very large number of threads.")
+    */
+    ("fldMax" , po::value<size_t>(&(sopt.fragLenDistMax))->default_value(800), "The maximum fragment length to consider when building the empirical "
+     											      "distribution")
+    ("fldMean", po::value<size_t>(&(sopt.fragLenDistPriorMean))->default_value(200), "The mean used in the fragment length distribution prior")
+    ("fldSD" , po::value<size_t>(&(sopt.fragLenDistPriorSD))->default_value(80), "The standard deviation used in the fragment length distribution prior")
+    ("forgettingFactor,f", po::value<double>(&(sopt.forgettingFactor))->default_value(0.65), "The forgetting factor used "
+                        "in the online learning schedule.  A smaller value results in quicker learning, but higher variance "
+                        "and may be unstable.  A larger value results in slower learning but may be more stable.  Value should "
+                        "be in the interval (0.5, 1.0].")
+    ("mappingCacheMemoryLimit", po::value<uint32_t>(&(sopt.mappingCacheMemoryLimit))->default_value(5000000), "If the file contained fewer than this "
+                                        "many reads, then just keep the data in memory for subsequent rounds of inference. Obviously, this value should "
+                                        "not be too large if you wish to keep a low memory usage, but setting it large enough can substantially speed up "
+                                        "inference on \"small\" files that contain only a few million reads.")
+    ("maxOcc,m", po::value<int>(&(memOptions->max_occ))->default_value(200), "(S)MEMs occuring more than this many times won't be considered.")
+    ("maxReadOcc,w", po::value<uint32_t>(&(sopt.maxReadOccs))->default_value(100), "Reads \"mapping\" to more than this many places won't be considered.")
+    ("noEffectiveLengthCorrection", po::bool_switch(&(sopt.noEffectiveLengthCorrection))->default_value(false), "Disables "
+                        "effective length correction when computing the probability that a fragment was generated "
+                        "from a transcript.  If this flag is passed in, the fragment length distribution is not taken "
+                        "into account when computing this probability.")
+    ("noFragLengthDist", po::bool_switch(&(sopt.noFragLengthDist))->default_value(false), "[Currently Experimental] : "
+                        "Don't consider concordance with the learned fragment length distribution when trying to determine "
+                        "the probability that a fragment has originated from a specified location.  Normally, Fragments with "
+                         "unlikely lengths will be assigned a smaller relative probability than those with more likely "
+                        "lengths.  When this flag is passed in, the observed fragment length has no effect on that fragment's "
+                        "a priori probability.")
+    ("noFragStartPosDist", po::bool_switch(&(sopt.noFragStartPosDist))->default_value(false), "[Currently Experimental] : "
+                        "Don't consider / model non-uniformity in the fragment start positions "
+                        "across the transcript.")
+    //("noSeqBiasModel", po::bool_switch(&(sopt.noSeqBiasModel))->default_value(false),
+    //                    "Don't learn and apply a model of sequence-specific bias")
+    ("numAuxModelSamples", po::value<uint32_t>(&(sopt.numBurninFrags))->default_value(5000000), "The first <numAuxModelSamples> are used to train the "
+     			"auxiliary model parameters (e.g. fragment length distribution, bias, etc.).  After ther first <numAuxModelSamples> observations "
+			"the auxiliary model parameters will be assumed to have converged and will be fixed.")
+    ("numPreAuxModelSamples", po::value<uint32_t>(&(sopt.numPreBurninFrags))->default_value(1000000), "The first <numPreAuxModelSamples> will have their "
+     			"assignment likelihoods and contributions to the transcript abundances computed without applying any auxiliary models.  The purpose "
+			"of ignoring the auxiliary models for the first <numPreAuxModelSamples> observations is to avoid applying these models before thier "
+			"parameters have been learned sufficiently well.")
+    ("splitWidth,s", po::value<int>(&(memOptions->split_width))->default_value(0), "If (S)MEM occurs fewer than this many times, search for smaller, contained MEMs. "
+                                        "The default value will not split (S)MEMs, a higher value will result in more MEMs being explore and, thus, will "
+                                        "result in increased running time.")
+    ("splitSpanningSeeds,b", po::bool_switch(&(sopt.splitSpanningSeeds))->default_value(false), "Attempt to split seeds that happen to fall on the "
+                                        "boundary between two transcripts.  This can improve the  fragment hit-rate, but is usually not necessary.")
+    ("useMassBanking", po::bool_switch(&(sopt.useMassBanking))->default_value(false), "[Deprecated] : "
+                        "Use mass \"banking\" in subsequent epoch of inference.  Rather than re-observing uniquely "
+                        "mapped reads, simply remember the ratio of uniquely to ambiguously mapped reads for each "
+                        "transcript and distribute the unique mass uniformly throughout the epoch.")
+    ("useVBOpt,v", po::bool_switch(&(sopt.useVBOpt))->default_value(false), "Use the Variational Bayesian EM rather than the "
+     			"traditional EM algorithm for optimization in the batch passes.")
+    ("useGSOpt", po::bool_switch(&(sopt.useGSOpt))->default_value(false), "[*super*-experimental]: After the initial optimization has finished, "
+                "use collapsed Gibbs sampling to refine estimates even further (and obtain variance)")
+    ("numGibbsSamples", po::value<uint32_t>(&(sopt.numGibbsSamples))->default_value(500), "[*super*-experimental]: Number of Gibbs sampling rounds to "
+     		"perform.");
+
+
+
+    po::options_description testing("\n"
+            "testing options");
+    testing.add_options()
+        ("noRichEqClasses", po::bool_switch(&(sopt.noRichEqClasses))->default_value(false),
+                        "[TESTING OPTION]: Disable \"rich\" equivalent classes.  If this flag is passed, then "
+                        "all information about the relative weights for each transcript in the "
+                        "label of an equivalence class will be ignored, and only the relative "
+                        "abundance and effective length of each transcript will be considered.")
+        ("noFragLenFactor", po::bool_switch(&(sopt.noFragLenFactor))->default_value(false),
+                        "[TESTING OPTION]: Disable the factor in the likelihood that takes into account the "
+                        "goodness-of-fit of an alignment with the empirical fragment length "
+                        "distribution");
+
+    po::options_description all("salmon quant options");
+    all.add(generic).add(advanced).add(testing);
+
+    po::options_description visible("salmon quant options");
+    visible.add(generic).add(advanced);
+
+    po::variables_map vm;
+    try {
+        auto orderedOptions = po::command_line_parser(argc,argv).
+            options(all).run();
+
+        po::store(orderedOptions, vm);
+
+        if ( vm.count("help") ) {
+            auto hstring = R"(
+Quant
+==========
+Perform streaming SMEM-based estimation of
+transcript abundance from RNA-seq reads
+)";
+            std::cout << hstring << std::endl;
+            std::cout << visible << std::endl;
+            std::exit(1);
+        }
+
+        po::notify(vm);
+
+        std::stringstream commentStream;
+        commentStream << "# salmon (smem-based) v" << salmon::version << "\n";
+        commentStream << "# [ program ] => salmon \n";
+        commentStream << "# [ command ] => quant \n";
+        for (auto& opt : orderedOptions.options) {
+            commentStream << "# [ " << opt.string_key << " ] => {";
+            for (auto& val : opt.value) {
+                commentStream << " " << val;
+            }
+            commentStream << " }\n";
+        }
+        std::string commentString = commentStream.str();
+        fmt::print(stderr, "{}", commentString);
+
+        // Verify the geneMap before we start doing any real work.
+        bfs::path geneMapPath;
+        if (vm.count("geneMap")) {
+            // Make sure the provided file exists
+            geneMapPath = vm["geneMap"].as<std::string>();
+            if (!bfs::exists(geneMapPath)) {
+                std::cerr << "Could not fine transcript <=> gene map file " << geneMapPath << "\n";
+                std::cerr << "Exiting now: please either omit the \'geneMap\' option or provide a valid file\n";
+                std::exit(1);
+            }
+        }
+
+        bool greedyChain = !optChain;
+        bfs::path outputDirectory(vm["output"].as<std::string>());
+        bfs::create_directory(outputDirectory);
+        if (!(bfs::exists(outputDirectory) and bfs::is_directory(outputDirectory))) {
+            std::cerr << "Couldn't create output directory " << outputDirectory << "\n";
+            std::cerr << "exiting\n";
+            std::exit(1);
+        }
+
+        bfs::path indexDirectory(vm["index"].as<string>());
+        bfs::path logDirectory = outputDirectory / "logs";
+
+        sopt.indexDirectory = indexDirectory;
+        sopt.outputDirectory = outputDirectory;
+
+        // Create the logger and the logging directory
+        bfs::create_directory(logDirectory);
+        if (!(bfs::exists(logDirectory) and bfs::is_directory(logDirectory))) {
+            std::cerr << "Couldn't create log directory " << logDirectory << "\n";
+            std::cerr << "exiting\n";
+            std::exit(1);
+        }
+        std::cerr << "Logs will be written to " << logDirectory.string() << "\n";
+
+        bfs::path logPath = logDirectory / "salmon_quant.log";
+	// must be a power-of-two
+        size_t max_q_size = 2097152;
+        spdlog::set_async_mode(max_q_size);
+
+        auto fileSink = std::make_shared<spdlog::sinks::simple_file_sink_mt>(logPath.string(), true);
+        auto consoleSink = std::make_shared<spdlog::sinks::stderr_sink_mt>();
+        auto consoleLog = spdlog::create("consoleLog", {consoleSink});
+        auto fileLog = spdlog::create("fileLog", {fileSink});
+        auto jointLog = spdlog::create("jointLog", {fileSink, consoleSink});
+
+        sopt.jointLog = jointLog;
+        sopt.fileLog = fileLog;
+
+        // Verify that no inconsistent options were provided
+        {
+            if (sopt.noFragLengthDist and !sopt.noEffectiveLengthCorrection) {
+                jointLog->info() << "Error: You cannot enable --noFragLengthDist without "
+                                 << "also enabling --noEffectiveLengthCorrection; exiting!\n";
+                std::exit(1);
+            }
+        }
+
+        // maybe arbitrary, but if it's smaller than this, consider it
+        // equal to LOG_0
+        if (sopt.incompatPrior < 1e-320) {
+            sopt.incompatPrior = salmon::math::LOG_0;
+        } else {
+            sopt.incompatPrior = std::log(sopt.incompatPrior);
+        }
+        // END: option checking
+
+        jointLog->info() << "parsing read library format";
+
+        vector<ReadLibrary> readLibraries = salmon::utils::extractReadLibraries(orderedOptions);
+        ReadExperiment experiment(readLibraries, indexDirectory, sopt);
+
+        // This will be the class in charge of maintaining our
+	// rich equivalence classes
+        experiment.equivalenceClassBuilder().start();
+
+        quantifyLibrary(experiment, greedyChain, memOptions, sopt, coverageThresh,
+                        requiredObservations, sopt.numThreads);
+
+        // Now that the streaming pass is complete, we have
+	// our initial estimates, and our rich equivalence
+	// classes.  Perform further optimization until
+	// convergence.
+        CollapsedEMOptimizer optimizer;
+        jointLog->info("Starting optimizer");
+    	salmon::utils::normalizeAlphas(sopt, experiment);
+        optimizer.optimize(experiment, sopt, 0.01, 10000);
+        jointLog->info("Finished optimizer");
+
+        if (sopt.useGSOpt) {
+            jointLog->info("Starting Gibbs Sampler");
+            CollapsedGibbsSampler sampler;
+            sampler.sample(experiment, sopt, sopt.numGibbsSamples);
+            jointLog->info("Finished Gibbs Sampler");
+        }
+
+
+        free(memOptions);
+        size_t tnum{0};
+
+        jointLog->info("writing output \n");
+
+        bfs::path estFilePath = outputDirectory / "quant.sf";
+
+        commentStream << "# [ mapping rate ] => { " << experiment.mappingRate() * 100.0 << "\% }\n";
+        commentString = commentStream.str();
+
+        salmon::utils::writeAbundancesFromCollapsed(
+                sopt, experiment, estFilePath, commentString);
+
+        // Now create a subdirectory for any parameters of interest
+        bfs::path paramsDir = outputDirectory / "libParams";
+        if (!boost::filesystem::exists(paramsDir)) {
+            if (!boost::filesystem::create_directory(paramsDir)) {
+                fmt::print(stderr, "{}ERROR{}: Could not create "
+                           "output directory for experimental parameter "
+                           "estimates [{}]. exiting.", ioutils::SET_RED,
+                           ioutils::RESET_COLOR, paramsDir);
+                std::exit(-1);
+            }
+        }
+
+        bfs::path libCountFilePath = outputDirectory / "libFormatCounts.txt";
+        experiment.summarizeLibraryTypeCounts(libCountFilePath);
+
+        // Test writing out the fragment length distribution
+        if (!sopt.noFragLengthDist) {
+            bfs::path distFileName = paramsDir / "flenDist.txt";
+            {
+                std::unique_ptr<std::FILE, int (*)(std::FILE *)> distOut(std::fopen(distFileName.c_str(), "w"), std::fclose);
+                fmt::print(distOut.get(), "{}\n", experiment.fragmentLengthDistribution()->toString());
+            }
+        }
+        if (!sopt.noSeqBiasModel) {
+            bfs::path biasFileName = paramsDir / "seqBias.txt";
+            {
+                std::unique_ptr<std::FILE, int (*)(std::FILE *)> biasOut(std::fopen(biasFileName.c_str(), "w"), std::fclose);
+                fmt::print(biasOut.get(), "{}\n", experiment.sequenceBiasModel().toString());
+            }
+        }
+
+        if (biasCorrect) {
+            auto origExpressionFile = estFilePath;
+
+            auto outputDirectory = estFilePath;
+            outputDirectory.remove_filename();
+
+            auto biasFeatPath = indexDirectory / "bias_feats.txt";
+            auto biasCorrectedFile = outputDirectory / "quant_bias_corrected.sf";
+            performBiasCorrectionSalmon(biasFeatPath, estFilePath, biasCorrectedFile, sopt.numThreads);
+        }
+
+        /** If the user requested gene-level abundances, then compute those now **/
+        if (vm.count("geneMap")) {
+            try {
+                salmon::utils::generateGeneLevelEstimates(geneMapPath,
+                                                            outputDirectory,
+                                                            biasCorrect);
+            } catch (std::invalid_argument& e) {
+                fmt::print(stderr, "Error: [{}] when trying to compute gene-level "\
+                                   "estimates. The gene-level file(s) may not exist",
+                                   e.what());
+            }
+        }
+
+    } catch (po::error &e) {
+        std::cerr << "Exception : [" << e.what() << "]. Exiting.\n";
+        std::exit(1);
+    } catch (const spdlog::spdlog_ex& ex) {
+        std::cerr << "logger failed with : [" << ex.what() << "]. Exiting.\n";
+        std::exit(1);
+    } catch (std::exception& e) {
+        std::cerr << "Exception : [" << e.what() << "]\n";
+        std::cerr << argv[0] << " quant was invoked improperly.\n";
+        std::cerr << "For usage information, try " << argv[0] << " quant --help\nExiting.\n";
+        std::exit(1);
+    }
+
+
+    return 0;
+}
+
diff --git a/src/SalmonQuantify.cpp.bak b/src/SalmonQuantify.cpp.bak
new file mode 100644
index 0000000..0c2acd3
--- /dev/null
+++ b/src/SalmonQuantify.cpp.bak
@@ -0,0 +1,1314 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cmath>
+#include <unordered_map>
+#include <map>
+#include <vector>
+#include <unordered_set>
+#include <mutex>
+#include <thread>
+#include <sstream>
+#include <exception>
+#include <random>
+#include <queue>
+#include "btree_map.h"
+#include "btree_set.h"
+
+
+/** BWA Includes */
+#include <cstdio>
+#include <unistd.h>
+#include <cstdlib>
+#include <cstring>
+#include <cctype>
+
+extern "C" {
+#include "bwa.h"
+#include "bwamem.h"
+#include "kvec.h"
+#include "utils.h"
+#include "kseq.h"
+#include "utils.h"
+}
+
+/** Boost Includes */
+#include <boost/filesystem.hpp>
+#include <boost/container/flat_map.hpp>
+#include <boost/dynamic_bitset/dynamic_bitset.hpp>
+#include <boost/range/irange.hpp>
+#include <boost/program_options.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <boost/lockfree/queue.hpp>
+#include <boost/thread/thread.hpp>
+
+#include "tbb/concurrent_unordered_set.h"
+#include "tbb/concurrent_vector.h"
+#include "tbb/concurrent_unordered_map.h"
+#include "tbb/concurrent_queue.h"
+#include "tbb/parallel_for.h"
+#include "tbb/parallel_for_each.h"
+#include "tbb/parallel_reduce.h"
+#include "tbb/blocked_range.h"
+#include "tbb/task_scheduler_init.h"
+#include "tbb/partitioner.h"
+
+#if HAVE_LOGGER
+#include "g2logworker.h"
+#include "g2log.h"
+#endif
+
+#include "cereal/types/vector.hpp"
+#include "cereal/archives/binary.hpp"
+
+#include "jellyfish/mer_dna.hpp"
+
+#include "ClusterForest.hpp"
+#include "PerfectHashIndex.hpp"
+#include "LookUpTableUtils.hpp"
+#include "SalmonMath.hpp"
+#include "Transcript.hpp"
+#include "LibraryFormat.hpp"
+#include "SailfishUtils.hpp"
+#include "ReadLibrary.hpp"
+
+#include "PairSequenceParser.hpp"
+
+//#include "google/dense_hash_map"
+
+extern unsigned char nst_nt4_table[256];
+char* bwa_pg = "cha";
+
+typedef pair_sequence_parser<char**> sequence_parser;
+
+using TranscriptID = uint32_t;
+using TranscriptIDVector = std::vector<TranscriptID>;
+using KmerIDMap = std::vector<TranscriptIDVector>;
+using my_mer = jellyfish::mer_dna_ns::mer_base_static<uint64_t, 1>;
+
+uint32_t transcript(uint64_t enc) {
+    uint32_t t = (enc & 0xFFFFFFFF00000000) >> 32;
+    return t;
+}
+
+uint32_t offset(uint64_t enc) {
+    uint32_t o = enc & 0xFFFFFFFF;
+    return o;
+}
+
+class Alignment {
+    public:
+        Alignment(TranscriptID transcriptIDIn, uint32_t kCountIn = 1, double logProbIn = salmon::math::LOG_0) :
+            transcriptID_(transcriptIDIn), kmerCount(kCountIn), logProb(logProbIn) {}
+
+        inline TranscriptID transcriptID() { return transcriptID_; }
+        uint32_t kmerCount;
+        double logProb;
+
+    private:
+        TranscriptID transcriptID_;
+};
+
+void processMiniBatch(
+        double logForgettingMass,
+        std::vector<std::vector<Alignment>>& batchHits,
+        std::vector<Transcript>& transcripts,
+        ClusterForest& clusterForest
+        ) {
+
+    using salmon::math::LOG_0;
+    using salmon::math::logAdd;
+    using salmon::math::logSub;
+
+    size_t numTranscripts{transcripts.size()};
+
+    bool burnedIn{true};
+    // Build reverse map from transcriptID => hit id
+    using HitID = uint32_t;
+    btree::btree_map<TranscriptID, std::vector<Alignment*>> hitsForTranscript;
+    size_t hitID{0};
+    for (auto& hv : batchHits) {
+        for (auto& tid : hv) {
+            hitsForTranscript[tid.transcriptID()].push_back(&tid);
+        }
+        ++hitID;
+    }
+
+    double clustTotal = std::log(batchHits.size()) + logForgettingMass;
+    {
+        // E-step
+
+        // Iterate over each group of alignments (a group consists of all alignments reported
+        // for a single read).  Distribute the read's mass proportionally dependent on the
+        // current hits
+        for (auto& alnGroup : batchHits) {
+            if (alnGroup.size() == 0) { continue; }
+            double sumOfAlignProbs{LOG_0};
+            // update the cluster-level properties
+            bool transcriptUnique{true};
+            auto firstTranscriptID = alnGroup.front().transcriptID();
+            std::unordered_set<size_t> observedTranscripts;
+            for (auto& aln : alnGroup) {
+                auto transcriptID = aln.transcriptID();
+                auto& transcript = transcripts[transcriptID];
+                transcriptUnique = transcriptUnique and (transcriptID == firstTranscriptID);
+
+                double refLength = transcript.RefLength > 0 ? transcript.RefLength : 1.0;
+
+                double logFragProb = salmon::math::LOG_1;
+
+                // The alignment probability is the product of a transcript-level term (based on abundance and) an alignment-level
+                // term below which is P(Q_1) * P(Q_2) * P(F | T)
+                double logRefLength = std::log(refLength);
+                double transcriptLogCount = transcript.mass();
+                if ( transcriptLogCount != LOG_0 ) {
+                    double errLike = salmon::math::LOG_1;
+                    if (burnedIn) {
+                        //errLike = errMod.logLikelihood(aln, transcript);
+                    }
+
+                    aln.logProb = std::log(std::pow(aln.kmerCount,2.0)) + (transcriptLogCount - logRefLength);// + qualProb + errLike;
+                    //aln.logProb = (transcriptLogCount - logRefLength);// + qualProb + errLike;
+
+
+                    sumOfAlignProbs = logAdd(sumOfAlignProbs, aln.logProb);
+                    if (observedTranscripts.find(transcriptID) == observedTranscripts.end()) {
+                        transcripts[transcriptID].addTotalCount(1);
+                        observedTranscripts.insert(transcriptID);
+                    }
+                } else {
+                    aln.logProb = LOG_0;
+                }
+            }
+
+            // normalize the hits
+            if (sumOfAlignProbs == LOG_0) { std::cerr << "0 probability fragment; skipping\n"; continue; }
+            for (auto& aln : alnGroup) {
+                aln.logProb -= sumOfAlignProbs;
+                auto transcriptID = aln.transcriptID();
+                auto& transcript = transcripts[transcriptID];
+                /*
+                double r = uni(eng);
+                if (!burnedIn and r < std::exp(aln.logProb)) {
+                    errMod.update(aln, transcript, aln.logProb, logForgettingMass);
+                    if (aln.fragType() == ReadType::PAIRED_END) {
+                        double fragLength = aln.fragLen();//std::abs(aln.read1->core.pos - aln.read2->core.pos) + aln.read2->core.l_qseq;
+                        fragLengthDist.addVal(fragLength, logForgettingMass);
+                    }
+                }
+                */
+
+            } // end normalize
+
+            // update the single target transcript
+            if (transcriptUnique) {
+                transcripts[firstTranscriptID].addUniqueCount(1);
+                clusterForest.updateCluster(firstTranscriptID, 1, logForgettingMass);
+            } else { // or the appropriate clusters
+                clusterForest.mergeClusters<Alignment>(alnGroup.begin(), alnGroup.end());
+                clusterForest.updateCluster(alnGroup.front().transcriptID(), 1, logForgettingMass);
+            }
+
+            } // end read group
+        }// end timer
+
+        double individualTotal = LOG_0;
+        {
+            // M-step
+            double totalMass{0.0};
+            for (auto kv = hitsForTranscript.begin(); kv != hitsForTranscript.end(); ++kv) {
+                auto transcriptID = kv->first;
+                // The target must be a valid transcript
+                if (transcriptID >= numTranscripts or transcriptID < 0) {std::cerr << "index " << transcriptID << " out of bounds\n"; }
+
+                auto& transcript = transcripts[transcriptID];
+
+                // The prior probability
+                double hitMass{LOG_0};
+
+                // The set of alignments that match transcriptID
+                auto& hits = kv->second;
+                std::for_each(hits.begin(), hits.end(), [&](Alignment* aln) -> void {
+                        if (!std::isfinite(aln->logProb)) { std::cerr << "hitMass = " << aln->logProb << "\n"; }
+                        hitMass = logAdd(hitMass, aln->logProb);
+                        });
+
+                double updateMass = logForgettingMass + hitMass;
+                individualTotal = logAdd(individualTotal, updateMass);
+                totalMass = logAdd(totalMass, updateMass);
+                transcript.addMass(updateMass);
+            } // end for
+        } // end timer
+        //if (processedReads >= 5000000 and !burnedIn) { burnedIn = true; }
+}
+
+uint32_t basesCovered(std::vector<uint32_t>& kmerHits) {
+    std::sort(kmerHits.begin(), kmerHits.end());
+    uint32_t covered{0};
+    uint32_t lastHit{0};
+    uint32_t kl{20};
+    for (auto h : kmerHits) {
+        covered += std::min(h - lastHit, kl);
+        lastHit = h;
+    }
+    return covered;
+}
+
+uint32_t basesCovered(std::vector<uint32_t>& posLeft, std::vector<uint32_t>& posRight) {
+    return basesCovered(posLeft) + basesCovered(posRight);
+}
+
+class KmerVote {
+    public:
+        KmerVote(uint32_t vp, uint32_t rp, uint32_t vl) : votePos(vp), readPos(rp), voteLen(vl) {}
+        uint32_t votePos{0};
+        uint32_t readPos{0};
+        uint32_t voteLen{0};
+};
+
+
+class TranscriptHitList {
+    public:
+        uint32_t bestHitPos{0};
+        uint32_t bestHitScore{0};
+        std::vector<KmerVote> votes;
+
+        void addVote(uint32_t tpos, uint32_t readPos, uint32_t voteLen) {
+            uint32_t transcriptPos = (readPos > tpos) ? 0 : tpos - readPos;
+            votes.emplace_back(transcriptPos, readPos, voteLen);
+        }
+
+        void addVoteRC(uint32_t tpos, uint32_t readPos, uint32_t voteLen) {
+            uint32_t transcriptPos = (readPos > tpos) ? 0 : tpos + readPos;
+            votes.emplace_back(transcriptPos, readPos, voteLen);
+        }
+
+
+        uint32_t totalNumHits() { return votes.size(); }
+
+        bool computeBestHit() {
+            std::sort(votes.begin(), votes.end(),
+                    [](const KmerVote& v1, const KmerVote& v2) -> bool {
+                        if (v1.votePos == v2.votePos) {
+                            return v1.readPos < v2.readPos;
+                        }
+                        return v1.votePos < v2.votePos;
+                    });
+
+            /*
+            std::cerr << "(" << votes.size() << ") { ";
+            for (auto v : votes) {
+                std::cerr << v.votePos << " ";
+            }
+            std::cerr << "}\n";
+            */
+            uint32_t maxClusterPos{0};
+            uint32_t maxClusterCount{0};
+
+            struct VoteInfo {
+                uint32_t coverage;
+                uint32_t rightmostBase;
+            };
+
+            boost::container::flat_map<uint32_t, VoteInfo> hitMap;
+            int32_t currClust{votes.front().votePos};
+            for (size_t j = 0; j < votes.size(); ++j) {
+
+                uint32_t votePos = votes[j].votePos;
+                uint32_t readPos = votes[j].readPos;
+                uint32_t voteLen = votes[j].voteLen;
+
+                if (votePos >= currClust) {
+                    if (votePos - currClust > 10) {
+                        currClust = votePos;
+                    }
+                    auto& hmEntry = hitMap[currClust];
+
+                    hmEntry.coverage += std::min(voteLen, (readPos + voteLen) - hmEntry.rightmostBase);
+                    hmEntry.rightmostBase = readPos + voteLen;
+                } else if (votePos < currClust) {
+                    std::cerr << "CHA?!?\n";
+                    if (currClust - votePos > 10) {
+                        currClust = votePos;
+                    }
+                    auto& hmEntry = hitMap[currClust];
+                    hmEntry.coverage += std::min(voteLen, (readPos + voteLen) - hmEntry.rightmostBase);
+                    hmEntry.rightmostBase = readPos + voteLen;
+                }
+
+                if (hitMap[currClust].coverage > maxClusterCount) {
+                    maxClusterCount = hitMap[currClust].coverage;
+                    maxClusterPos = currClust;
+                }
+
+            }
+
+            bestHitPos = maxClusterPos;
+            bestHitScore = maxClusterCount;
+            return true;
+        }
+};
+
+// To use the parser in the following, we get "jobs" until none is
+// available. A job behaves like a pointer to the type
+// jellyfish::sequence_list (see whole_sequence_parser.hpp).
+void add_sizes(sequence_parser* parser, std::atomic<uint64_t>* total_fwd, std::atomic<uint64_t>* total_bwd,
+               std::atomic<uint64_t>& totalHits,
+               std::atomic<uint64_t>& rn,
+               PerfectHashIndex& phi,
+               std::vector<Transcript>& transcripts,
+               std::atomic<uint64_t>& batchNum,
+               double& logForgettingMass,
+               std::mutex& ffMutex,
+               ClusterForest& clusterForest,
+               std::vector<uint64_t>& offsets,
+               std::vector<uint64_t>& kmerLocs,
+               uint32_t sampleRate
+               //google::dense_hash_map<uint64_t, uint64_t>& khash
+               //std::vector<std::vector<uint64_t>>& kmerLocMap
+               ) {
+  uint64_t count_fwd = 0, count_bwd = 0;
+  auto INVALID = phi.INVALID;
+  auto merLen = phi.kmerLength();
+
+  double forgettingFactor{0.65};
+
+  std::vector<std::vector<Alignment>> hitLists;
+  hitLists.resize(5000);
+
+  uint64_t leftHitCount{0};
+  uint64_t hitListCount{0};
+
+  class DirHit {
+      uint32_t fwd{0};
+      uint32_t rev{0};
+  };
+
+  size_t locRead{0};
+  while(true) {
+    sequence_parser::job j(*parser); // Get a job from the parser: a bunch of read (at most max_read_group)
+    if(j.is_empty()) break;          // If got nothing, quit
+
+    my_mer kmer;
+    my_mer rkmer;
+
+    for(size_t i = 0; i < j->nb_filled; ++i) { // For all the read we got
+
+        hitLists.resize(j->nb_filled);
+
+        /*
+        btree::btree_map<uint64_t, uint32_t> eqLeftFwdPos;
+        btree::btree_map<uint64_t, uint32_t> eqLeftBwdPos;
+        btree::btree_map<uint64_t, uint32_t> eqRightFwdPos;
+        btree::btree_map<uint64_t, uint32_t> eqRightBwdPos;
+        */
+
+        std::unordered_map<uint64_t, TranscriptHitList> leftFwdHits;
+        std::unordered_map<uint64_t, TranscriptHitList> leftBwdHits;
+
+        std::unordered_map<uint64_t, TranscriptHitList> rightFwdHits;
+        std::unordered_map<uint64_t, TranscriptHitList> rightBwdHits;
+
+        uint32_t totFwdLeft = 0;
+        uint32_t totBwdLeft = 0;
+        uint32_t totFwdRight = 0;
+        uint32_t totBwdRight = 0;
+        uint32_t readLength = 0;
+
+        uint32_t leftReadLength{0};
+        uint32_t rightReadLength{0};
+
+        uint64_t prevEQ{std::numeric_limits<uint64_t>::max()};
+
+        count_fwd += j->data[i].first.seq.size();        // Add up the size of the sequence
+        count_bwd += j->data[i].second.seq.size();        // Add up the size of the sequence
+
+        //---------- Left End ----------------------//
+        {
+            const char* start     = j->data[i].first.seq.c_str();
+            uint32_t readLen      = j->data[i].first.seq.size();
+            leftReadLength = readLen;
+            const char* const end = start + readLen;
+            uint32_t cmlen{0};
+            uint32_t rbase{0};
+            uint32_t phase{0};
+            // iterate over the read base-by-base
+            while(start < end) {
+                ++rbase; ++readLength;
+                char base = *start; ++start;
+                auto c = jellyfish::mer_dna::code(base);
+                kmer.shift_left(c);
+                rkmer.shift_right(jellyfish::mer_dna::complement(c));
+                switch(c) {
+                    case jellyfish::mer_dna::CODE_IGNORE:
+                        break;
+                    case jellyfish::mer_dna::CODE_COMMENT:
+                        std::cerr << "ERROR: unexpected character " << base << " in read!\n";
+                        // Fall through
+                    case jellyfish::mer_dna::CODE_RESET:
+                        cmlen = 0;
+                        kmer.polyA(); rkmer.polyA();
+                        break;
+
+                    default:
+                        if(++cmlen >= merLen) {
+                            if (phase++ % sampleRate != 0) { continue; }
+                            cmlen = merLen;
+                            auto merID = phi.index(kmer.get_bits(0, 2*merLen));
+                            //auto merIt = khash.find(kmer.get_bits(0, 2*merLen));
+                            //auto merID = (merIt == khash.end()) ? INVALID : merIt->second;
+
+
+                            if (merID != INVALID) {
+                                // Locations
+                                auto first = offsets[merID];
+                                auto last = offsets[merID+1];
+                                for (size_t ei = first; ei < last; ++ei) {
+                                    uint64_t e = kmerLocs[ei];
+                                    //for (uint64_t e : kmerLocMap[merID]) {
+                                    leftFwdHits[transcript(e)].addVote(offset(e), rbase - merLen, merLen);
+                                }
+                                /*
+                                auto eqClass = memberships[merID];
+                                eqLeftFwdPos[eqClass]++;//.push_back(rbase);
+                                */
+                                totFwdLeft++;
+                            }
+
+                            auto rmerID = phi.index(rkmer.get_bits(0, 2*merLen));
+                            //auto rmerIt = khash.find(rkmer.get_bits(0, 2*merLen));
+                            //auto rmerID = (rmerIt == khash.end()) ? INVALID : rmerIt->second;
+
+                            if (rmerID != INVALID) {
+                                // Locations
+                                auto first = offsets[rmerID];
+                                auto last = offsets[rmerID+1];
+                                for (size_t ei = first ; ei < last; ++ei) {
+                                    uint64_t e = kmerLocs[ei];
+                                    //for (uint64_t e : kmerLocMap[rmerID]) {
+                                    leftFwdHits[transcript(e)].addVoteRC(offset(e), rbase - merLen, merLen);
+                                }
+
+                                /*
+                                auto eqClass = memberships[rmerID];
+                                eqLeftBwdPos[eqClass]++;//.push_back(rbase);
+                                */
+                                totBwdLeft++;
+                            }
+
+                        }
+
+                        } // end switch(c)
+                } // while (start < end)
+        }
+
+        hitLists[i].clear();
+        auto& hitList = hitLists[i];
+        prevEQ = std::numeric_limits<uint64_t>::max();
+        //---------- Right End ----------------------//
+        {
+            kmer.polyA();
+            rkmer.polyA();
+            const char* start     = j->data[i].second.seq.c_str();
+            uint32_t readLen      = j->data[i].second.seq.size();
+            rightReadLength = readLen;
+            const char* const end = start + readLen;
+            uint32_t cmlen{0};
+            uint32_t rbase{0};
+
+            uint32_t phase{0};
+            // iterate over the read base-by-base
+            while(start < end) {
+                ++rbase; ++readLength;
+                char base = *start; ++start;
+                auto c = jellyfish::mer_dna::code(base);
+                kmer.shift_left(c);
+                rkmer.shift_right(jellyfish::mer_dna::complement(c));
+                switch(c) {
+                    case jellyfish::mer_dna::CODE_IGNORE:
+                        break;
+                    case jellyfish::mer_dna::CODE_COMMENT:
+                        std::cerr << "ERROR: unexpected character " << base << " in read!\n";
+                        // Fall through
+                    case jellyfish::mer_dna::CODE_RESET:
+                        cmlen = 0;
+                        kmer.polyA(); rkmer.polyA();
+                        break;
+
+                    default:
+                        if(++cmlen >= merLen) {
+                            if (phase++ % sampleRate != 0) { continue; }
+                            cmlen = merLen;
+                            auto merID = phi.index(kmer.get_bits(0, 2*merLen));
+                            //auto merIt = khash.find(kmer.get_bits(0, 2*merLen));
+                            //auto merID = (merIt == khash.end()) ? INVALID : merIt->second;
+
+                            if (merID != INVALID) {
+                                // Locations
+                                auto first = offsets[merID];
+                                auto last = offsets[merID+1];
+                                for (size_t ei = first; ei < last; ++ei) {
+                                    uint64_t e = kmerLocs[ei];
+                                    //for (uint64_t e : kmerLocMap[merID]) {
+                                     rightFwdHits[transcript(e)].addVote(offset(e), rbase - merLen, merLen);
+                                }
+                                /*
+                                auto eqClass = memberships[merID];
+                                eqRightFwdPos[eqClass]++;//.push_back(rbase);
+                                */
+                                totFwdRight++;
+
+                            }
+
+
+                            auto rmerID = phi.index(rkmer.get_bits(0, 2*merLen));
+                            //auto rmerIt = khash.find(rkmer.get_bits(0, 2*merLen));
+                            //auto rmerID = (rmerIt == khash.end()) ? INVALID : rmerIt->second;
+
+                            if (rmerID != INVALID) {
+                                // Locations
+                                auto first = offsets[rmerID];
+                                auto last = offsets[rmerID+1];
+                                for (size_t ei = first; ei < last; ++ei) {
+                                    uint64_t e = kmerLocs[ei];
+                                    //for (uint64_t e : kmerLocMap[rmerID]) {
+                                    rightFwdHits[transcript(e)].addVoteRC(offset(e), rbase - merLen, merLen);
+                                }
+                                /*
+                                auto eqClass = memberships[rmerID];
+                                eqRightBwdPos[eqClass]++;//.push_back(rbase);
+                                */
+                                totBwdRight++;
+                            }
+
+
+
+                        }
+
+                } // end switch(c)
+            } // while (start < end)
+        }
+
+        /*
+        std::unordered_map<TranscriptID, uint32_t> leftHits;
+        std::unordered_map<TranscriptID, uint32_t> rightHits;
+
+       auto& eqLeftPos = (totFwdLeft >= totBwdLeft) ? eqLeftFwdPos : eqLeftBwdPos;
+       auto& eqRightPos = (totFwdRight >= totBwdRight) ? eqRightFwdPos : eqRightBwdPos;
+
+      for (auto& leqPos: eqLeftPos) {
+          for (auto t : klut[leqPos.first]) {
+              // --- counts
+              leftHits[t] += leqPos.second;//.size();
+
+              // --- positions
+             //auto& thits = leftHits[t];
+             //thits.insert(thits.end(), leqPos.second.begin(), leqPos.second.end());
+         }
+      }
+
+      for (auto& reqPos : eqRightPos) {
+          for (auto t : klut[reqPos.first]) {
+              auto it = leftHits.find(t);
+              if (it != leftHits.end() and it->second > 40) {
+                  // --- counts
+                  rightHits[t] += reqPos.second;//.size();
+
+                  // --- positions
+                  //auto score = basesCovered(it->second);
+                  //if (score < 70) { continue; }
+                  //auto& thits = rightHits[t];
+                  //thits.insert(thits.end(), reqPos.second.begin(), reqPos.second.end());
+              }
+          }
+      }
+
+      uint32_t readHits{0};
+      for (auto& rightHit : rightHits) {
+          if (rightHit.second > 40) {
+              // --- counts
+              uint32_t score = leftHits[rightHit.first] + rightHit.second;
+              hitList.emplace_back(rightHit.first, score);
+              readHits += score;
+
+              // --- positions
+              //auto rscore = basesCovered(rightHit.second);
+               //if (rscore >= 70) {
+              //uint32_t score = leftHits[rightHit.first] + rightHit.second;
+              //hitList.emplace_back(rightHit.first, score);
+              //readHits += score;
+          }
+      }
+      leftHitCount += leftHits.size();
+      hitListCount += hitList.size();
+
+        */
+
+        size_t readHits{0};
+    std::unordered_map<TranscriptID, uint32_t> leftHitCounts;
+    std::unordered_map<TranscriptID, uint32_t> rightHitCounts;
+
+    auto& leftHits = leftFwdHits;//(totFwdLeft >= totBwdLeft) ? leftFwdHits : leftBwdHits;
+    auto& rightHits = rightFwdHits;//(totFwdRight >= totBwdRight) ? rightFwdHits : rightBwdHits;
+
+    for (auto& tHitList : leftHits) {
+        // Coverage score
+        tHitList.second.computeBestHit();
+        //leftHitCounts[tHitList.first] = tHitList.second.totalNumHits();
+        ++leftHitCount;
+    }
+
+    double cutoffLeft{ 0.75 * leftReadLength};
+    double cutoffRight{ 0.75 * rightReadLength};
+
+    double cutoffLeftCount{ 0.80 * leftReadLength - merLen + 1};
+    double cutoffRightCount{ 0.80 * rightReadLength - merLen + 1};
+    for (auto& tHitList : rightHits) {
+        auto it = leftHits.find(tHitList.first);
+        // Simple counting score
+        /*
+        if (it != leftHits.end() and it->second.totalNumHits() >= cutoffLeftCount) {
+            if (tHitList.second.totalNumHits() < cutoffRightCount) { continue; }
+            uint32_t score = it->second.totalNumHits() + tHitList.second.totalNumHits();
+        */
+        // Coverage score
+        if (it != leftHits.end() and it->second.bestHitScore >= cutoffLeft) {
+            tHitList.second.computeBestHit();
+            if (tHitList.second.bestHitScore < cutoffRight) { continue; }
+            uint32_t score = it->second.bestHitScore + tHitList.second.bestHitScore;
+
+
+            hitList.emplace_back(tHitList.first, score);
+            readHits += score;
+            ++hitListCount;
+        }
+    }
+
+
+      totalHits += hitList.size() > 0;
+      locRead++;
+      ++rn;
+      if (rn % 50000 == 0) {
+          std::cerr << "\r\rprocessed read "  << rn;
+          std::cerr << "\n leftHits.size() " << leftHits.size()
+                    << ", rightHits.size() " << rightHits.size() << ", hit list of size = " << hitList.size() << "\n";
+          std::cerr << "average leftHits = " << leftHitCount / static_cast<float>(locRead)
+                    << ", average hitList = " << hitListCount / static_cast<float>(locRead) << "\n";
+      }
+
+      if (hitList.size() > 100) { hitList.clear(); }
+
+      double invHits = 1.0 / readHits;
+      for (auto t : hitList) {
+          transcripts[t.transcriptID()].addSharedCount( t.kmerCount * invHits );
+      }
+
+    } // end for i < j->nb_filled
+
+    auto oldBatchNum = batchNum++;
+    if (oldBatchNum > 1) {
+        ffMutex.lock();
+        logForgettingMass += forgettingFactor * std::log(static_cast<double>(oldBatchNum-1)) -
+        std::log(std::pow(static_cast<double>(oldBatchNum), forgettingFactor) - 1);
+        ffMutex.unlock();
+    }
+
+    //
+    processMiniBatch(logForgettingMass, hitLists, transcripts, clusterForest);
+
+  }
+
+  *total_fwd += count_fwd;
+  *total_bwd += count_bwd;
+}
+
+// To use the parser in the following, we get "jobs" until none is
+// available. A job behaves like a pointer to the type
+// jellyfish::sequence_list (see whole_sequence_parser.hpp).
+void processReadsMEM(sequence_parser* parser, std::atomic<uint64_t>* total_fwd, std::atomic<uint64_t>* total_bwd,
+               std::atomic<uint64_t>& totalHits,
+               std::atomic<uint64_t>& rn,
+               bwaidx_t *idx,
+               std::vector<Transcript>& transcripts,
+               std::atomic<uint64_t>& batchNum,
+               double& logForgettingMass,
+               std::mutex& ffMutex,
+               ClusterForest& clusterForest
+               ) {
+  uint64_t count_fwd = 0, count_bwd = 0;
+
+  double forgettingFactor{0.65};
+
+  std::vector<std::vector<Alignment>> hitLists;
+  hitLists.resize(5000);
+
+  uint64_t leftHitCount{0};
+  uint64_t hitListCount{0};
+
+  class DirHit {
+      uint32_t fwd{0};
+      uint32_t rev{0};
+  };
+
+
+  smem_i *itr = smem_itr_init(idx->bwt);
+  const bwtintv_v *a;
+  int minLen{20};
+  int minIWidth{20};
+  int splitWidth{0};
+
+  size_t locRead{0};
+  while(true) {
+    sequence_parser::job j(*parser); // Get a job from the parser: a bunch of read (at most max_read_group)
+    if(j.is_empty()) break;          // If got nothing, quit
+
+    my_mer kmer;
+    my_mer rkmer;
+
+    for(size_t i = 0; i < j->nb_filled; ++i) { // For all the read we got
+
+        hitLists.resize(j->nb_filled);
+
+        std::unordered_map<uint64_t, TranscriptHitList> leftFwdHits;
+        std::unordered_map<uint64_t, TranscriptHitList> leftBwdHits;
+
+        std::unordered_map<uint64_t, TranscriptHitList> rightFwdHits;
+        std::unordered_map<uint64_t, TranscriptHitList> rightBwdHits;
+
+        uint32_t totFwdLeft = 0;
+        uint32_t totBwdLeft = 0;
+        uint32_t totFwdRight = 0;
+        uint32_t totBwdRight = 0;
+        uint32_t readLength = 0;
+
+        uint32_t leftReadLength{0};
+        uint32_t rightReadLength{0};
+
+        uint64_t prevEQ{std::numeric_limits<uint64_t>::max()};
+
+        count_fwd += j->data[i].first.seq.size();        // Add up the size of the sequence
+        count_bwd += j->data[i].second.seq.size();        // Add up the size of the sequence
+
+        //---------- Left End ----------------------//
+        {
+            std::string& readStr   = j->data[i].first.seq;
+            uint32_t readLen      = j->data[i].first.seq.size();
+
+            for (int p = 0; p < readLen; ++p) {
+                readStr[p] = nst_nt4_table[static_cast<int>(readStr[p])];
+            }
+
+            char* readPtr = const_cast<char*>(readStr.c_str());
+
+            smem_set_query(itr, readLen, reinterpret_cast<uint8_t*>(readPtr));
+
+            // while there are more matches on the query
+            while ((a = smem_next(itr, minLen<<1, splitWidth)) != 0) {
+                for (size_t mi = 0; mi < a->n; ++mi) {
+                    bwtintv_t *p = &a->a[mi];
+                    if (static_cast<uint32_t>(p->info) - (p->info>>32) < minLen) continue;
+                    uint32_t qstart = static_cast<uint32_t>(p->info>>32);
+                    uint32_t qend = static_cast<uint32_t>(p->info);
+                    long numHits = static_cast<long>(p->x[2]);
+
+                    if ( p->x[2] <= minIWidth) {
+                        for (bwtint_t k = 0; k < p->x[2]; ++k) {
+                            bwtint_t pos;
+                            int len, isRev, refID;
+                            len = static_cast<uint32_t>(p->info - (p->info >> 32));
+                            pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &isRev);
+                            if (isRev) { pos -= len - 1; }
+                            bns_cnt_ambi(idx->bns, pos, len, &refID);
+                            long hitLoc = static_cast<long>(pos - idx->bns->anns[refID].offset) + 1;
+                            leftFwdHits[refID].addVote(hitLoc, hitLoc, len);
+                        } // for k
+                    } // if <= minIWidth
+
+                } // for mi
+            } // for all query matches
+        }
+
+        hitLists[i].clear();
+        auto& hitList = hitLists[i];
+        prevEQ = std::numeric_limits<uint64_t>::max();
+        //---------- Right End ----------------------//
+        {
+            std::string& readStr   = j->data[i].second.seq;
+            uint32_t readLen      = j->data[i].second.seq.size();
+
+            for (int p = 0; p < readLen; ++p) {
+                readStr[p] = nst_nt4_table[static_cast<int>(readStr[p])];
+            }
+
+            char* readPtr = const_cast<char*>(readStr.c_str());
+            smem_set_query(itr, readLen, reinterpret_cast<uint8_t*>(readPtr));
+
+            // while there are more matches on the query
+            while ((a = smem_next(itr, minLen<<1, splitWidth)) != 0) {
+                for (size_t mi = 0; mi < a->n; ++mi) {
+                    bwtintv_t *p = &a->a[mi];
+                    if (static_cast<uint32_t>(p->info) - (p->info>>32) < minLen) continue;
+                    uint32_t qstart = static_cast<uint32_t>(p->info>>32);
+                    uint32_t qend = static_cast<uint32_t>(p->info);
+                    long numHits = static_cast<long>(p->x[2]);
+
+                    if ( p->x[2] <= minIWidth) {
+                        for (bwtint_t k = 0; k < p->x[2]; ++k) {
+                            bwtint_t pos;
+                            int len, isRev, refID;
+                            len = static_cast<uint32_t>(p->info - (p->info >> 32));
+                            pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &isRev);
+                            if (isRev) { pos -= len - 1; }
+                            bns_cnt_ambi(idx->bns, pos, len, &refID);
+                            long hitLoc = static_cast<long>(pos - idx->bns->anns[refID].offset) + 1;
+                            rightFwdHits[refID].addVote(hitLoc, hitLoc, len);
+                        } // for k
+                    } // if <= minIWidth
+
+                } // for mi
+            } // for all query matches
+
+        } // end right
+
+        size_t readHits{0};
+        std::unordered_map<TranscriptID, uint32_t> leftHitCounts;
+        std::unordered_map<TranscriptID, uint32_t> rightHitCounts;
+
+        auto& leftHits = leftFwdHits;
+        auto& rightHits = rightFwdHits;
+
+        for (auto& tHitList : leftHits) {
+            // Coverage score
+            tHitList.second.computeBestHit();
+            ++leftHitCount;
+        }
+
+        double cutoffLeft{ 0.75 * leftReadLength};
+        double cutoffRight{ 0.75 * rightReadLength};
+
+        for (auto& tHitList : rightHits) {
+            auto it = leftHits.find(tHitList.first);
+            // Coverage score
+            if (it != leftHits.end() and it->second.bestHitScore >= cutoffLeft) {
+                tHitList.second.computeBestHit();
+                if (tHitList.second.bestHitScore < cutoffRight) { continue; }
+                uint32_t score = it->second.bestHitScore + tHitList.second.bestHitScore;
+                hitList.emplace_back(tHitList.first, score);
+                readHits += score;
+                ++hitListCount;
+            }
+        }
+
+        totalHits += hitList.size() > 0;
+        locRead++;
+        ++rn;
+        if (rn % 50000 == 0) {
+            std::cerr << "\r\rprocessed read "  << rn;
+            std::cerr << "\n leftHits.size() " << leftHits.size()
+                << ", rightHits.size() " << rightHits.size() << ", hit list of size = " << hitList.size() << "\n";
+//            std::cerr << "average leftHits = " << leftHitCount / static_cast<float>(locRead)
+//               << ", average hitList = " << hitListCount / static_cast<float>(locRead) << "\n";
+        }
+
+        if (hitList.size() > 100) { hitList.clear(); }
+
+        double invHits = 1.0 / readHits;
+        for (auto t : hitList) {
+            transcripts[t.transcriptID()].addSharedCount( t.kmerCount * invHits );
+        }
+
+        } // end for i < j->nb_filled
+
+    auto oldBatchNum = batchNum++;
+    if (oldBatchNum > 1) {
+        ffMutex.lock();
+        logForgettingMass += forgettingFactor * std::log(static_cast<double>(oldBatchNum-1)) -
+        std::log(std::pow(static_cast<double>(oldBatchNum), forgettingFactor) - 1);
+        ffMutex.unlock();
+    }
+
+    processMiniBatch(logForgettingMass, hitLists, transcripts, clusterForest);
+  }
+  smem_itr_destroy(itr);
+  *total_fwd += count_fwd;
+  *total_bwd += count_bwd;
+}
+
+
+
+int salmonQuantify(int argc, char *argv[]) {
+    using std::cerr;
+    using std::vector;
+    using std::string;
+    namespace bfs = boost::filesystem;
+    namespace po = boost::program_options;
+
+    uint32_t maxThreads = std::thread::hardware_concurrency();
+    uint32_t sampleRate{1};
+
+    vector<string> unmatedReadFiles;
+    vector<string> mate1ReadFiles;
+    vector<string> mate2ReadFiles;
+
+    po::options_description generic("salmon quant options");
+    generic.add_options()
+    ("version,v", "print version string")
+    ("help,h", "produce help message")
+    ("index,i", po::value<string>(), "sailfish index.")
+    ("libtype,l", po::value<std::string>(), "Format string describing the library type")
+    ("unmated_reads,r", po::value<vector<string>>(&unmatedReadFiles)->multitoken(),
+     "List of files containing unmated reads of (e.g. single-end reads)")
+    ("mates1,1", po::value<vector<string>>(&mate1ReadFiles)->multitoken(),
+        "File containing the #1 mates")
+    ("mates2,2", po::value<vector<string>>(&mate2ReadFiles)->multitoken(),
+        "File containing the #2 mates")
+    ("threads,p", po::value<uint32_t>()->default_value(maxThreads), "The number of threads to use concurrently.")
+    ("sample,s", po::value<uint32_t>(&sampleRate)->default_value(1), "Sample rate --- only consider every s-th k-mer in a read.")
+    ("output,o", po::value<std::string>(), "Output quantification file");
+
+    po::variables_map vm;
+    try {
+        auto orderedOptions = po::command_line_parser(argc,argv).
+            options(generic).run();
+
+        po::store(orderedOptions, vm);
+
+        if ( vm.count("help") ) {
+            auto hstring = R"(
+Quant
+==========
+Perform streaming k-mer-group-based estimation of
+transcript abundance from RNA-seq reads
+)";
+            std::cout << hstring << std::endl;
+            std::cout << generic << std::endl;
+            std::exit(1);
+        }
+
+        po::notify(vm);
+
+
+        for (auto& opt : orderedOptions.options) {
+            std::cerr << "[ " << opt.string_key << "] => {";
+            for (auto& val : opt.value) {
+                std::cerr << " " << val;
+            }
+            std::cerr << " }\n";
+        }
+
+        bfs::path outputDirectory(vm["output"].as<std::string>());
+        bfs::create_directory(outputDirectory);
+        if (!(bfs::exists(outputDirectory) and bfs::is_directory(outputDirectory))) {
+            std::cerr << "Couldn't create output directory " << outputDirectory << "\n";
+            std::cerr << "exiting\n";
+            std::exit(1);
+        }
+
+        bfs::path indexDirectory(vm["index"].as<string>());
+        bfs::path logDirectory = outputDirectory.parent_path() / "logs";
+
+#if HAVE_LOGGER
+        bfs::create_directory(logDirectory);
+        if (!(bfs::exists(logDirectory) and bfs::is_directory(logDirectory))) {
+            std::cerr << "Couldn't create log directory " << logDirectory << "\n";
+            std::cerr << "exiting\n";
+            std::exit(1);
+        }
+        std::cerr << "writing logs to " << logDirectory.string() << "\n";
+        g2LogWorker logger(argv[0], logDirectory.string());
+        g2::initializeLogging(&logger);
+#endif
+
+        vector<ReadLibrary> readLibraries;
+        for (auto& opt : orderedOptions.options) {
+            if (opt.string_key == "libtype") {
+                LibraryFormat libFmt = salmon::utils::parseLibraryFormatString(opt.value[0]);
+                if (libFmt.check()) {
+                    std::cerr << libFmt << "\n";
+                } else {
+                    std::stringstream ss;
+                    ss << libFmt << " is invalid!";
+                    throw std::invalid_argument(ss.str());
+                }
+                readLibraries.emplace_back(libFmt);
+            } else if (opt.string_key == "mates1") {
+                readLibraries.back().addMates1(opt.value);
+            } else if (opt.string_key == "mates2") {
+                readLibraries.back().addMates2(opt.value);
+            } else if (opt.string_key == "unmated_reads") {
+                readLibraries.back().addUnmated(opt.value);
+            }
+        }
+
+        for (auto& rl : readLibraries) { rl.checkValid(); }
+
+        auto& rl = readLibraries.front();
+        // Handle all the proper cases here
+        char* readFiles[] = { const_cast<char*>(rl.mates1().front().c_str()),
+                              const_cast<char*>(rl.mates2().front().c_str()) };
+
+        uint32_t nbThreads = vm["threads"].as<uint32_t>();
+
+        size_t maxReadGroup{2000}; // Number of files to read simultaneously
+        size_t concurrentFile{1}; // Number of reads in each "job"
+        sequence_parser parser(4 * nbThreads, maxReadGroup, concurrentFile,
+                readFiles, readFiles + 2);
+
+        // kmer-based
+        /*
+        bfs::path sfIndexPath = indexDirectory / "transcriptome.sfi";
+        std::string sfTrascriptIndexFile(sfIndexPath.string());
+        std::cerr << "reading index . . . ";
+        auto phi = PerfectHashIndex::fromFile(sfTrascriptIndexFile);
+        std::cerr << "done\n";
+        std::cerr << "index contained " << phi.numKeys() << " kmers\n";
+
+        size_t nkeys = phi.numKeys();
+        size_t merLen = phi.kmerLength();
+        */
+
+        /*
+        google::dense_hash_map<uint64_t, uint64_t> khash;
+        khash.set_empty_key(std::numeric_limits<uint64_t>::max());
+        uint64_t i{0};
+        for (auto k : phi.kmers()) {
+            khash[k] = phi.index(k);
+        }
+        size_t merLen = 20;
+        std::cerr << "kmer length = " << merLen << "\n";
+        my_mer::k(merLen);
+
+        bfs::path tlutPath = indexDirectory / "transcriptome.tlut";
+        // Get transcript lengths
+        std::ifstream ifile(tlutPath.string(), std::ios::binary);
+        size_t numRecords {0};
+        ifile.read(reinterpret_cast<char *>(&numRecords), sizeof(numRecords));
+
+        std::vector<Transcript> transcripts_tmp;
+
+        std::cerr << "Transcript LUT contained " << numRecords << " records\n";
+        //transcripts_.resize(numRecords);
+        for (auto i : boost::irange(size_t(0), numRecords)) {
+            auto ti = LUTTools::readTranscriptInfo(ifile);
+            // copy over the length, then we're done.
+            transcripts_tmp.emplace_back(ti->transcriptID, ti->name.c_str(), ti->length);
+        }
+
+        std::sort(transcripts_tmp.begin(), transcripts_tmp.end(),
+                [](const Transcript& t1, const Transcript& t2) -> bool {
+                return t1.id < t2.id;
+                });
+        std::vector<Transcript> transcripts_;
+        for (auto& t : transcripts_tmp) {
+            transcripts_.emplace_back(t.id, t.RefName.c_str(), t.RefLength);
+        }
+        transcripts_tmp.clear();
+        ifile.close();
+        // --- done ---
+
+        */
+
+        bwaidx_t *idx;
+
+        { // mem-based
+            bfs::path indexPath = indexDirectory / "bwaidx";
+            if ((idx = bwa_idx_load(indexPath.string().c_str(), BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1;
+        }
+
+        size_t numRecords = idx->bns->n_seqs;
+        std::vector<Transcript> transcripts_tmp;
+
+        std::cerr << "Transcript LUT contained " << numRecords << " records\n";
+        //transcripts_.resize(numRecords);
+        for (auto i : boost::irange(size_t(0), numRecords)) {
+            uint32_t id = i;
+            char* name = idx->bns->anns[i].name;
+            uint32_t len = idx->bns->anns[i].len;
+            // copy over the length, then we're done.
+            transcripts_tmp.emplace_back(id, name, len);
+        }
+
+        std::sort(transcripts_tmp.begin(), transcripts_tmp.end(),
+                [](const Transcript& t1, const Transcript& t2) -> bool {
+                return t1.id < t2.id;
+                });
+        std::vector<Transcript> transcripts_;
+        for (auto& t : transcripts_tmp) {
+            transcripts_.emplace_back(t.id, t.RefName.c_str(), t.RefLength);
+        }
+        transcripts_tmp.clear();
+        // --- done ---
+
+        /*
+        { // k-mer based
+        bfs::path locPath = indexDirectory / "fullLookup.kmap";
+        // Make sure we have the file we need to look up k-mer hits
+        if (!boost::filesystem::exists(locPath)) {
+            std::cerr << "could not find k-mer location index; expected at " << locPath << "\n";
+            std::cerr << "please ensure that you've run salmon index before attempting to run salmon quant\n";
+            std::exit(1);
+        }
+
+        std::cerr << "Loading k-mer location index from " << locPath << " . . .";
+
+        //std::vector<std::vector<uint64_t>> kmerLocMap;
+        std::vector<uint64_t> offsets;
+        std::vector<uint64_t> kmerLocs;
+
+        std::ifstream binstream(locPath.string(), std::ios::binary);
+        cereal::BinaryInputArchive archive(binstream);
+        //archive(kmerLocMap);
+        archive(offsets, kmerLocs);
+        binstream.close();
+        std::cerr << "done\n";
+        */
+        std::vector<std::thread> threads;
+        std::atomic<uint64_t> total_fwd(0);
+        std::atomic<uint64_t> total_bwd(0);
+        std::atomic<uint64_t> totalHits(0);
+        std::atomic<uint64_t> rn{0};
+        std::atomic<uint64_t> batchNum{0};
+
+        size_t numTranscripts{transcripts_.size()};
+        ClusterForest clusterForest(numTranscripts, transcripts_);
+
+        double logForgettingMass{salmon::math::LOG_1};
+        std::mutex ffmutex;
+        for(int i = 0; i < nbThreads; ++i)  {
+            std::cerr << "here\n";
+            /*
+            threads.push_back(std::thread(add_sizes, &parser, &total_fwd, &total_bwd,
+                        std::ref(totalHits), std::ref(rn),
+                        std::ref(phi),
+                        std::ref(transcripts_),
+                        std::ref(batchNum),
+                        std::ref(logForgettingMass),
+                        std::ref(ffmutex),
+                        std::ref(clusterForest),
+                        std::ref(offsets),
+                        std::ref(kmerLocs),
+                        sampleRate
+                        //std::ref(khash)
+                        //std::ref(kmerLocMap)
+                        ));
+            */
+            threads.push_back(std::thread(processReadsMEM, &parser, &total_fwd, &total_bwd,
+                        std::ref(totalHits), std::ref(rn),
+                        idx,
+                        std::ref(transcripts_),
+                        std::ref(batchNum),
+                        std::ref(logForgettingMass),
+                        std::ref(ffmutex),
+                        std::ref(clusterForest)
+                        ));
+        }
+
+        for(int i = 0; i < nbThreads; ++i)
+            threads[i].join();
+
+        std::cerr << "\n\n";
+        std::cerr << "processed " << rn << " total reads\n";
+        std::cout << "Total bases: " << total_fwd << " " << total_bwd << "\n";
+        std::cout << "Had a hit for " << totalHits  / static_cast<double>(rn) * 100.0 << "% of the reads\n";
+
+        size_t tnum{0};
+
+        std::cerr << "writing output \n";
+
+        bfs::path outputFile = outputDirectory / "quant.sf";
+        std::ofstream output(outputFile.string());
+        output << "# SDAFish v0.01\n";
+        output << "# ClusterID\tName\tLength\tFPKM\tNumReads\n";
+
+        const double logBillion = std::log(1000000000.0);
+        const double logNumFragments = std::log(static_cast<double>(rn));
+        auto clusters = clusterForest.getClusters();
+        size_t clusterID = 0;
+        for(auto cptr : clusters) {
+            double logClusterMass = cptr->logMass();
+            double logClusterCount = std::log(static_cast<double>(cptr->numHits()));
+
+            if (logClusterMass == salmon::math::LOG_0) {
+                std::cerr << "Warning: cluster " << clusterID << " has 0 mass!\n";
+            }
+
+            bool requiresProjection{false};
+
+            auto& members = cptr->members();
+            size_t clusterSize{0};
+            for (auto transcriptID : members) {
+                Transcript& t = transcripts_[transcriptID];
+                t.uniqueCounts = t.uniqueCount();
+                t.totalCounts = t.totalCount();
+            }
+
+            for (auto transcriptID : members) {
+                Transcript& t = transcripts_[transcriptID];
+                double logTranscriptMass = t.mass();
+                double logClusterFraction = logTranscriptMass - logClusterMass;
+                t.projectedCounts = std::exp(logClusterFraction + logClusterCount);
+                requiresProjection |= t.projectedCounts > static_cast<double>(t.totalCounts) or
+                    t.projectedCounts < static_cast<double>(t.uniqueCounts);
+                ++clusterSize;
+            }
+
+            if (clusterSize > 1 and requiresProjection) {
+                cptr->projectToPolytope(transcripts_);
+            }
+
+            // Now posterior has the transcript fraction
+            size_t tidx = 0;
+            for (auto transcriptID : members) {
+                auto& transcript = transcripts_[transcriptID];
+                double logLength = std::log(transcript.RefLength);
+                double fpkmFactor = std::exp(logBillion - logLength - logNumFragments);
+                double count = transcripts_[transcriptID].projectedCounts;
+                double countTotal = transcripts_[transcriptID].totalCounts;
+                double countUnique = transcripts_[transcriptID].uniqueCounts;
+                double fpkm = count > 0 ? fpkmFactor * count : 0.0;
+                output << clusterID << '\t' << transcript.RefName << '\t' <<
+                    transcript.RefLength << '\t' << fpkm << '\t' << countTotal << '\t' << countUnique << '\t' << count <<  '\t' << transcript.mass() << '\n';
+                ++tidx;
+            }
+
+            ++clusterID;
+        }
+        output.close();
+
+    } catch (po::error &e) {
+        std::cerr << "Exception : [" << e.what() << "]. Exiting.\n";
+        std::exit(1);
+    } catch (std::exception& e) {
+        std::cerr << "Exception : [" << e.what() << "]\n";
+        std::cerr << argv[0] << " quant was invoked improperly.\n";
+        std::cerr << "For usage information, try " << argv[0] << " quant --help\nExiting.\n";
+    }
+
+
+    return 0;
+}
+
+
diff --git a/src/SalmonQuantifyAlignments.cpp b/src/SalmonQuantifyAlignments.cpp
new file mode 100644
index 0000000..53cd26c
--- /dev/null
+++ b/src/SalmonQuantifyAlignments.cpp
@@ -0,0 +1,1264 @@
+
+extern "C" {
+#include "io_lib/scram.h"
+#include "io_lib/os.h"
+}
+
+// for cpp-format
+#include "format.h"
+
+// are these used?
+#include <boost/dynamic_bitset.hpp>
+#include <boost/lockfree/spsc_queue.hpp>
+#include <boost/lockfree/queue.hpp>
+
+#include <tbb/atomic.h>
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <atomic>
+#include <vector>
+#include <random>
+#include <memory>
+#include <exception>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <mutex>
+#include <thread>
+#include <memory>
+#include <condition_variable>
+
+#include <tbb/concurrent_queue.h>
+
+#include <boost/timer/timer.hpp>
+#include <boost/filesystem.hpp>
+#include <boost/math/special_functions/digamma.hpp>
+#include <boost/program_options.hpp>
+#include <boost/pending/disjoint_sets.hpp>
+#include <boost/math/distributions/normal.hpp>
+
+#include "ClusterForest.hpp"
+#include "AlignmentLibrary.hpp"
+#include "MiniBatchInfo.hpp"
+#include "BAMQueue.hpp"
+#include "SalmonMath.hpp"
+#include "FASTAParser.hpp"
+#include "LibraryFormat.hpp"
+#include "Transcript.hpp"
+#include "ReadPair.hpp"
+#include "ErrorModel.hpp"
+#include "AlignmentModel.hpp"
+#include "ForgettingMassCalculator.hpp"
+#include "FragmentLengthDistribution.hpp"
+#include "TranscriptCluster.hpp"
+#include "SalmonUtils.hpp"
+#include "SalmonConfig.hpp"
+#include "SalmonOpts.hpp"
+#include "NullFragmentFilter.hpp"
+#include "Sampler.hpp"
+#include "spdlog/spdlog.h"
+#include "EquivalenceClassBuilder.hpp"
+#include "CollapsedEMOptimizer.hpp"
+
+namespace bfs = boost::filesystem;
+using salmon::math::LOG_0;
+using salmon::math::LOG_1;
+using salmon::math::logAdd;
+using salmon::math::logSub;
+
+constexpr uint32_t miniBatchSize{250};
+
+template <typename FragT>
+using AlignmentBatch = std::vector<FragT>;
+
+template <typename FragT>
+using MiniBatchQueue = tbb::concurrent_queue<MiniBatchInfo<FragT>*>;
+
+using PriorAbundanceVector = std::vector<double>;
+using PosteriorAbundanceVector = std::vector<double>;
+
+struct RefSeq {
+    RefSeq(char* name, uint32_t len) : RefName(name), RefLength(len) {}
+    std::string RefName;
+    uint32_t RefLength;
+};
+
+/**
+ * Multiple each element in the vector `vec` by the factor `scale`.
+ */
+template <typename T>
+void scaleBy(std::vector<T>& vec, T scale) {
+    std::for_each(vec.begin(), vec.end(), [scale](T& ele)->void { ele *= scale; });
+}
+
+/*
+ * Tries _numTries_ times to get work from _workQueue_.  It returns
+ * true immediately if it was able to find work, and false otherwise.
+ */
+template <typename FragT>
+inline bool tryToGetWork(MiniBatchQueue<AlignmentGroup<FragT*>>& workQueue,
+                  MiniBatchInfo<AlignmentGroup<FragT*>>*& miniBatch,
+                  uint32_t numTries) {
+
+    uint32_t attempts{1};
+    bool foundWork = workQueue.try_pop(miniBatch);
+    while (!foundWork and attempts < numTries) {
+        foundWork = workQueue.try_pop(miniBatch);
+        ++attempts;
+    }
+    return foundWork;
+}
+
+
+
+template <typename FragT>
+void processMiniBatch(AlignmentLibrary<FragT>& alnLib,
+                      ForgettingMassCalculator& fmCalc,
+                      uint64_t firstTimestepOfRound,
+                      MiniBatchQueue<AlignmentGroup<FragT*>>& workQueue,
+                      MiniBatchQueue<AlignmentGroup<FragT*>>* processedCache,
+                      std::condition_variable& workAvailable,
+                      std::mutex& cvmutex,
+                      volatile bool& doneParsing,
+                      std::atomic<size_t>& activeBatches,
+                      const SalmonOpts& salmonOpts,
+                      bool& burnedIn,
+                      bool initialRound,
+                      std::atomic<size_t>& processedReads) {
+
+    // Seed with a real random value, if available
+    std::random_device rd;
+    auto& log = salmonOpts.jointLog;
+
+    // Whether or not we are using "banking"
+    bool useMassBanking = (!initialRound and salmonOpts.useMassBanking);
+    bool useReadCompat = salmonOpts.incompatPrior != salmon::math::LOG_0;
+
+    // Create a random uniform distribution
+    std::default_random_engine eng(rd());
+    std::uniform_real_distribution<> uni(0.0, 1.0 + std::numeric_limits<double>::min());
+
+    //EQClass
+    EquivalenceClassBuilder& eqBuilder = alnLib.equivalenceClassBuilder();
+
+    using salmon::math::LOG_0;
+    using salmon::math::logAdd;
+    using salmon::math::logSub;
+
+    auto& refs = alnLib.transcripts();
+    auto& clusterForest = alnLib.clusterForest();
+    auto& fragmentQueue = alnLib.fragmentQueue();
+    auto& alignmentGroupQueue = alnLib.alignmentGroupQueue();
+
+    std::vector<FragmentStartPositionDistribution>& fragStartDists =
+        alnLib.fragmentStartPositionDistributions();
+
+    auto& fragLengthDist = alnLib.fragmentLengthDistribution();
+    auto& alnMod = alnLib.alignmentModel();
+
+    bool useFSPD{!salmonOpts.noFragStartPosDist};
+    bool useFragLengthDist{!salmonOpts.noFragLengthDist};
+    bool noFragLenFactor{salmonOpts.noFragLenFactor};
+
+    double startingCumulativeMass = fmCalc.cumulativeLogMassAt(firstTimestepOfRound);
+    const auto expectedLibraryFormat = alnLib.format();
+    uint32_t numBurninFrags{salmonOpts.numBurninFrags};
+
+    bool useAuxParams = (processedReads > salmonOpts.numPreBurninFrags);
+
+    std::chrono::microseconds sleepTime(1);
+    MiniBatchInfo<AlignmentGroup<FragT*>>* miniBatch = nullptr;
+    bool updateCounts = initialRound;
+    size_t numTranscripts = refs.size();
+
+    while (!doneParsing or !workQueue.empty()) {
+        uint32_t zeroProbFrags{0};
+
+        // Try up to numTries times to get work from the queue before
+        // giving up and waiting on the condition variable
+    	constexpr uint32_t numTries = 100;
+        bool foundWork = tryToGetWork(workQueue, miniBatch, 100);
+
+        // If work wasn't immediately available, then wait for it using
+    	// a condition variable to avoid burning CPU cycles for no reason.
+        if (!foundWork) {
+            std::unique_lock<std::mutex> l(cvmutex);
+            workAvailable.wait(l, [&miniBatch, &workQueue, &doneParsing]() { return workQueue.try_pop(miniBatch) or doneParsing; });
+        }
+
+	    // If we actually got some work
+        if (miniBatch != nullptr) {
+
+            useAuxParams = (processedReads > salmonOpts.numPreBurninFrags);
+            ++activeBatches;
+            size_t batchReads{0};
+
+            // double logForgettingMass = fmCalc();
+            double logForgettingMass{0.0};
+            uint64_t currentMinibatchTimestep{0};
+            fmCalc.getLogMassAndTimestep(logForgettingMass, currentMinibatchTimestep);
+            miniBatch->logForgettingMass = logForgettingMass;
+
+            std::vector<AlignmentGroup<FragT*>*>& alignmentGroups = *(miniBatch->alignments);
+
+            using TranscriptID = size_t;
+            using HitIDVector = std::vector<size_t>;
+            using HitProbVector = std::vector<double>;
+
+            // BEGIN: DOUBLY-COLLAPSED TESTING
+            struct HitInfo {
+                uint32_t numHits = 0;
+                bool observed = false;
+                double newUniqueMass = LOG_0;
+            };
+
+            std::unordered_map<TranscriptID, HitInfo> hitInfo;
+            // We only need to fill this in if it's not the first round
+            if (useMassBanking) {
+                for (auto& alnGroup : alignmentGroups) {
+                    for (auto a : alnGroup->alignments()) {
+                        auto transcriptID = a->transcriptID();
+                        if (transcriptID < 0 or transcriptID >= refs.size()) {
+                            salmonOpts.jointLog->warn("Invalid Transcript ID [{}] encountered", transcriptID);
+                        }
+                        auto& info = hitInfo[transcriptID];
+                        auto& txp =refs[transcriptID];
+                        if(!info.observed) {
+                            info.observed = true;
+
+                            if (txp.uniqueCount() > 0) {
+                                /*
+                                double dormantInterval = static_cast<double>(currentMinibatchTimestep -
+                                        firstTimestepOfRound + 1);
+                                        */
+                                // The cumulative mass last time this was updated
+                                // double prevUpdateMass = startingCumulativeMass;
+
+                                double updateFraction = std::log(txp.uniqueUpdateFraction());
+                                auto lastUpdate = txp.lastTimestepUpdated();
+                                double newUniqueMass = 0.0;
+                                if (lastUpdate >= currentMinibatchTimestep) {
+                                    newUniqueMass = logForgettingMass + updateFraction;
+                                } else {
+                                    double dormantInterval = static_cast<double>(currentMinibatchTimestep) - lastUpdate;
+                                    double prevUpdateMass = fmCalc.cumulativeLogMassAt(lastUpdate - 1);
+                                    double currentUpdateMass = fmCalc.cumulativeLogMassAt(currentMinibatchTimestep);
+                                    newUniqueMass = salmon::math::logSub(currentUpdateMass, prevUpdateMass) +
+                                        updateFraction - std::log(dormantInterval);
+                                }
+                                info.newUniqueMass = newUniqueMass;
+
+                                // The new unique mass to be added to this transcript
+				/*
+                                double newUniqueMass =
+                                    salmon::math::logSub(currentUpdateMass, prevUpdateMass) +
+                                    updateFraction - std::log(dormantInterval);
+				*/
+                                /*
+				double newUniqueMass = logForgettingMass + updateFraction;
+                                info.newUniqueMass = newUniqueMass;
+                                */
+                            }
+                        }
+                        info.numHits++;
+                    } // end alignments in group
+                } // end batch hits
+            } // end initial round
+            // END: DOUBLY-COLLAPSED TESTING
+
+
+            {
+                // The cumulative forgetting mass up through and including the current timestep.
+                double currentCumulativeMass = fmCalc.cumulativeLogMassAt(currentMinibatchTimestep);
+
+                // Iterate over each group of alignments (a group consists of all alignments reported
+                // for a single read).  Distribute the read's mass proportionally dependent on the
+                // current
+                for (auto alnGroup : alignmentGroups) {
+
+                    // EQCLASS
+                    std::vector<uint32_t> txpIDs;
+                    std::vector<double> auxProbs;
+                    size_t txpIDsHash{0};
+                    double auxDenom = salmon::math::LOG_0;
+                    alnGroup->sortHits();
+
+                    double sumOfAlignProbs{LOG_0};
+                    // update the cluster-level properties
+                    bool transcriptUnique{true};
+                    auto firstTranscriptID = alnGroup->alignments().front()->transcriptID();
+                    std::unordered_set<size_t> observedTranscripts;
+                    for (auto& aln : alnGroup->alignments()) {
+                        auto transcriptID = aln->transcriptID();
+                        auto& transcript = refs[transcriptID];
+                        transcriptUnique = transcriptUnique and (transcriptID == firstTranscriptID);
+
+                        double refLength = transcript.RefLength > 0 ? transcript.RefLength : 1.0;
+
+                        double logFragProb = salmon::math::LOG_1;
+
+                        if (!salmonOpts.noFragLengthDist and useAuxParams) {
+                            if(aln->fragLen() == 0) {
+                                if (aln->isLeft() and transcript.RefLength - aln->left() < fragLengthDist.maxVal()) {
+                                    logFragProb = fragLengthDist.cmf(transcript.RefLength - aln->left());
+                                } else if (aln->isRight() and aln->right() < fragLengthDist.maxVal()) {
+                                    logFragProb = fragLengthDist.cmf(aln->right());
+                                }
+                            } else {
+                                logFragProb = fragLengthDist.pmf(static_cast<size_t>(aln->fragLen()));
+                            }
+                        }
+
+                        // TESTING
+                        if (noFragLenFactor) { logFragProb = LOG_1; }
+
+                        // @TODO: handle this case better
+                        //double fragProb = cdf(fragLengthDist, fragLength + 0.5) - cdf(fragLengthDist, fragLength - 0.5);
+                        //fragProb = std::max(fragProb, 1e-3);
+                        //fragProb /= cdf(fragLengthDist, refLength);
+
+                        // The alignment probability is the product of a transcript-level term (based on abundance and) an alignment-level
+                        // term below which is P(Q_1) * P(Q_2) * P(F | T)
+                        double logRefLength = std::log(refLength);
+
+                        // The probability that the fragments align to the given strands in the
+                        // given orientations.
+                        double logAlignCompatProb = (useReadCompat) ?
+                                (salmon::utils::logAlignFormatProb(aln->libFormat(), expectedLibraryFormat, salmonOpts.incompatPrior)) :
+                                LOG_1;
+
+                        // Adjustment to the likelihood due to the
+                        // error model
+                        double errLike = salmon::math::LOG_1;
+                        //if (burnedIn and salmonOpts.useErrorModel) {
+                        if (useAuxParams and salmonOpts.useErrorModel) {
+                            errLike = alnMod.logLikelihood(*aln, transcript);
+                        }
+
+			// Allow for a non-uniform fragment start position distribution
+                        double startPosProb = -logRefLength;
+                        auto hitPos = aln->left();
+			if (useFSPD and burnedIn and hitPos < refLength) {
+			  auto& fragStartDist =
+				  fragStartDists[transcript.lengthClassIndex()];
+			  startPosProb = fragStartDist(hitPos, refLength, logRefLength);
+			}
+
+			// Pre FSPD
+			/*
+                        double auxProb = -logRefLength + logFragProb +
+                                          aln->logQualProb() +
+                                          errLike + logAlignCompatProb;
+		        */
+            double auxProb = startPosProb + logFragProb +
+                             aln->logQualProb() +
+                             errLike + logAlignCompatProb;
+
+
+
+                        // The overall mass of this transcript, which is used to
+                        // account for this transcript's relaive abundance
+                        double transcriptLogCount = transcript.mass(initialRound);
+
+                        // BEGIN: DOUBLY-COLLAPSED TESTING
+                        // If this is not the initial round, then add the
+                        // appropriate proportion of unique read mass for
+                        // every ambiguous alignment we encounter.  Here,
+                        // we're not assigning the extra mass yet, but just
+                        // adding it to the transcriptLogCount.
+                        if (useMassBanking and transcript.uniqueCount() > 0) {
+                            auto txpHitInfo = hitInfo[transcriptID];
+                            transcriptLogCount = salmon::math::logAdd(transcriptLogCount, txpHitInfo.newUniqueMass);
+                        }
+                        // END: DOUBLY-COLLAPSED TESTING
+
+                        if ( transcriptLogCount != LOG_0 and
+                             auxProb != LOG_0) {
+                           aln->logProb = transcriptLogCount + auxProb;
+
+                            sumOfAlignProbs = logAdd(sumOfAlignProbs, aln->logProb);
+                            if (updateCounts and
+                                observedTranscripts.find(transcriptID) == observedTranscripts.end()) {
+                                refs[transcriptID].addTotalCount(1);
+                                observedTranscripts.insert(transcriptID);
+                            }
+                            // EQCLASS
+                            txpIDs.push_back(transcriptID);
+                            auxProbs.push_back(auxProb);
+                            auxDenom = salmon::math::logAdd(auxDenom, auxProb);
+                            boost::hash_combine(txpIDsHash, transcriptID);
+
+                        } else {
+                            aln->logProb = LOG_0;
+                        }
+                    }
+
+                    // If we have a 0-probability fragment
+                    if (sumOfAlignProbs == LOG_0) {
+                        auto aln = alnGroup->alignments().front();
+                        log->warn("0 probability fragment [{}] "
+                                  "encountered \n", aln->getName());
+                        continue;
+                    }
+
+                    // EQCLASS
+                    TranscriptGroup tg(txpIDs, txpIDsHash);
+                    double auxProbSum{0.0};
+                    for (auto& p : auxProbs) {
+                        p = std::exp(p - auxDenom);
+                        auxProbSum += p;
+                    }
+                    if (std::abs(auxProbSum - 1.0) > 0.01) {
+                        std::cerr << "weights had sum of " << auxProbSum
+                                  << " but it should be 1!!\n\n";
+                    }
+                    eqBuilder.addGroup(std::move(tg), auxProbs);
+
+
+                    // Normalize the scores
+                    for (auto& aln : alnGroup->alignments()) {
+                        if (aln->logProb == LOG_0) { continue; }
+                        aln->logProb -= sumOfAlignProbs;
+
+                        auto transcriptID = aln->transcriptID();
+                        auto& transcript = refs[transcriptID];
+
+                        double newMass = logForgettingMass + aln->logProb;
+                        if (useMassBanking and transcript.uniqueCount() > 0) {
+                            newMass = salmon::math::logAdd(newMass, hitInfo[transcriptID].newUniqueMass);
+                        }
+                        transcript.addMass(newMass);
+                        transcript.setLastTimestepUpdated(currentMinibatchTimestep);
+
+                        double r = uni(eng);
+                        if (!burnedIn and r < std::exp(aln->logProb)) {
+			    // Update the error model
+                            if (salmonOpts.useErrorModel) {
+                                alnMod.update(*aln, transcript, LOG_1, logForgettingMass);
+                            }
+			    // Update the fragment length distribution
+                            if (aln->isPaired() and !salmonOpts.noFragLengthDist) {
+                                double fragLength = aln->fragLen();
+                                fragLengthDist.addVal(fragLength, logForgettingMass);
+                            }
+			    // Update the fragment start position distribution
+			    if (useFSPD) {
+				    auto hitPos = aln->left();
+				    auto& fragStartDist =
+					    fragStartDists[transcript.lengthClassIndex()];
+				    fragStartDist.addVal(hitPos,
+						    transcript.RefLength,
+						    logForgettingMass);
+			    }
+                        }
+                    }
+
+                    // update the single target transcript
+                    if (transcriptUnique) {
+                        if (updateCounts) {
+                            refs[firstTranscriptID].addUniqueCount(1);
+                        }
+                        clusterForest.updateCluster(firstTranscriptID, 1,
+                                                    logForgettingMass, updateCounts);
+                    } else { // or the appropriate clusters
+                        // ughh . . . C++ still has some very rough edges
+                        clusterForest.template mergeClusters<FragT>(alnGroup->alignments().begin(),
+                                                           alnGroup->alignments().end());
+                        clusterForest.updateCluster(alnGroup->alignments().front()->transcriptID(),
+                                                    1, logForgettingMass, updateCounts);
+                    }
+
+                    ++batchReads;
+                } // end read group
+            }// end timer
+
+            double individualTotal = LOG_0;
+            {
+                /*
+                // M-step
+                for (auto kv = hitList.begin(); kv != hitList.end(); ++kv) {
+                    auto transcriptID = kv->first;
+                    // The target must be a valid transcript
+                    if (transcriptID >= numTranscripts or transcriptID < 0) {std::cerr << "index " << transcriptID << " out of bounds\n"; }
+
+                    auto& transcript = refs[transcriptID];
+
+                    // The prior probability
+                    double hitMass{LOG_0};
+
+                    // The set of alignments that match transcriptID
+                    auto& hits = kv->second;
+                    std::for_each(hits.begin(), hits.end(), [&](FragT* aln) -> void {
+                            if (!std::isfinite(aln->logProb)) { log->warn("hitMass = {}\n", aln->logProb); }
+                            hitMass = logAdd(hitMass, aln->logProb);
+                    });
+
+                    // Lock the target
+                    if (hitMass == LOG_0) {
+                        log->warn("\n\n\n\nA set of *valid* alignments for a read appeared to "
+                                  "have 0 probability.  This should not happen.  Please report "
+                                  "this bug.  exiting!");
+
+                        std::cerr << "\n\n\n\nA set of *valid* alignments for a read appeared to "
+                                  << "have 0 probability.  This should not happen.  Please report "
+                                  << "this bug.  exiting!";
+                        std::exit(1);
+                    }
+
+                    double updateMass = logForgettingMass + hitMass;
+                    individualTotal = logAdd(individualTotal, updateMass);
+                    transcript.addMass(updateMass);
+
+                   // unlock the target
+                } // end for
+                */
+            } // end timer
+
+            // If we're not keeping around a cache, then
+            // reclaim the memory for these fragments and alignments
+            // and delete the mini batch.
+            if (processedCache == nullptr) {
+                miniBatch->release(fragmentQueue, alignmentGroupQueue);
+                delete miniBatch;
+            } else {
+            // Otherwise, just put the mini-batch on the processed queue
+            // to be re-used in the next round.
+                processedCache->push(miniBatch);
+            }
+            --activeBatches;
+            processedReads += batchReads;
+            if (processedReads >= numBurninFrags and !burnedIn) {
+                burnedIn = true;
+                fragLengthDist.cacheCMF();
+            }
+        }
+        miniBatch = nullptr;
+    } // nothing left to process
+}
+
+/**
+  *  Quantify the targets given in the file `transcriptFile` using the
+  *  alignments given in the file `alignmentFile`, and write the results
+  *  to the file `outputFile`.  The reads are assumed to be in the format
+  *  specified by `libFmt`.
+  *
+  */
+template <typename FragT>
+bool quantifyLibrary(
+        AlignmentLibrary<FragT>& alnLib,
+        size_t numRequiredFragments,
+        const SalmonOpts& salmonOpts) {
+
+    bool burnedIn{false};
+
+    auto& refs = alnLib.transcripts();
+    size_t numTranscripts = refs.size();
+    size_t numObservedFragments{0};
+
+    auto& fileLog = salmonOpts.fileLog;
+    bool useMassBanking = salmonOpts.useMassBanking;
+
+    MiniBatchQueue<AlignmentGroup<FragT*>> workQueue;
+    MiniBatchQueue<AlignmentGroup<FragT*>>* workQueuePtr{&workQueue};
+    MiniBatchQueue<AlignmentGroup<FragT*>> processedCache;
+    MiniBatchQueue<AlignmentGroup<FragT*>>* processedCachePtr{nullptr};
+
+    ForgettingMassCalculator fmCalc(salmonOpts.forgettingFactor);
+
+    // Up-front, prefill the forgetting mass schedule for up to
+    // 1 billion reads
+    size_t prefillSize = (1000000000) / miniBatchSize;
+    fmCalc.prefill(prefillSize);
+
+    size_t batchNum{0};
+    std::atomic<size_t> totalProcessedReads{0};
+    bool initialRound{true};
+    bool haveCache{false};
+    bool doReset{true};
+    size_t maxCacheSize{salmonOpts.mappingCacheMemoryLimit};
+
+    NullFragmentFilter<FragT>* nff = nullptr;
+
+    // Give ourselves some space
+    fmt::print(stderr, "\n\n\n\n");
+
+    // EQCLASS
+    bool terminate{false};
+
+    while (numObservedFragments < numRequiredFragments and !terminate) {
+        if (!initialRound) {
+
+    	    size_t numToCache = (useMassBanking) ?
+				(alnLib.numMappedReads() - alnLib.numUniquelyMappedReads()) :
+				(alnLib.numMappedReads());
+
+            if (haveCache) {
+                std::swap(workQueuePtr, processedCachePtr);
+                doReset = false;
+                fmt::print(stderr, "\n\n");
+            } else if (numToCache <= maxCacheSize) {
+                processedCachePtr = &processedCache;
+                doReset = true;
+                fmt::print(stderr, "\n");
+            }
+
+            if (doReset and
+        		!alnLib.reset(
+	        		true, 		/* increment # of passes */
+		        	nff, 		/* fragment filter */
+        			useMassBanking  /* only process ambiguously mapped fragments*/
+	        	)) {
+                fmt::print(stderr,
+                  "\n\n======== WARNING ========\n"
+                  "A provided alignment file "
+                  "is not a regular file and therefore can't be read from "
+                  "more than once.\n\n"
+                  "We observed only {} fragments when we wanted at least {}.\n\n"
+                  "Please consider re-running Salmon with these alignments "
+                  "as a regular file!\n"
+                  "==========================\n\n",
+                  numObservedFragments, numRequiredFragments);
+                break;
+            }
+        }
+
+        volatile bool doneParsing{false};
+        std::condition_variable workAvailable;
+        std::mutex cvmutex;
+        std::vector<std::thread> workers;
+        std::atomic<size_t> activeBatches{0};
+        auto currentQuantThreads = (haveCache) ?
+                                   salmonOpts.numQuantThreads + salmonOpts.numParseThreads :
+                                   salmonOpts.numQuantThreads;
+
+        uint64_t firstTimestepOfRound = fmCalc.getCurrentTimestep();
+        if (firstTimestepOfRound > 1) {
+            firstTimestepOfRound -= 1;
+        }
+
+        for (uint32_t i = 0; i < currentQuantThreads; ++i) {
+            workers.emplace_back(processMiniBatch<FragT>,
+                    std::ref(alnLib),
+                    std::ref(fmCalc),
+                    firstTimestepOfRound,
+                    std::ref(*workQueuePtr),
+                    processedCachePtr,
+                    std::ref(workAvailable), std::ref(cvmutex),
+                    std::ref(doneParsing), std::ref(activeBatches),
+                    std::ref(salmonOpts),
+                    std::ref(burnedIn),
+                    initialRound,
+                    std::ref(totalProcessedReads));
+        }
+
+        if (!haveCache) {
+            size_t numProc{0};
+
+            BAMQueue<FragT>& bq = alnLib.getAlignmentGroupQueue();
+            std::vector<AlignmentGroup<FragT*>*>* alignments = new std::vector<AlignmentGroup<FragT*>*>;
+            alignments->reserve(miniBatchSize);
+            AlignmentGroup<FragT*>* ag;
+
+            bool alignmentGroupsRemain = bq.getAlignmentGroup(ag);
+            while (alignmentGroupsRemain or alignments->size() > 0) {
+                if (alignmentGroupsRemain) { alignments->push_back(ag); }
+                // If this minibatch has reached the size limit, or we have nothing
+                // left to fill it up with
+                if (alignments->size() >= miniBatchSize or !alignmentGroupsRemain) {
+                    ++batchNum;
+                    double logForgettingMass = 0.0;
+                    MiniBatchInfo<AlignmentGroup<FragT*>>* mbi =
+                        new MiniBatchInfo<AlignmentGroup<FragT*>>(batchNum, alignments, logForgettingMass);
+                    workQueuePtr->push(mbi);
+                    {
+                        std::unique_lock<std::mutex> l(cvmutex);
+                        workAvailable.notify_one();
+                    }
+                    alignments = new std::vector<AlignmentGroup<FragT*>*>;
+                    alignments->reserve(miniBatchSize);
+                }
+
+                if ((numProc % 1000000 == 0) or !alignmentGroupsRemain) {
+                    fmt::print(stderr, "\r\r{}processed{} {} {}reads in current round{}",
+                            ioutils::SET_GREEN, ioutils::SET_RED, numProc,
+                            ioutils::SET_GREEN, ioutils::RESET_COLOR);
+                    fileLog->info("quantification processed {} fragments so far\n",
+                                   numProc);
+                }
+
+                ++numProc;
+                alignmentGroupsRemain = bq.getAlignmentGroup(ag);
+            }
+            fmt::print(stderr, "\n");
+
+            // Free the alignments and the vector holding them
+            if (processedCachePtr == nullptr) {
+                for (auto& alnGroup : *alignments) {
+                    alnGroup->alignments().clear();
+                    delete alnGroup; alnGroup = nullptr;
+                }
+                delete alignments;
+            }
+        } else {
+            fmt::print(stderr, "\n");
+        }
+
+        doneParsing = true;
+
+        /**
+          * This could be a problem for small sets of alignments --- make sure the
+          * work queue is empty!!
+          * --- Thanks for finding a dataset that exposes this bug, Richard (Smith-Unna)!
+          */
+        size_t t = 0;
+        while (!workQueuePtr->empty()) {
+            std::unique_lock<std::mutex> l(cvmutex);
+            workAvailable.notify_one();
+        }
+
+        size_t tnum{0};
+        for (auto& t : workers) {
+            fmt::print(stderr, "\r\rkilling thread {} . . . ", tnum++);
+            {
+                std::unique_lock<std::mutex> l(cvmutex);
+                workAvailable.notify_all();
+            }
+            t.join();
+            fmt::print(stderr, "done");
+        }
+        fmt::print(stderr, "\n\n");
+
+        numObservedFragments += alnLib.numMappedReads();
+
+        fmt::print(stderr, "# observed = {} / # required = {}\033[A\033[A\033[A\033[A\033[A",
+                   numObservedFragments, numRequiredFragments);
+
+        if (initialRound) {
+            salmonOpts.jointLog->info("\n\n\nCompleted first pass through the alignment file.\n"
+                                      "Total # of mapped reads : {}\n"
+                                      "# of uniquely mapped reads : {}\n"
+                                      "# ambiguously mapped reads : {}\n\n\n",
+                                      alnLib.numMappedReads(),
+                                      alnLib.numUniquelyMappedReads(),
+                                      alnLib.numMappedReads() -
+                                      alnLib.numUniquelyMappedReads());
+        }
+
+        initialRound = false;
+
+        // If we're done our second pass and we've decided to use
+        // the in-memory cache, then activate it now.
+        if (!initialRound and processedCachePtr != nullptr) {
+            haveCache = true;
+        }
+        //EQCLASS
+        bool done = alnLib.equivalenceClassBuilder().finish();
+        // skip the extra online rounds
+        terminate = true;
+        // END EQCLASS
+    }
+
+    fmt::print(stderr, "\n\n\n\n");
+
+    // In this case, we have to give the structures held
+    // in the cache back to the appropriate queues
+    if (haveCache) {
+        auto& fragmentQueue = alnLib.fragmentQueue();
+        auto& alignmentGroupQueue = alnLib.alignmentGroupQueue();
+
+        MiniBatchInfo<AlignmentGroup<FragT*>>* mbi = nullptr;
+        while (!processedCache.empty()) {
+            while (processedCache.try_pop(mbi)) {
+                mbi->release(fragmentQueue, alignmentGroupQueue);
+                delete mbi;
+            }
+        }
+    }
+
+    return burnedIn;
+}
+
+int computeBiasFeatures(
+    std::vector<std::string>& transcriptFiles,
+    boost::filesystem::path outFilePath,
+    bool useStreamingParser,
+    size_t numThreads);
+
+int performBiasCorrectionSalmon(
+        boost::filesystem::path featureFile,
+        boost::filesystem::path expressionFile,
+        boost::filesystem::path outputFile,
+        size_t numThreads);
+
+int salmonAlignmentQuantify(int argc, char* argv[]) {
+    using std::cerr;
+    using std::vector;
+    using std::string;
+    namespace po = boost::program_options;
+    namespace bfs = boost::filesystem;
+
+    SalmonOpts sopt;
+
+    bool sampleOutput{false};
+    bool sampleUnaligned{false};
+    bool biasCorrect{false};
+    uint32_t numThreads{4};
+    size_t requiredObservations{50000000};
+
+    po::options_description basic("\nbasic options");
+    basic.add_options()
+    ("version,v", "print version string.")
+    ("help,h", "produce help message.")
+    ("libType,l", po::value<std::string>()->required(), "Format string describing the library type.")
+    ("alignments,a", po::value<vector<string>>()->multitoken()->required(), "input alignment (BAM) file(s).")
+    ("targets,t", po::value<std::string>()->required(), "FASTA format file containing target transcripts.")
+    ("threads,p", po::value<uint32_t>(&numThreads)->default_value(6), "The number of threads to use concurrently. "
+                                            "The alignment-based quantification mode of salmon is usually I/O bound "
+                                            "so until there is a faster multi-threaded SAM/BAM parser to feed the "
+                                            "quantification threads, one should not expect much of a speed-up beyond "
+                                            "~6 threads.")
+    ("incompatPrior", po::value<double>(&(sopt.incompatPrior))->default_value(1e-20), "This option "
+                        "sets the prior probability that an alignment that disagrees with the specified "
+                        "library type (--libType) results from the true fragment origin.  Setting this to 0 "
+                        "specifies that alignments that disagree with the library type should be \"impossible\", "
+                        "while setting it to 1 says that alignments that disagree with the library type are no "
+                        "less likely than those that do")
+    ("useErrorModel", po::bool_switch(&(sopt.useErrorModel))->default_value(false), "[Currently Experimental] : "
+                        "Learn and apply an error model for the aligned reads.  This takes into account the "
+                        "the observed frequency of different types of mismatches when computing the likelihood of "
+                        "a given alignment.")
+    ("output,o", po::value<std::string>()->required(), "Output quantification directory.")
+    ("biasCorrect", po::value(&biasCorrect)->zero_tokens(), "[Experimental]: Output both bias-corrected and non-bias-corrected ")
+    ("numRequiredObs,n", po::value(&requiredObservations)->default_value(50000000),
+                                        "[Deprecated]: The minimum number of observations (mapped reads) that must be observed before "
+                                        "the inference procedure will terminate.  If fewer mapped reads exist in the "
+                                        "input file, then it will be read through multiple times.")
+    ("geneMap,g", po::value<std::string>(), "File containing a mapping of transcripts to genes.  If this file is provided "
+                                        "Salmon will output both quant.sf and quant.genes.sf files, where the latter "
+                                        "contains aggregated gene-level abundance estimates.  The transcript to gene mapping "
+                                        "should be provided as either a GTF file, or a in a simple tab-delimited format "
+                                        "where each line contains the name of a transcript and the gene to which it belongs "
+                                        "separated by a tab.  The extension of the file is used to determine how the file "
+                                        "should be parsed.  Files ending in \'.gtf\' or \'.gff\' are assumed to be in GTF "
+                                        "format; files with any other extension are assumed to be in the simple format.");
+
+    // no sequence bias for now
+    sopt.noSeqBiasModel = true;
+    sopt.noRichEqClasses = false;
+
+    po::options_description advanced("\nadvanced options");
+    advanced.add_options()
+    ("fldMax" , po::value<size_t>(&(sopt.fragLenDistMax))->default_value(800), "The maximum fragment length to consider when building the empirical distribution")
+    ("fldMean", po::value<size_t>(&(sopt.fragLenDistPriorMean))->default_value(200), "The mean used in the fragment length distribution prior")
+    ("fldSD" , po::value<size_t>(&(sopt.fragLenDistPriorSD))->default_value(80), "The standard deviation used in the fragment length distribution prior")
+    ("forgettingFactor,f", po::value<double>(&(sopt.forgettingFactor))->default_value(0.65), "The forgetting factor used "
+                        "in the online learning schedule.  A smaller value results in quicker learning, but higher variance "
+                        "and may be unstable.  A larger value results in slower learning but may be more stable.  Value should "
+                        "be in the interval (0.5, 1.0].")
+    ("mappingCacheMemoryLimit", po::value<uint32_t>(&(sopt.mappingCacheMemoryLimit))->default_value(2000000), "If the file contained fewer than this "
+                                        "many mapped reads, then just keep the data in memory for subsequent rounds of inference. Obviously, this value should "
+                                        "not be too large if you wish to keep a low memory usage, but setting it large enough to accommodate all of the mapped "
+                                        "read can substantially speed up inference on \"small\" files that contain only a few million reads.")
+    ("maxReadOcc,w", po::value<uint32_t>(&(sopt.maxReadOccs))->default_value(200), "Reads \"mapping\" to more than this many places won't be considered.")
+    ("noEffectiveLengthCorrection", po::bool_switch(&(sopt.noEffectiveLengthCorrection))->default_value(false), "Disables "
+                        "effective length correction when computing the probability that a fragment was generated "
+                        "from a transcript.  If this flag is passed in, the fragment length distribution is not taken "
+                        "into account when computing this probability.")
+    ("noFragLengthDist", po::bool_switch(&(sopt.noFragLengthDist))->default_value(false), "[Currently Experimental] : "
+                        "Don't consider concordance with the learned fragment length distribution when trying to determine "
+                        "the probability that a fragment has originated from a specified location.  Normally, Fragments with "
+                         "unlikely lengths will be assigned a smaller relative probability than those with more likely "
+                        "lengths.  When this flag is passed in, the observed fragment length has no effect on that fragment's "
+                        "a priori probability.")
+    ("noFragStartPosDist", po::bool_switch(&(sopt.noFragStartPosDist))->default_value(false), "[Currently Experimental] : "
+                        "Don't consider / model non-uniformity in the fragment start positions "
+                        "across the transcript.")
+    /*
+    // Don't expose this yet
+    ("noRichEqClasses", po::bool_switch(&(sopt.noRichEqClasses))->default_value(false),
+                        "Disable \"rich\" equivalent classes.  If this flag is passed, then "
+                        "all information about the relative weights for each transcript in the "
+                        "label of an equivalence class will be ignored, and only the relative "
+                        "abundance and effective length of each transcript will be considered.")
+                        */
+    ("numErrorBins", po::value<uint32_t>(&(sopt.numErrorBins))->default_value(6), "The number of bins into which to divide "
+                        "each read when learning and applying the error model.  For example, a value of 10 would mean that "
+                        "effectively, a separate error model is leared and applied to each 10th of the read, while a value of "
+                        "3 would mean that a separate error model is applied to the read beginning (first third), middle (second third) "
+                        "and end (final third).")
+    ("numPreAuxModelSamples", po::value<uint32_t>(&(sopt.numPreBurninFrags))->default_value(1000000), "The first <numPreAuxModelSamples> will have their "
+     			"assignment likelihoods and contributions to the transcript abundances computed without applying any auxiliary models.  The purpose "
+			"of ignoring the auxiliary models for the first <numPreAuxModelSamples> observations is to avoid applying these models before thier "
+			"parameters have been learned sufficiently well.")
+    ("numAuxModelSamples", po::value<uint32_t>(&(sopt.numBurninFrags))->default_value(5000000), "The first <numAuxModelSamples> are used to train the "
+     			"auxiliary model parameters (e.g. fragment length distribution, bias, etc.).  After ther first <numAuxModelSamples> observations "
+			"the auxiliary model parameters will be assumed to have converged and will be fixed.")
+    ("sampleOut,s", po::bool_switch(&sampleOutput)->default_value(false), "Write a \"postSample.bam\" file in the output directory "
+                        "that will sample the input alignments according to the estimated transcript abundances. If you're "
+                        "going to perform downstream analysis of the alignments with tools which don't, themselves, take "
+                        "fragment assignment ambiguity into account, you should use this output.")
+    ("sampleUnaligned,u", po::bool_switch(&sampleUnaligned)->default_value(false), "In addition to sampling the aligned reads, also write "
+                        "the un-aligned reads to \"posSample.bam\".")
+    ("useMassBanking", po::bool_switch(&(sopt.useMassBanking))->default_value(false), "[Deprecated] : "
+                        "Use mass \"banking\" in subsequent epoch of inference.  Rather than re-observing uniquely "
+                        "mapped reads, simply remember the ratio of uniquely to ambiguously mapped reads for each "
+                        "transcript and distribute the unique mass uniformly throughout the epoch.")
+    ("useVBOpt,v", po::bool_switch(&(sopt.useVBOpt))->default_value(false), "Use the Variational Bayesian EM rather than the "
+     			"traditional EM algorithm for optimization in the batch passes.");
+
+
+    po::options_description testing("\n"
+            "testing options");
+    testing.add_options()
+        ("noRichEqClasses", po::bool_switch(&(sopt.noRichEqClasses))->default_value(false),
+                        "[TESTING OPTION]: Disable \"rich\" equivalent classes.  If this flag is passed, then "
+                        "all information about the relative weights for each transcript in the "
+                        "label of an equivalence class will be ignored, and only the relative "
+                        "abundance and effective length of each transcript will be considered.")
+        ("noFragLenFactor", po::bool_switch(&(sopt.noFragLenFactor))->default_value(false),
+                        "[TESTING OPTION]: Disable the factor in the likelihood that takes into account the "
+                        "goodness-of-fit of an alignment with the empirical fragment length "
+                        "distribution");
+
+
+    po::options_description all("salmon quant options");
+    all.add(basic).add(advanced).add(testing);
+
+    po::options_description visible("salmon quant options");
+    visible.add(basic).add(advanced);
+
+    po::variables_map vm;
+    try {
+        auto orderedOptions = po::command_line_parser(argc,argv).
+            options(all).run();
+
+        po::store(orderedOptions, vm);
+
+        if (vm.count("help")) {
+            std::cout << "Salmon quant (alignment-based)\n";
+            std::cout << visible << std::endl;
+            std::exit(0);
+        }
+        po::notify(vm);
+
+        if (numThreads < 2) {
+            fmt::print(stderr, "salmon requires at least 2 threads --- "
+                               "setting # of threads = 2\n");
+            numThreads = 2;
+        }
+        sopt.numThreads = numThreads;
+
+        if (sopt.forgettingFactor <= 0.5 or
+            sopt.forgettingFactor > 1.0) {
+            fmt::print(stderr, "The forgetting factor must be in (0.5, 1.0], "
+                               "but the value {} was provided\n", sopt.forgettingFactor);
+            std::exit(1);
+        }
+
+        std::stringstream commentStream;
+        commentStream << "# salmon (alignment-based) v" << salmon::version << "\n";
+        commentStream << "# [ program ] => salmon \n";
+        commentStream << "# [ command ] => quant \n";
+        for (auto& opt : orderedOptions.options) {
+            commentStream << "# [ " << opt.string_key << " ] => {";
+            for (auto& val : opt.value) {
+                commentStream << " " << val;
+            }
+            commentStream << " }\n";
+        }
+        std::string commentString = commentStream.str();
+        fmt::print(stderr, "{}", commentString);
+
+        // Verify the geneMap before we start doing any real work.
+        bfs::path geneMapPath;
+        if (vm.count("geneMap")) {
+            // Make sure the provided file exists
+            geneMapPath = vm["geneMap"].as<std::string>();
+            if (!bfs::exists(geneMapPath)) {
+                fmt::print(stderr, "Could not find transcript <=> gene map file {} \n"
+                           "Exiting now; please either omit the \'geneMap\' option or "
+                           "provide a valid file\n", geneMapPath);
+                std::exit(1);
+            }
+        }
+
+        vector<string> alignmentFileNames = vm["alignments"].as<vector<string>>();
+        vector<bfs::path> alignmentFiles;
+        for (auto& alignmentFileName : alignmentFileNames) {
+            bfs::path alignmentFile(alignmentFileName);//vm["alignments"].as<std::string>());
+            if (!bfs::exists(alignmentFile)) {
+                std::stringstream ss;
+                ss << "The provided alignment file: " << alignmentFile <<
+                    " does not exist!\n";
+                throw std::invalid_argument(ss.str());
+            } else {
+                alignmentFiles.push_back(alignmentFile);
+            }
+        }
+
+        std::string libFmtStr = vm["libType"].as<std::string>();
+        LibraryFormat libFmt = salmon::utils::parseLibraryFormatStringNew(libFmtStr);
+        if (libFmt.check()) {
+            std::cerr << libFmt << "\n";
+        } else {
+            std::stringstream ss;
+            ss << libFmt << " is invalid!";
+            throw std::invalid_argument(ss.str());
+        }
+
+        bfs::path outputDirectory(vm["output"].as<std::string>());
+        // If the path exists
+        if (bfs::exists(outputDirectory)) {
+            // If it is not a directory, then complain
+            if (!bfs::is_directory(outputDirectory)) {
+                std::stringstream errstr;
+                errstr << "Path [" << outputDirectory << "] already exists "
+                       << "and is not a directory.\n"
+                       << "Please either remove this file or choose another "
+                       << "output path.\n";
+                throw std::invalid_argument(errstr.str());
+            }
+        } else { // If the path doesn't exist, then create it
+            if (!bfs::create_directories(outputDirectory)) { // creation failed for some reason
+                std::stringstream errstr;
+                errstr << "Could not create output directory ["
+                       << outputDirectory << "], please check that it is valid.";
+                throw std::invalid_argument(errstr.str());
+            }
+        }
+
+        bfs::path logDirectory = outputDirectory / "logs";
+
+        // Create the logger and the logging directory
+        bfs::create_directory(logDirectory);
+        if (!(bfs::exists(logDirectory) and bfs::is_directory(logDirectory))) {
+            std::cerr << "Couldn't create log directory " << logDirectory << "\n";
+            std::cerr << "exiting\n";
+            std::exit(1);
+        }
+        std::cerr << "Logs will be written to " << logDirectory.string() << "\n";
+
+        bfs::path logPath = logDirectory / "salmon.log";
+        size_t max_q_size = 2097152;
+        spdlog::set_async_mode(max_q_size);
+
+        auto fileSink = std::make_shared<spdlog::sinks::simple_file_sink_mt>(logPath.string(), true);
+        auto consoleSink = std::make_shared<spdlog::sinks::stderr_sink_mt>();
+        auto consoleLog = spdlog::create("consoleLog", {consoleSink});
+        auto fileLog = spdlog::create("fileLog", {fileSink});
+        auto jointLog = spdlog::create("jointLog", {fileSink, consoleSink});
+
+        sopt.jointLog = jointLog;
+        sopt.fileLog = fileLog;
+
+        if (!sampleOutput and sampleUnaligned) {
+            fmt::MemoryWriter wstr;
+            wstr << "WARNING: you passed in the (-u/--sampleUnaligned) flag, but did not request a sampled "
+                 << "output file (-s/--sampleOut).  This flag will be ignored!\n";
+            jointLog->warn() << wstr.str();
+        }
+
+        // maybe arbitrary, but if it's smaller than this, consider it
+        // equal to LOG_0
+        if (sopt.incompatPrior < 1e-320) {
+            sopt.incompatPrior = salmon::math::LOG_0;
+        } else {
+            sopt.incompatPrior = std::log(sopt.incompatPrior);
+        }
+
+        // If we made it this far, the output directory exists
+        bfs::path outputFile = outputDirectory / "quant.sf";
+        // Now create a subdirectory for any parameters of interest
+        bfs::path paramsDir = outputDirectory / "libParams";
+        if (!boost::filesystem::exists(paramsDir)) {
+            if (!boost::filesystem::create_directory(paramsDir)) {
+                fmt::print(stderr, "{}ERROR{}: Could not create "
+                           "output directory for experimental parameter "
+                           "estimates [{}]. exiting.", ioutils::SET_RED,
+                           ioutils::RESET_COLOR, paramsDir);
+                std::exit(-1);
+            }
+        }
+
+        // The transcript file contains the target sequences
+        bfs::path transcriptFile(vm["targets"].as<std::string>());
+
+        // Currently, one thread is used for parsing the alignment file.
+        // Hopefully, in the future, samtools will implemented multi-threaded
+        // BAM/SAM parsing, as this is the current bottleneck.  For the time
+        // being, however, the number of quantification threads is the
+        // total number of threads - 1.
+        uint32_t numParseThreads = std::min(uint32_t(6),
+                                            std::max(uint32_t(2), uint32_t(std::ceil(numThreads/2.0))));
+        numThreads = std::max(numThreads, numParseThreads);
+        uint32_t numQuantThreads = std::max(uint32_t(2), uint32_t(numThreads - numParseThreads));
+        sopt.numQuantThreads = numQuantThreads;
+        sopt.numParseThreads = numParseThreads;
+        std::cerr << "numQuantThreads = " << numQuantThreads << "\n";
+
+        switch (libFmt.type) {
+            case ReadType::SINGLE_END:
+                {
+                    AlignmentLibrary<UnpairedRead> alnLib(alignmentFiles,
+                                                          transcriptFile,
+                                                          libFmt,
+                                                          sopt);
+                    // EQCLASS
+                    alnLib.equivalenceClassBuilder().start();
+
+                    bool burnedIn = quantifyLibrary<UnpairedRead>(alnLib, requiredObservations, sopt);
+
+                    // EQCLASS
+                    CollapsedEMOptimizer optimizer;
+                    jointLog->info("starting optimizer");
+                    salmon::utils::normalizeAlphas(sopt, alnLib);
+                    optimizer.optimize(alnLib, sopt, 0.01, 10000);
+                    jointLog->info("finished optimizer");
+
+                    // EQCLASS
+                    fmt::print(stderr, "\n\nwriting output \n");
+                    salmon::utils::writeAbundancesFromCollapsed(
+                        sopt, alnLib, outputFile, commentString);
+
+                    //fmt::print(stderr, "\n\nwriting output \n");
+                    //salmon::utils::writeAbundances(sopt, alnLib, outputFile, commentString);
+
+
+                    if (sampleOutput) {
+                        // In this case, we should "re-convert" transcript
+                        // masses to be counts in log space
+                        auto nr = alnLib.numMappedReads();
+                        for (auto& t : alnLib.transcripts()) {
+                            double m = t.mass(false) * nr;
+                            if (m > 0.0) {
+                                t.setMass(std::log(m));
+                            }
+                        }
+
+                        bfs::path sampleFilePath = outputDirectory / "postSample.bam";
+                        bool didSample = salmon::sampler::sampleLibrary<UnpairedRead>(alnLib, sopt, burnedIn, sampleFilePath, sampleUnaligned);
+                        if (!didSample) {
+                            jointLog->warn("There may have been a problem generating the sampled output file; please check the log\n");
+                        }
+                    }
+                }
+                break;
+            case ReadType::PAIRED_END:
+                {
+                    AlignmentLibrary<ReadPair> alnLib(alignmentFiles,
+                                                      transcriptFile,
+                                                      libFmt,
+                                                      sopt);
+                    // EQCLASS
+                    alnLib.equivalenceClassBuilder().start();
+
+                    bool burnedIn = quantifyLibrary<ReadPair>(alnLib, requiredObservations, sopt);
+
+                    // EQCLASS
+                    CollapsedEMOptimizer optimizer;
+                    jointLog->info("starting optimizer");
+                    salmon::utils::normalizeAlphas(sopt, alnLib);
+                    optimizer.optimize(alnLib, sopt, 0.01, 10000);
+                    jointLog->info("finished optimizer");
+
+                    fmt::print(stderr, "\n\nwriting output \n");
+                    // EQCLASS
+                    salmon::utils::writeAbundancesFromCollapsed(
+                        sopt, alnLib, outputFile, commentString);
+
+                    /*
+                    fmt::print(stderr, "\n\nwriting output \n");
+                    salmon::utils::writeAbundances(sopt, alnLib, outputFile, commentString);
+                    */
+
+                                        // Test writing out the fragment length distribution
+                    if (!sopt.noFragLengthDist) {
+                        bfs::path distFileName = paramsDir / "flenDist.txt";
+                        {
+                            std::unique_ptr<std::FILE, int (*)(std::FILE *)> distOut(std::fopen(distFileName.c_str(), "w"), std::fclose);
+                            fmt::print(distOut.get(), "{}\n", alnLib.fragmentLengthDistribution().toString());
+                        }
+                    }
+
+                    if (sampleOutput) {
+                        // In this case, we should "re-convert" transcript
+                        // masses to be counts in log space
+                        auto nr = alnLib.numMappedReads();
+                        for (auto& t : alnLib.transcripts()) {
+                            double m = t.mass(false) * nr;
+                            if (m > 0.0) {
+                                t.setMass(std::log(m));
+                            }
+                        }
+
+                        bfs::path sampleFilePath = outputDirectory / "postSample.bam";
+                        bool didSample = salmon::sampler::sampleLibrary<ReadPair>(alnLib, sopt, burnedIn, sampleFilePath, sampleUnaligned);
+                        if (!didSample) {
+                            jointLog->warn("There may have been a problem generating the sampled output file; please check the log\n");
+                        }
+
+                    }
+                }
+                break;
+            default:
+                std::cerr << "Cannot quantify library of unknown format "
+                          << libFmt << "\n";
+                std::exit(1);
+        }
+
+        bfs::path estFilePath = outputDirectory / "quant.sf";
+
+        if (biasCorrect) {
+            // First, compute the transcript features in case the user
+            // ever wants to bias-correct his / her results
+            bfs::path transcriptBiasFile(outputDirectory); transcriptBiasFile /= "bias_feats.txt";
+
+            bool useStreamingParser{true};
+            std::vector<std::string> transcriptFiles{transcriptFile.string()};
+            std::cerr << "computeBiasFeatures( {";
+            for (auto& tf : transcriptFiles) {
+                std::cerr << "[" << tf << "] ";
+            }
+            std::cerr << ", " << transcriptBiasFile << ", " << useStreamingParser << ", " << numThreads << ")\n";
+            computeBiasFeatures(transcriptFiles, transcriptBiasFile, useStreamingParser, numThreads);
+
+            auto origExpressionFile = estFilePath;
+
+            auto outputDirectory = estFilePath;
+            outputDirectory.remove_filename();
+
+            auto biasCorrectedFile = outputDirectory / "quant_bias_corrected.sf";
+            performBiasCorrectionSalmon(transcriptBiasFile, estFilePath, biasCorrectedFile, numThreads);
+        }
+
+        /** If the user requested gene-level abundances, then compute those now **/
+        if (vm.count("geneMap")) {
+            try {
+                salmon::utils::generateGeneLevelEstimates(geneMapPath,
+                                                            outputDirectory,
+                                                            biasCorrect);
+            } catch (std::exception& e) {
+                fmt::print(stderr, "Error: [{}] when trying to compute gene-level "\
+                                   "estimates. The gene-level file(s) may not exist",
+                                   e.what());
+            }
+        }
+
+    } catch (po::error& e) {
+        std::cerr << "exception : [" << e.what() << "]. Exiting.\n";
+        std::exit(1);
+    } catch (const spdlog::spdlog_ex& ex) {
+        std::cerr << "logger failed with : [" << ex.what() << "]. Exiting.\n";
+        std::exit(1);
+    } catch (std::exception& e) {
+        std::cerr << "============\n";
+        std::cerr << "Exception : [" << e.what() << "]\n";
+        std::cerr << "============\n";
+        std::cerr << argv[0] << " alignment-quant was invoked improperly.\n";
+        std::cerr << "For usage information, " <<
+            "try " << argv[0] << " quant --help-alignments\nExiting.\n";
+        std::exit(1);
+    }
+    return 0;
+}
diff --git a/src/SalmonStringUtils.cpp b/src/SalmonStringUtils.cpp
new file mode 100644
index 0000000..5400344
--- /dev/null
+++ b/src/SalmonStringUtils.cpp
@@ -0,0 +1,19 @@
+#include "SalmonStringUtils.hpp"
+#include <iostream>
+#include <cstdint>
+#include <cmath>
+
+uint8_t* salmon::stringtools::encodeSequenceInSAM(const char* src, size_t len) {
+    uint8_t* target = new uint8_t[static_cast<size_t>(ceil(len / 2.0))]();
+    for(size_t i = 0; i < len; ++i) {
+        size_t byte = i >> 1;
+        size_t nibble = i & 0x1;
+        if (nibble) {
+            target[byte] |= charToSamEncode[src[i]];
+        } else {
+            target[byte] |= (charToSamEncode[src[i]] << 4);
+        }
+    }
+    return target;
+}
+
diff --git a/src/SalmonUtils.cpp b/src/SalmonUtils.cpp
new file mode 100644
index 0000000..8498131
--- /dev/null
+++ b/src/SalmonUtils.cpp
@@ -0,0 +1,1236 @@
+#include <boost/thread/thread.hpp>
+#include <boost/filesystem.hpp>
+#include <algorithm>
+#include <iostream>
+#include <tuple>
+#include <unordered_set>
+#include <unordered_map>
+#include <vector>
+#include <boost/filesystem.hpp>
+#include <boost/range/join.hpp>
+
+#include "SalmonUtils.hpp"
+#include "AlignmentLibrary.hpp"
+#include "ReadPair.hpp"
+#include "UnpairedRead.hpp"
+#include "SalmonMath.hpp"
+#include "LibraryFormat.hpp"
+#include "ReadExperiment.hpp"
+
+#include "spdlog/spdlog.h"
+
+#include "gff.h"
+
+#include "jellyfish/stream_manager.hpp"
+#include "jellyfish/whole_sequence_parser.hpp"
+#include "jellyfish/mer_dna.hpp"
+
+#include "TranscriptGeneMap.hpp"
+#include "GenomicFeature.hpp"
+
+namespace salmon {
+namespace utils {
+
+    bool headersAreConsistent(SAM_hdr* h1, SAM_hdr* h2) {
+
+        bool consistent{true};
+        // Both files must contain the same number of targets
+        if (h1->nref != h2->nref) { consistent = false; }
+
+        // Check each target to ensure that the name and length are the same.
+        size_t i = 0;
+        size_t n = h1->nref;
+        while (consistent and i < n) {
+            size_t l1 = h1->ref[i].len;
+            size_t l2 = h2->ref[i].len;
+            consistent = (l1 == l2) and
+                         (strcmp(h1->ref[i].name, h2->ref[i].name) == 0);
+            ++i;
+        }
+
+        return consistent;
+    }
+
+    bool headersAreConsistent(std::vector<SAM_hdr*>&& headers) {
+        if (headers.size() == 1) { return true; }
+
+        // Ensure that all of the headers are consistent (i.e. the same), by
+        // comparing each with the first.
+        bool consistent{true};
+        auto itFirst = headers.begin();
+        auto it = itFirst;
+        while (++it != headers.end()) {
+            if (!headersAreConsistent(*itFirst, *it)) {
+                consistent = false;
+                break;
+            }
+        }
+        return consistent;
+    }
+
+    std::ostream& operator<<(std::ostream& os, OrphanStatus s) {
+        switch (s) {
+            case OrphanStatus::LeftOrphan:
+                os << "left orphan";
+                break;
+            case OrphanStatus::RightOrphan:
+                os << "right orphan";
+                break;
+            case OrphanStatus::Paired:
+                os << "paired";
+                break;
+        }
+        return os;
+    }
+
+    double logAlignFormatProb(const LibraryFormat observed, const LibraryFormat expected, double incompatPrior) {
+        // Allow orphaned reads in a paired-end library, but
+        // decrease their a priori probability.
+        if (expected.type == ReadType::PAIRED_END and
+            observed.type == ReadType::SINGLE_END) {
+            double logOrphanProb = salmon::math::LOG_ORPHAN_PROB;
+            if (expected.strandedness == ReadStrandedness::U or
+                expected.strandedness == ReadStrandedness::AS or
+                expected.strandedness == ReadStrandedness::SA) {
+                return salmon::math::LOG_1;
+            } else {
+                return (expected.strandedness == observed.strandedness) ? logOrphanProb : incompatPrior;
+            }
+        } else if (observed.type != expected.type or
+            observed.orientation != expected.orientation ) {
+            return incompatPrior;
+        } else {
+            if (expected.strandedness == ReadStrandedness::U) {
+                return salmon::math::LOG_ONEHALF;
+            } else {
+                if (expected.strandedness == observed.strandedness) {
+                    return salmon::math::LOG_1;
+                } else {
+                    /**
+                    * Let's not complain about this for now, but find
+                    * a different way to report it.
+                    * std::cerr << "expected = " << expected << "\n";
+                    * std::cerr << "observed = " << observed << "\n";
+                    */
+                    return incompatPrior;
+                }
+            }
+        }
+
+        fmt::print(stderr, "WARNING: logAlignFormatProb --- should not get here");
+        return salmon::math::LOG_0;
+    }
+
+    template <typename ExpLib>
+    void writeAbundancesFromCollapsed(
+                         const SalmonOpts& sopt,
+                         ExpLib& alnLib,
+                         boost::filesystem::path& fname,
+                         std::string headerComments) {
+        using salmon::math::LOG_0;
+        using salmon::math::LOG_1;
+
+        std::unique_ptr<std::FILE, int (*)(std::FILE *)> output(std::fopen(fname.c_str(), "w"), std::fclose);
+
+        fmt::print(output.get(), "{}", headerComments);
+        fmt::print(output.get(), "# Name\tLength\tTPM\tNumReads\n");
+
+        double numMappedFrags = alnLib.upperBoundHits();
+
+        std::vector<Transcript>& transcripts_ = alnLib.transcripts();
+        for (auto& transcript : transcripts_) {
+            transcript.projectedCounts =
+                transcript.mass(false) * numMappedFrags;
+        }
+
+        double tfracDenom{0.0};
+        for (auto& transcript : transcripts_) {
+            double refLength = sopt.noEffectiveLengthCorrection ?
+                               transcript.RefLength :
+                               std::exp(transcript.getCachedEffectiveLength());
+            tfracDenom += (transcript.projectedCounts / numMappedFrags) / refLength;
+        }
+
+        double million = 1000000.0;
+        // Now posterior has the transcript fraction
+        for (auto& transcript : transcripts_) {
+            double logLength = sopt.noEffectiveLengthCorrection ?
+                               std::log(transcript.RefLength) :
+                               transcript.getCachedEffectiveLength();
+            double count = transcript.projectedCounts;
+            double npm = (transcript.projectedCounts / numMappedFrags);
+            double refLength = std::exp(logLength);
+            double tfrac = (npm / refLength) / tfracDenom;
+            double tpm = tfrac * million;
+
+            fmt::print(output.get(), "{}\t{}\t{}\t{}\n",
+                    transcript.RefName, transcript.RefLength,
+                    tpm, count);
+        }
+
+    }
+
+    template <typename ExpLib>
+    void writeAbundances(const SalmonOpts& sopt,
+                         ExpLib& alnLib,
+                         boost::filesystem::path& fname,
+                         std::string headerComments) {
+        using salmon::math::LOG_0;
+        using salmon::math::LOG_1;
+
+        std::unique_ptr<std::FILE, int (*)(std::FILE *)> output(std::fopen(fname.c_str(), "w"), std::fclose);
+
+        fmt::print(output.get(), "{}", headerComments);
+        fmt::print(output.get(), "# Name\tLength\tTPM\tFPKM\tNumReads\n");
+
+
+        auto& refs = alnLib.transcripts();
+        auto numMappedReads = alnLib.numMappedReads();
+        const double logBillion = std::log(1000000000.0);
+        const double million = 1000000.0;
+        const double logNumFragments = std::log(static_cast<double>(numMappedReads));
+        const double upperBoundFactor = static_cast<double>(alnLib.upperBoundHits()) /
+                                        numMappedReads;
+
+        auto clusters = alnLib.clusterForest().getClusters();
+        size_t clusterID = 0;
+        for(auto cptr : clusters) {
+
+            //double logClusterMass = cptr->logMass();
+            // EDIT
+            double logClusterMass = salmon::math::LOG_0;
+            double logClusterCount = std::log(upperBoundFactor * static_cast<double>(cptr->numHits()));
+
+            bool requiresProjection{false};
+
+            auto& members = cptr->members();
+            size_t clusterSize{0};
+            for (auto transcriptID : members) {
+                Transcript& t = refs[transcriptID];
+                t.uniqueCounts = t.uniqueCount();
+                t.totalCounts = t.totalCount();
+                logClusterMass = salmon::math::logAdd(logClusterMass,
+                                    t.mass(false));
+                ++clusterSize;
+            }
+
+            if (logClusterMass == LOG_0) {
+                // std::cerr << "Warning: cluster " << clusterID << " has 0 mass!\n";
+            }
+
+            for (auto transcriptID : members) {
+                Transcript& t = refs[transcriptID];
+                double logTranscriptMass = t.mass(false);
+                // Try bias
+                /*
+                double logBias = t.bias();
+                logTranscriptMass += t.bias();
+                */
+
+                if (logTranscriptMass == LOG_0) {
+                    t.projectedCounts = 0;
+                } else {
+                    double logClusterFraction = logTranscriptMass - logClusterMass;
+                    t.projectedCounts = std::exp(logClusterFraction + logClusterCount);
+                    requiresProjection |= t.projectedCounts > static_cast<double>(t.totalCounts) or
+                        t.projectedCounts < static_cast<double>(t.uniqueCounts);
+                }
+            }
+
+            if (clusterSize > 1 and requiresProjection) {
+                cptr->projectToPolytope(refs);
+            }
+            ++clusterID;
+        }
+
+        auto& transcripts_ = refs;
+        double tfracDenom{0.0};
+        for (auto& transcript : transcripts_) {
+            double refLength = sopt.noEffectiveLengthCorrection ?
+                               transcript.RefLength :
+                               std::exp(transcript.getCachedEffectiveLength());
+            //refLength = transcript.RefLength;
+            tfracDenom += (transcript.projectedCounts / numMappedReads) / refLength;
+        }
+
+        // Now posterior has the transcript fraction
+        for (auto& transcript : transcripts_) {
+            double logLength = sopt.noEffectiveLengthCorrection ?
+                               std::log(transcript.RefLength) :
+                               transcript.getCachedEffectiveLength();
+            /*
+            if (!sopt.noSeqBiasModel) {
+                double avgLogBias = transcript.getAverageSequenceBias(
+                                    alnLib.sequenceBiasModel());
+                logLength += avgLogBias;
+            }
+            */
+            //logLength = std::log(transcript.RefLength);
+            double fpkmFactor = std::exp(logBillion - logLength - logNumFragments);
+            double count = transcript.projectedCounts;
+            //double countTotal = transcripts_[transcriptID].totalCounts;
+            //double countUnique = transcripts_[transcriptID].uniqueCounts;
+            double fpkm = count > 0 ? fpkmFactor * count : 0.0;
+            double npm = (transcript.projectedCounts / numMappedReads);
+            double refLength = std::exp(logLength);
+            double tfrac = (npm / refLength) / tfracDenom;
+            double tpm = tfrac * million;
+
+            fmt::print(output.get(), "{}\t{}\t{}\t{}\t{}\n",
+                    transcript.RefName, transcript.RefLength,
+                    tpm, fpkm, count);
+        }
+
+    }
+
+
+
+    template <typename AlnLibT>
+    void normalizeAlphas(const SalmonOpts& sopt,
+                         AlnLibT& alnLib) {
+
+        using salmon::math::LOG_0;
+        using salmon::math::LOG_1;
+
+        auto& refs = alnLib.transcripts();
+        auto numMappedReads = alnLib.numMappedReads();
+        const double logNumFragments = std::log(static_cast<double>(numMappedReads));
+        auto clusters = alnLib.clusterForest().getClusters();
+        size_t clusterID = 0;
+        for(auto cptr : clusters) {
+
+            //double logClusterMass = cptr->logMass();
+            // EDIT
+            double logClusterMass = salmon::math::LOG_0;
+            double logClusterCount = std::log(static_cast<double>(cptr->numHits()));
+
+            bool requiresProjection{false};
+
+            auto& members = cptr->members();
+            size_t clusterSize{0};
+            for (auto transcriptID : members) {
+                Transcript& t = refs[transcriptID];
+                t.uniqueCounts = t.uniqueCount();
+                t.totalCounts = t.totalCount();
+                logClusterMass = salmon::math::logAdd(logClusterMass,
+                                    t.mass(false));// + t.bias());
+                ++clusterSize;
+            }
+
+            if (logClusterMass == LOG_0) {
+                // std::cerr << "Warning: cluster " << clusterID << " has 0 mass!\n";
+            }
+
+            for (auto transcriptID : members) {
+                Transcript& t = refs[transcriptID];
+                double logTranscriptMass = t.mass(false);
+                // Try bias
+                // double logBias = t.bias();
+                // logTranscriptMass += t.bias();
+
+                if (logTranscriptMass == LOG_0) {
+                    t.projectedCounts = 0;
+                } else {
+                    double logClusterFraction = logTranscriptMass - logClusterMass;
+                    t.projectedCounts = std::exp(logClusterFraction + logClusterCount);
+                    requiresProjection |= t.projectedCounts > static_cast<double>(t.totalCounts) or
+                        t.projectedCounts < static_cast<double>(t.uniqueCounts);
+                }
+            }
+
+            if (clusterSize > 1 and requiresProjection) {
+                cptr->projectToPolytope(refs);
+            }
+            ++clusterID;
+        }
+
+        auto& transcripts_ = refs;
+        double nFracDenom{0.0};
+        for (auto& transcript : transcripts_) {
+            nFracDenom += (transcript.projectedCounts / numMappedReads);
+        }
+
+	double invNFracTotal = 1.0 / nFracDenom;
+        for (auto& transcript : transcripts_) {
+		double v = transcript.projectedCounts / numMappedReads;
+		//transcript.setMass(v * invNFracTotal);
+		transcript.setMass(transcript.projectedCounts);
+        }
+
+    }
+
+
+    LibraryFormat hitType(int32_t end1Start, bool end1Fwd,
+                          int32_t end2Start, bool end2Fwd) {
+
+        // If the reads come from opposite strands
+        if (end1Fwd != end2Fwd) {
+            // and if read 1 comes from the forward strand
+            if (end1Fwd) {
+                // then if read 1 start < read 2 start ==> ISF
+                if (end1Start <= end2Start) {
+                    return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::SA);
+                } // otherwise read 2 start < read 1 start ==> OSF
+                else {
+                    return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::SA);
+                }
+            }
+            // and if read 2 comes from the forward strand
+            if (end2Fwd) {
+                // then if read 2 start <= read 1 start ==> ISR
+                if (end2Start <= end1Start) {
+                    return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::AS);
+                } // otherwise, read 2 start > read 1 start ==> OSR
+                else {
+                    return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::AS);
+                }
+            }
+        } else { // Otherwise, the reads come from the same strand
+            if (end1Fwd) { // if it's the forward strand ==> MSF
+                return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::S);
+            } else { // if it's the reverse strand ==> MSR
+                return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::A);
+            }
+        }
+        // SHOULD NOT GET HERE
+        spdlog::get("jointLog")->error("ERROR: Could not associate any known library type with read! "
+                                       "Please report this bug!\n");
+        std::exit(-1);
+        return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::NONE, ReadStrandedness::U);
+    }
+
+
+
+    LibraryFormat hitType(int32_t end1Start, bool end1Fwd, uint32_t len1,
+                          int32_t end2Start, bool end2Fwd, uint32_t len2, bool canDovetail) {
+
+        // If the reads come from opposite strands
+        if (end1Fwd != end2Fwd) {
+            // and if read 1 comes from the forward strand
+            if (end1Fwd) {
+                // then if read 1 start < read 2 start ==> ISF
+                // NOTE: We can't really delineate between inward facing reads that stretch
+                // past each other and outward facing reads --- the purpose of stretch is to help
+                // make this determinateion.
+                int32_t stretch = canDovetail ? len2 : 0;
+                if (end1Start <= end2Start + stretch) {
+                    return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::SA);
+                } // otherwise read 2 start < read 1 start ==> OSF
+                else {
+                    return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::SA);
+                }
+            }
+            // and if read 2 comes from the forward strand
+            if (end2Fwd) {
+                // then if read 2 start <= read 1 start ==> ISR
+                // NOTE: We can't really delineate between inward facing reads that stretch
+                // past each other and outward facing reads --- the purpose of stretch is to help
+                // make this determinateion.
+                int32_t stretch = canDovetail ? len1 : 0;
+                if (end2Start <= end1Start + stretch) {
+                    return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::AS);
+                } // otherwise, read 2 start > read 1 start ==> OSR
+                else {
+                    return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::AS);
+                }
+            }
+        } else { // Otherwise, the reads come from the same strand
+            if (end1Fwd) { // if it's the forward strand ==> MSF
+                return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::S);
+            } else { // if it's the reverse strand ==> MSR
+                return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::A);
+            }
+        }
+        // SHOULD NOT GET HERE
+        spdlog::get("jointLog")->error("ERROR: Could not associate any known library type with read! "
+                                       "Please report this bug!\n");
+        std::exit(-1);
+        return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::NONE, ReadStrandedness::U);
+    }
+
+
+    LibraryFormat hitType(int32_t start, bool isForward) {
+        // If the read comes from the forward strand
+        if (isForward) {
+            return LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::S);
+        } else {
+            return LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::A);
+        }
+        // SHOULD NOT GET HERE
+        fmt::print(stderr, "WARNING: Could not associate known library type with read!\n");
+        return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::NONE, ReadStrandedness::U);
+
+    }
+
+using std::string;
+using NameVector = std::vector<string>;
+using IndexVector = std::vector<size_t>;
+using KmerVector = std::vector<uint64_t>;
+
+/**
+ * This function parses the library format string that specifies the format in which
+ * the reads are to be expected.
+ */
+LibraryFormat parseLibraryFormatStringNew(std::string& fmt) {
+	using std::vector;
+	using std::string;
+	using std::map;
+	using std::stringstream;
+
+    map<string, LibraryFormat> formatMap = {
+        {"IU", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::U)},
+        {"ISF", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::SA)},
+        {"ISR", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::AS)},
+        {"OU", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::U)},
+        {"OSF", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::SA)},
+        {"OSR", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::AS)},
+        {"MU", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::U)},
+        {"MSF", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::S)},
+        {"MSR", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::A)},
+        {"U", LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::U)},
+        {"SF", LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::S)},
+        {"SR", LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::A)}};
+
+	// inspired by http://stackoverflow.com/questions/236129/how-to-split-a-string-in-c
+	// first convert the string to upper-case
+	for (auto& c : fmt) { c = std::toupper(c); }
+
+
+    auto libFmtIt = formatMap.find(fmt);
+
+	if (libFmtIt == formatMap.end()) {
+		stringstream errstr;
+		errstr << "unknown library format string : " << fmt;
+		throw std::invalid_argument(errstr.str());
+	}
+
+    return libFmtIt->second;
+}
+
+/**
+ * Parses a set of __ordered__ command line options and extracts the relevant
+ * read libraries from them.
+ */
+std::vector<ReadLibrary> extractReadLibraries(boost::program_options::parsed_options& orderedOptions) {
+	// The current (default) format for paired end data
+	LibraryFormat peFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::U);
+	// The current (default) format for single end data
+	LibraryFormat seFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::U);
+
+	std::vector<ReadLibrary> peLibs{ReadLibrary(peFormat)};
+	std::vector<ReadLibrary> seLibs{ReadLibrary(seFormat)};
+	for (auto& opt : orderedOptions.options) {
+		// Update the library type
+		if (opt.string_key == "libType") {
+			auto libFmt = parseLibraryFormatStringNew(opt.value[0]);
+			if (libFmt.type == ReadType::PAIRED_END) {
+				peFormat = libFmt;
+				peLibs.emplace_back(libFmt);
+			} else {
+				seFormat = libFmt;
+				seLibs.emplace_back(libFmt);
+			}
+		}
+		if (opt.string_key == "mates1") {
+			peLibs.back().addMates1(opt.value);
+		}
+		if (opt.string_key == "mates2") {
+			peLibs.back().addMates2(opt.value);
+		}
+		if (opt.string_key == "unmatedReads") {
+			seLibs.back().addUnmated(opt.value);
+		}
+	}
+
+	std::vector<ReadLibrary> libs;
+	libs.reserve(peLibs.size() + seLibs.size());
+	for (auto& lib : boost::range::join(seLibs, peLibs)) {
+		if (lib.format().type == ReadType::SINGLE_END) {
+			if (lib.unmated().size() == 0) {
+				// Didn't use default single end library type
+				continue;
+			}
+		} else if (lib.format().type == ReadType::PAIRED_END) {
+			if (lib.mates1().size() == 0 or lib.mates2().size() == 0) {
+                // Didn't use default paired-end library type
+				continue;
+			}
+		}
+		libs.push_back(lib);
+	}
+    size_t numLibs = libs.size();
+	std::cerr << "there " << ((numLibs > 1) ? "are " : "is ") << libs.size() << ((numLibs > 1) ? " libs\n" : " lib\n");
+	return libs;
+}
+
+
+
+/**
+ * This function parses the library format string that specifies the format in which
+ * the reads are to be expected.
+ */
+LibraryFormat parseLibraryFormatString(std::string& fmt) {
+    using std::vector;
+    using std::string;
+    using std::map;
+    using std::stringstream;
+
+    // inspired by http://stackoverflow.com/questions/236129/how-to-split-a-string-in-c
+
+    // first convert the string to upper-case
+    for (auto& c : fmt) { c = std::toupper(c); }
+    // split on the delimiter ':', and put the key, value (k=v) pairs into a map
+    stringstream ss(fmt);
+    string item;
+    map<string, string> kvmap;
+    while (std::getline(ss, item, ':')) {
+        auto splitPos = item.find('=', 0);
+        string key{item.substr(0, splitPos)};
+        string value{item.substr(splitPos+1)};
+        kvmap[key] = value;
+    }
+
+    map<string, ReadType> readType = {{"SE", ReadType::SINGLE_END}, {"PE", ReadType::PAIRED_END}};
+    map<string, ReadOrientation> orientationType = {{">>", ReadOrientation::SAME},
+                                           {"<>", ReadOrientation::AWAY},
+                                           {"><", ReadOrientation::TOWARD},
+                                           {"*", ReadOrientation::NONE}};
+    map<string, ReadStrandedness> strandType = {{"SA", ReadStrandedness::SA},
+                                    {"AS", ReadStrandedness::AS},
+                                    {"A", ReadStrandedness::A},
+                                    {"S", ReadStrandedness::S},
+                                    {"U", ReadStrandedness::U}};
+    auto it = kvmap.find("T");
+    string typeStr = "";
+    if (it != kvmap.end()) {
+        typeStr = it->second;
+    } else {
+        it = kvmap.find("TYPE");
+        if (it != kvmap.end()) {
+            typeStr = it->second;
+        }
+    }
+
+    if (typeStr != "SE" and typeStr != "PE") {
+        string e = typeStr + " is not a valid read type; must be one of {SE, PE}";
+        throw std::invalid_argument(e);
+    }
+
+    ReadType type = (typeStr == "SE") ? ReadType::SINGLE_END : ReadType::PAIRED_END;
+    ReadOrientation orientation = (type == ReadType::SINGLE_END) ? ReadOrientation::NONE : ReadOrientation::TOWARD;
+    ReadStrandedness strandedness{ReadStrandedness::U};
+    // Construct the LibraryFormat class from the key, value map
+    for (auto& kv : kvmap) {
+        auto& k = kv.first; auto& v = kv.second;
+        if (k == "O" or k == "ORIENTATION") {
+            auto it = orientationType.find(v);
+            if (it != orientationType.end()) { orientation = orientationType[it->first]; } else {
+                string e = v + " is not a valid orientation type; must be one of {>>, <>, ><}";
+                throw std::invalid_argument(e);
+            }
+
+        }
+        if (k == "S" or k == "STRAND") {
+            auto it = strandType.find(v);
+            if (it != strandType.end()) { strandedness = strandType[it->first]; } else {
+                string e = v + " is not a valid strand type; must be one of {SA, AS, S, A, U}";
+                throw std::invalid_argument(e);
+            }
+        }
+
+    }
+    LibraryFormat lf(type, orientation, strandedness);
+    return lf;
+}
+
+
+
+uint64_t encode(uint64_t tid, uint64_t offset) {
+    uint64_t res = (((tid & 0xFFFFFFFF) << 32) | (offset & 0xFFFFFFFF));
+    return res;
+}
+
+uint32_t transcript(uint64_t enc) {
+    uint32_t t = (enc & 0xFFFFFFFF00000000) >> 32;
+    return t;
+}
+
+uint32_t offset(uint64_t enc) {
+    uint32_t o = enc & 0xFFFFFFFF;
+    return o;
+}
+
+size_t numberOfReadsInFastaFile(const std::string& fname) {
+    constexpr size_t bufferSize = 16184;
+    char buffer[bufferSize];
+    std::ifstream ifile(fname, std::ifstream::in);
+    ifile.rdbuf()->pubsetbuf(buffer, bufferSize);
+
+    size_t numReads = 0;
+    std::string s;
+    while (ifile >> s) { if (s.front() == '>') { ++numReads; } }
+
+    ifile.close();
+
+    return numReads;
+}
+
+bool readKmerOrder( const std::string& fname, std::vector<uint64_t>& kmers ) {
+
+  std::ifstream mlist(fname, std::ios::in | std::ios::binary);
+  // Get the number of kmers from file
+  size_t numKmers{0};
+  mlist.read( reinterpret_cast<char*>( &numKmers ), sizeof( size_t ) );
+
+  // Resize the array that will hold the sorted kmers
+  kmers.resize(numKmers, 0);
+  mlist.read( reinterpret_cast<char*>( &kmers[0] ), sizeof( uint64_t) * kmers.size() );
+
+  mlist.close();
+
+  return true;
+}
+
+template <template<typename> class S, typename T>
+bool overlap( const S<T> &a, const S<T> &b ) {
+    // Query from the smaller set to the larger set
+    if ( a.size() <= b.size() ) {
+        for ( auto & ae : a ) {
+            if (b.find(ae) != b.end()) {
+                return true;
+            }
+        }
+    } else {
+        for ( auto & be : b ) {
+            if (a.find(be) != b.end()) {
+                return true;
+            }
+        }
+    }
+    // If nothing from the smaller set is in the larger set, then they don't overlap
+    return false;
+}
+
+
+TranscriptGeneMap transcriptGeneMapFromGTF(const std::string& fname, std::string key) {
+
+    using std::unordered_set;
+    using std::unordered_map;
+    using std::vector;
+    using std::tuple;
+    using std::string;
+    using std::get;
+
+    // Use GffReader to read the file
+    GffReader reader(const_cast<char*>(fname.c_str()));
+    // Remember the optional attributes
+    reader.readAll(true);
+
+    struct TranscriptKeyPair {
+        const char* transcript_id;
+        const char* key;
+        TranscriptKeyPair(const char* t, const char* k) :
+            transcript_id(t), key(k) {}
+    };
+
+    // The user can group transcripts by gene_id, gene_name, or
+    // an optinal attribute that they provide as a string.
+    enum class TranscriptKey { GENE_ID, GENE_NAME, DYNAMIC };
+
+    // Select the proper attribute by which to group
+    TranscriptKey tkey = TranscriptKey::GENE_ID;
+
+    if (key == "gene_id") {
+    } else if (key == "gene_name") {
+        tkey = TranscriptKey::GENE_NAME;
+    } else {
+        tkey = TranscriptKey::DYNAMIC;
+    }
+
+    // Iterate over all transcript features and build the
+    // transcript <-> key vector.
+    auto nfeat = reader.gflst.Count();
+    std::vector<TranscriptKeyPair> feats;
+    for (int i=0; i < nfeat; ++i) {
+        auto f = reader.gflst[i];
+        if (f->isTranscript()) {
+            const char* keyStr;
+            switch (tkey) {
+                case TranscriptKey::GENE_ID:
+                    keyStr = f->getGeneID();
+                    break;
+                case TranscriptKey::GENE_NAME:
+                    keyStr = f->getGeneName();
+                    break;
+                case TranscriptKey::DYNAMIC:
+                    keyStr = f->getAttr(key.c_str());
+                    break;
+            }
+            feats.emplace_back(f->getID(), keyStr);
+        }
+    }
+
+    // Given the transcript <-> key vector, build the
+    // TranscriptGeneMap.
+
+    IndexVector t2g;
+    NameVector transcriptNames;
+    NameVector geneNames;
+
+    // holds the mapping from transcript ID to gene ID
+    IndexVector t2gUnordered;
+    // holds the set of gene IDs
+    unordered_map<string, size_t> geneNameToID;
+
+    // To read the input and assign ids
+    size_t transcriptCounter = 0;
+    size_t geneCounter = 0;
+    string transcript;
+    string gene;
+
+    std::sort( feats.begin(), feats.end(),
+    []( const TranscriptKeyPair & a, const TranscriptKeyPair & b) -> bool {
+        return std::strcmp(a.transcript_id, b.transcript_id) < 0;
+    } );
+
+    std::string currentTranscript = "";
+    for ( auto & feat : feats ) {
+
+        std::string gene(feat.key);
+        std::string transcript(feat.transcript_id);
+
+        if ( transcript != currentTranscript ) {
+            auto geneIt = geneNameToID.find(gene);
+            size_t geneID = 0;
+
+            if ( geneIt == geneNameToID.end() ) {
+                // If we haven't seen this gene yet, give it a new ID
+                geneNameToID[gene] = geneCounter;
+                geneID = geneCounter;
+                geneNames.push_back(gene);
+                ++geneCounter;
+            } else {
+                // Otherwise lookup the ID
+                geneID = geneIt->second;
+            }
+
+            transcriptNames.push_back(transcript);
+            t2g.push_back(geneID);
+
+            //++transcriptID;
+            currentTranscript = transcript;
+        }
+
+    }
+
+    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
+
+}
+
+
+TranscriptGeneMap readTranscriptToGeneMap( std::ifstream &ifile ) {
+
+    using std::unordered_set;
+    using std::unordered_map;
+    using std::vector;
+    using std::tuple;
+    using std::string;
+    using std::get;
+
+    using NameID = tuple<string, size_t>;
+
+    IndexVector t2g;
+    NameVector transcriptNames;
+    NameVector geneNames;
+
+    // holds the transcript name ID mapping
+    vector<NameID> transcripts;
+    // holds the mapping from transcript ID to gene ID
+    IndexVector t2gUnordered;
+    // holds the set of gene IDs
+    unordered_map<string, size_t> geneNameToID;
+
+    // To read the input and assign ids
+    size_t transcriptCounter = 0;
+    size_t geneCounter = 0;
+    string transcript;
+    string gene;
+
+    while ( ifile >> transcript >> gene ) {
+        // The transcript and it's ID
+        transcripts.push_back( make_tuple(transcript, transcriptCounter) );
+
+        auto geneIt = geneNameToID.find(gene);
+        size_t geneID = 0;
+
+        if ( geneIt == geneNameToID.end() ) {
+            // If we haven't seen this gene yet, give it a new ID
+            geneNameToID[gene] = geneCounter;
+            geneID = geneCounter;
+            geneNames.push_back(gene);
+            ++geneCounter;
+        } else {
+            // Otherwise lookup the ID
+            geneID = geneIt->second;
+        }
+
+        // Map the transcript to the gene in terms of their IDs
+        t2gUnordered.push_back(geneID);
+
+        ++transcriptCounter;
+    }
+
+    std::sort( transcripts.begin(), transcripts.end(),
+               []( const NameID & a, const NameID & b) -> bool { return get<0>(a) < get<0>(b); } );
+
+    // Resize these vectors for fast access
+    transcriptNames.resize(t2gUnordered.size());
+    t2g.resize(t2gUnordered.size());
+
+    for ( size_t newID = 0; newID < transcripts.size(); ++newID ) {
+        // For each transcript, map it to the appropriate gene
+        string oldName; size_t oldID;
+        std::tie(oldName, oldID) = transcripts[newID];
+        t2g[newID] = t2gUnordered[oldID];
+        transcriptNames[newID] = oldName;
+    }
+
+    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
+}
+
+
+TranscriptGeneMap transcriptToGeneMapFromFasta( const std::string& transcriptsFile ) {
+    using std::vector;
+    using stream_manager = jellyfish::stream_manager<char**>;
+    using sequence_parser = jellyfish::whole_sequence_parser<stream_manager>;
+    namespace bfs = boost::filesystem;
+
+    NameVector transcriptNames;
+    NameVector geneNames {"gene"};
+
+    vector<bfs::path> paths{transcriptsFile};
+
+    // Create a jellyfish parser
+    const int concurrentFile{1};
+    char** fnames = new char*[1];
+    fnames[0] = const_cast<char*>(transcriptsFile.c_str());
+    stream_manager streams(fnames, fnames + 1, concurrentFile);
+
+    size_t maxReadGroupSize{100};
+    sequence_parser parser(4, maxReadGroupSize, concurrentFile, streams);
+
+    // while there are transcripts left to process
+    while (true) {
+        sequence_parser::job j(parser);
+        // If this job is empty, then we're done
+        if (j.is_empty()) { break; }
+
+        for (size_t i=0; i < j->nb_filled; ++i) {
+            // The transcript name
+            std::string fullHeader(j->data[i].header);
+            std::string header = fullHeader.substr(0, fullHeader.find(' '));
+            transcriptNames.emplace_back(header);
+        }
+    }
+
+    // Sort the transcript names
+    std::sort(transcriptNames.begin(), transcriptNames.end());
+
+    // Since we have no real gene groupings, the t2g vector is trivial,
+    // everything maps to gene 0.
+    IndexVector t2g(transcriptNames.size(), 0);
+
+    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
+}
+
+class ExpressionRecord {
+    public:
+        ExpressionRecord(const std::string& targetIn, uint32_t lengthIn,
+                         std::vector<double>& expValsIn) :
+            target(targetIn), length(lengthIn), expVals(expValsIn) {}
+
+        ExpressionRecord( ExpressionRecord&& other ) {
+            std::swap(target, other.target);
+            length = other.length;
+            std::swap(expVals, other.expVals);
+        }
+
+        ExpressionRecord(std::vector<std::string>& inputLine) {
+            if (inputLine.size() < 3) {
+                std::string err ("Any expression line must contain at least 3 tokens");
+                throw std::invalid_argument(err);
+            } else {
+                auto it = inputLine.begin();
+                target = *it; ++it;
+                length = std::stoi(*it); ++it;
+                for (; it != inputLine.end(); ++it) {
+                    expVals.push_back(std::stod(*it));
+                }
+            }
+        }
+
+        std::string target;
+        uint32_t length;
+        std::vector<double> expVals;
+};
+
+// From : http://stackoverflow.com/questions/9435385/split-a-string-using-c11
+std::vector<std::string> split(const std::string& str, int delimiter(int) = ::isspace){
+    using namespace std;
+    vector<string> result;
+    auto e=str.end();
+    auto i=str.begin();
+    while (i != e) {
+        i = find_if_not(i,e, delimiter);
+        if (i == e) break;
+        auto j = find_if(i,e, delimiter);
+        result.push_back(string(i,j));
+        i = j;
+    }
+    return result;
+}
+
+void aggregateEstimatesToGeneLevel(TranscriptGeneMap& tgm, boost::filesystem::path& inputPath) {
+
+    using std::vector;
+    using std::string;
+    using std::ofstream;
+    using std::unordered_map;
+    using std::move;
+    using std::cerr;
+    using std::max;
+
+    std::ifstream expFile(inputPath.string());
+
+    if (!expFile.is_open()) {
+        perror("Error reading file");
+    }
+
+    //====================== From GeneSum ====================
+    vector<string> comments;
+    unordered_map<string, vector<ExpressionRecord>> geneExps;
+    string l;
+    size_t ln{0};
+
+    while (getline(expFile, l)) {
+        if (++ln % 1000 == 0) {
+            cerr << "\r\rParsed " << ln << " expression lines";
+        }
+        auto it = find_if(l.begin(), l.end(),
+                    [](char c) -> bool {return !isspace(c);});
+        if (it != l.end()) {
+            if (*it == '#') {
+                comments.push_back(l);
+            } else {
+                vector<string> toks = split(l);
+                ExpressionRecord er(toks);
+                auto gn = tgm.geneName(er.target);
+                geneExps[gn].push_back(move(er));
+            }
+        }
+    }
+    cerr << "\ndone\n";
+    expFile.close();
+
+    cerr << "Aggregating expressions to gene level . . .";
+    boost::filesystem::path outputFilePath(inputPath);
+    outputFilePath.replace_extension(".genes.sf");
+    ofstream outFile(outputFilePath.string());
+
+    // preserve any comments in the output
+    for (auto& c : comments) {
+        outFile << c << '\n';
+    }
+
+    for (auto& kv : geneExps) {
+        auto& gn = kv.first;
+
+        uint32_t geneLength{kv.second.front().length};
+        vector<double> expVals(kv.second.front().expVals.size(), 0);
+        const size_t NE{expVals.size()};
+
+        for (auto& tranExp : kv.second) {
+            geneLength = max(geneLength, tranExp.length);
+            for (size_t i = 0; i < NE; ++i) { expVals[i] += tranExp.expVals[i]; }
+        }
+
+        outFile << gn << '\t' << geneLength;
+        for (size_t i = 0; i < NE; ++i) {
+            outFile << '\t' << expVals[i];
+        }
+        outFile << '\n';
+    }
+
+    outFile.close();
+    cerr << " done\n";
+    //====================== From GeneSum =====================
+}
+
+void generateGeneLevelEstimates(boost::filesystem::path& geneMapPath,
+                                boost::filesystem::path& estDir,
+                                bool haveBiasCorrectedFile) {
+    namespace bfs = boost::filesystem;
+    std::cerr << "Computing gene-level abundance estimates\n";
+    bfs::path gtfExtension(".gtf");
+    auto extension = geneMapPath.extension();
+
+    TranscriptGeneMap tranGeneMap;
+    // parse the map as a GTF file
+    if (extension == gtfExtension) {
+        // Using libgff
+        tranGeneMap = salmon::utils::transcriptGeneMapFromGTF(geneMapPath.string(), "gene_id");
+    } else { // parse the map as a simple format files
+        std::ifstream tgfile(geneMapPath.string());
+        tranGeneMap = salmon::utils::readTranscriptToGeneMap(tgfile);
+        tgfile.close();
+    }
+
+    std::cerr << "There were " << tranGeneMap.numTranscripts() << " transcripts mapping to "
+        << tranGeneMap.numGenes() << " genes\n";
+
+    bfs::path estFilePath = estDir / "quant.sf";
+    if (!bfs::exists(estFilePath)) {
+        std::stringstream errstr;
+        errstr << "Attempting to compute gene-level esimtates, but could not \n"
+            << "find isoform-level file " << estFilePath;
+        throw std::invalid_argument(errstr.str());
+    } else {
+        salmon::utils::aggregateEstimatesToGeneLevel(tranGeneMap, estFilePath);
+    }
+
+    /** Create a gene-level summary of the bias-corrected estimates as well if these exist **/
+    if (haveBiasCorrectedFile) {
+        bfs::path biasCorrectEstFilePath = estDir / "quant_bias_corrected.sf";
+        if (!bfs::exists(biasCorrectEstFilePath)) {
+            std::stringstream errstr;
+            errstr << "Attempting to compute gene-level esimtates, but could not \n"
+                << "find bias-corrected isoform-level file " << biasCorrectEstFilePath;
+            throw std::invalid_argument(errstr.str());
+        } else {
+            salmon::utils::aggregateEstimatesToGeneLevel(tranGeneMap, biasCorrectEstFilePath);
+        }
+    }
+}
+
+}
+}
+
+template
+void salmon::utils::writeAbundances<AlignmentLibrary<ReadPair>>(
+                                              const SalmonOpts& opts,
+                                              AlignmentLibrary<ReadPair>& alnLib,
+                                              boost::filesystem::path& fname,
+                                              std::string headerComments);
+
+template
+void salmon::utils::writeAbundances<AlignmentLibrary<UnpairedRead>>(
+                                                  const SalmonOpts& opts,
+                                                  AlignmentLibrary<UnpairedRead>& alnLib,
+                                                  boost::filesystem::path& fname,
+                                                  std::string headerComments);
+template
+void salmon::utils::writeAbundances<ReadExperiment>(
+                                                  const SalmonOpts& opts,
+                                                  ReadExperiment& alnLib,
+                                                  boost::filesystem::path& fname,
+                                                  std::string headerComments);
+template
+void salmon::utils::writeAbundancesFromCollapsed<AlignmentLibrary<ReadPair>>(
+                                              const SalmonOpts& opts,
+                                              AlignmentLibrary<ReadPair>& alnLib,
+                                              boost::filesystem::path& fname,
+                                              std::string headerComments);
+
+template
+void salmon::utils::writeAbundancesFromCollapsed<AlignmentLibrary<UnpairedRead>>(
+                                                  const SalmonOpts& opts,
+                                                  AlignmentLibrary<UnpairedRead>& alnLib,
+                                                  boost::filesystem::path& fname,
+                                                  std::string headerComments);
+template
+void salmon::utils::writeAbundancesFromCollapsed<ReadExperiment>(
+                                                  const SalmonOpts& opts,
+                                                  ReadExperiment& alnLib,
+                                                  boost::filesystem::path& fname,
+                                                  std::string headerComments);
+
+template
+void salmon::utils::normalizeAlphas<ReadExperiment>(const SalmonOpts& sopt,
+                         	     ReadExperiment& alnLib);
+
+template
+void salmon::utils::normalizeAlphas<AlignmentLibrary<UnpairedRead>>(const SalmonOpts& sopt,
+                         	     AlignmentLibrary<UnpairedRead>& alnLib);
+template
+void salmon::utils::normalizeAlphas<AlignmentLibrary<ReadPair>>(const SalmonOpts& sopt,
+                         	     AlignmentLibrary<ReadPair>& alnLib);
+
+// Old / unused code
+
+/*
+template< typename T >
+TranscriptGeneMap transcriptToGeneMapFromFeatures(std::vector<GenomicFeature<T>> &feats ) {
+    using std::unordered_set;
+    using std::unordered_map;
+    using std::vector;
+    using std::tuple;
+    using std::string;
+    using std::get;
+
+    using NameID = tuple<string, size_t>;
+
+    IndexVector t2g;
+    NameVector transcriptNames;
+    NameVector geneNames;
+
+    // holds the mapping from transcript ID to gene ID
+    IndexVector t2gUnordered;
+    // holds the set of gene IDs
+    unordered_map<string, size_t> geneNameToID;
+
+    // To read the input and assign ids
+    size_t geneCounter = 0;
+    string transcript;
+    string gene;
+
+    std::sort( feats.begin(), feats.end(),
+	    []( const GenomicFeature<T> & a, const GenomicFeature<T> & b) -> bool {
+	    return a.sattr.transcript_id < b.sattr.transcript_id;
+	    } );
+
+    std::string currentTranscript = "";
+    for ( auto & feat : feats ) {
+
+	auto &gene = feat.sattr.gene_id;
+	auto &transcript = feat.sattr.transcript_id;
+
+	if ( transcript != currentTranscript ) {
+	    auto geneIt = geneNameToID.find(gene);
+	    size_t geneID = 0;
+
+	    if ( geneIt == geneNameToID.end() ) {
+		// If we haven't seen this gene yet, give it a new ID
+		geneNameToID[gene] = geneCounter;
+		geneID = geneCounter;
+		geneNames.push_back(gene);
+		++geneCounter;
+	    } else {
+		// Otherwise lookup the ID
+		geneID = geneIt->second;
+	    }
+
+	    transcriptNames.push_back(transcript);
+	    t2g.push_back(geneID);
+
+	    //++transcriptID;
+	    currentTranscript = transcript;
+	}
+
+    }
+
+    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
+}
+*/
+
+
+
diff --git a/src/SequenceBiasModel.cpp b/src/SequenceBiasModel.cpp
new file mode 100644
index 0000000..ac33e06
--- /dev/null
+++ b/src/SequenceBiasModel.cpp
@@ -0,0 +1,311 @@
+#include <iostream>
+#include <cstdio>
+
+#include <boost/config.hpp> // for BOOST_LIKELY/BOOST_UNLIKELY
+
+#include "format.h"
+#include "SequenceBiasModel.hpp"
+#include "LibraryFormat.hpp"
+#include "Transcript.hpp"
+
+
+SequenceBiasModel::SequenceBiasModel(double alpha, uint32_t windowSize) :
+    // Let's try a 0th order model first
+    biasLeftForeground_(AtomicMatrix<double>(windowSize, numBases(), alpha)),
+    biasLeftBackground_(AtomicMatrix<double>(windowSize, numBases(), alpha)),
+    biasRightForeground_(AtomicMatrix<double>(windowSize, numBases(), alpha)),
+    biasRightBackground_(AtomicMatrix<double>(windowSize, numBases(), alpha)),
+    isEnabled_(true),
+    windowSize_(windowSize),
+    burnedIn_(false) {}
+
+bool SequenceBiasModel::burnedIn() { return burnedIn_; }
+void SequenceBiasModel::burnedIn(bool burnedIn) { burnedIn_ = burnedIn; }
+
+void SequenceBiasModel::setLogger(std::shared_ptr<spdlog::logger> logger){
+    logger_ = logger;
+}
+
+bool SequenceBiasModel::hasLogger(){
+    return (logger_) ? true : false;
+}
+
+bool SequenceBiasModel::update(Transcript& ref,
+                               int32_t pos,
+                               bool fwd,
+                               double mass,
+                               double prob,
+                               AtomicMatrix<double>& sequenceProfile) {
+
+    int32_t halfWindow = windowSize_ >> 1;
+    int32_t txpLen = ref.RefLength;
+
+    // We must be at least halfWindow bases into the transcript
+    // and no fewer than halfWindow bases from the end
+    if (pos < halfWindow) { return false; }
+    if (pos + halfWindow >= txpLen) { return false; }
+
+    int32_t start, stop, inc;
+    salmon::stringtools::strand readStrand = salmon::stringtools::strand::forward;
+    if (fwd) {
+        start = pos - halfWindow;
+        stop = pos + halfWindow;
+        for (int p = start; p <= stop; ++p) {
+            size_t currRefBase = salmon::stringtools::samToTwoBit[ref.baseAt(p, readStrand)];
+            sequenceProfile.increment(p - start, currRefBase, mass+prob);
+        }
+    } else {
+        start = pos + halfWindow;
+        stop = pos - halfWindow;
+        for (int p = start; p >= stop; --p) {
+            size_t currRefBase = salmon::stringtools::samToTwoBit[ref.baseAt(p, readStrand)];
+            sequenceProfile.increment(start - p, currRefBase, mass+prob);
+        }
+    }
+    return true;
+}
+
+bool SequenceBiasModel::update(Transcript& ref,
+                               int32_t pos,
+                               LibraryFormat libFmt,
+                               double mass,
+                               double prob) {
+
+    uint32_t offset{100};
+    int32_t bgPos = pos - offset;
+    bool fwd = true;
+    if (libFmt.strandedness == ReadStrandedness::S) {
+        bgPos = pos - offset;
+        fwd = true;
+    } else {
+        bgPos = pos + offset;
+        fwd = false;
+    }
+    int32_t halfWindow = windowSize_ >> 1;
+    int32_t txpLen = ref.RefLength;
+
+    // We must be at least halfWindow bases into the transcript
+    // and no fewer than halfWindow bases from the end
+    if (pos - halfWindow < 100) { return false; }
+    if (pos + halfWindow >= txpLen - 100) { return false; }
+    if (bgPos - halfWindow < 100) { return false; }
+    if (bgPos + halfWindow >= txpLen - 100) { return false; }
+
+    update(ref, pos, fwd, mass, prob, biasLeftForeground_);
+    update(ref, bgPos, fwd, mass, prob, biasLeftBackground_);
+    return true;
+}
+
+bool SequenceBiasModel::update(Transcript& ref,
+                               int32_t pos1,
+                               int32_t pos2,
+                               LibraryFormat libFmt,
+                               double mass,
+                               double prob) {
+
+    int32_t leftPos, rightPos;
+    if (pos1 < pos2) {
+        leftPos = pos1;
+        rightPos = pos2;
+    } else {
+        rightPos = pos1;
+        leftPos = pos2;
+    }
+
+    uint32_t offset{100};
+    int32_t bgPosLeft = leftPos - offset;
+    int32_t bgPosRight = rightPos + offset;
+    int32_t halfWindow = windowSize_ >> 1;
+    int32_t txpLen = ref.RefLength;
+
+    // We must be at least halfWindow bases into the transcript
+    // and no fewer than halfWindow bases from the end
+    if (leftPos < halfWindow) { return false; }
+    if (leftPos + halfWindow >= txpLen) { return false; }
+    if (bgPosLeft < halfWindow) { return false; }
+    if (bgPosLeft + halfWindow >= txpLen) { return false; }
+    if (rightPos < halfWindow) { return false; }
+    if (rightPos + halfWindow >= txpLen) { return false; }
+    if (bgPosRight < halfWindow) { return false; }
+    if (bgPosRight + halfWindow >= txpLen) { return false; }
+
+    update(ref, leftPos, true, mass, prob, biasLeftForeground_);
+    update(ref, bgPosLeft, true, mass, prob, biasLeftBackground_);
+    update(ref, rightPos, true, mass, prob, biasRightForeground_);
+    update(ref, bgPosRight, true, mass, prob, biasRightBackground_);
+    return true;
+}
+
+double SequenceBiasModel::seqProb(Transcript& ref,
+                                  int32_t pos,
+                                  bool isFwd,
+                                  AtomicMatrix<double>& profile) {
+    int32_t halfWindow = windowSize_ >> 1;
+    salmon::stringtools::strand readStrand = salmon::stringtools::strand::forward;
+    int32_t start, stop;
+    double prob = salmon::math::LOG_1;
+
+    if (isFwd) {
+        start = pos - halfWindow;
+        stop = pos + halfWindow;
+        for (int p = start; p <= stop; ++p) {
+            size_t currRefBase = salmon::stringtools::samToTwoBit[ref.baseAt(p, readStrand)];
+            prob += profile(p - start, currRefBase);
+        }
+    } else {
+        start = pos + halfWindow;
+        stop = pos - halfWindow;
+        for (int p = start; p >= stop; --p) {
+            size_t currRefBase = salmon::stringtools::samToTwoBit[ref.baseAt(p, readStrand)];
+            prob += profile(start - p, currRefBase);
+        }
+    }
+
+    return prob;
+}
+
+
+double SequenceBiasModel::biasFactor(Transcript& ref,
+                                     int32_t pos,
+                                     LibraryFormat libFormat,
+                                     AtomicMatrix<double>& foregroundProfile,
+                                     AtomicMatrix<double>& backgroundProfile) {
+
+    int32_t halfWindow = windowSize_ >> 1;
+    int32_t txpLen = ref.RefLength;
+
+    bool isFwd = true;
+    if (libFormat.strandedness == ReadStrandedness::S) {
+        isFwd = true;
+    } else {
+        isFwd = false;
+    }
+
+    // We must be at least halfWindow bases into the transcript
+    // and no fewer than halfWindow bases from the end
+    if (pos - halfWindow > 100) { return salmon::math::LOG_1; }
+    if (pos + halfWindow >= txpLen - 100) { return salmon::math::LOG_1; }
+
+    double condProbBack = seqProb(ref, pos, isFwd, backgroundProfile);
+    double condProbFore = seqProb(ref, pos, isFwd, foregroundProfile);
+    return (condProbBack - condProbFore);
+}
+
+double SequenceBiasModel::biasFactor(Transcript& ref,
+                                     int32_t pos1,
+                                     int32_t pos2,
+                                     LibraryFormat libFormat) {
+
+    int32_t leftPos, rightPos;
+    if (pos1 < pos2) {
+        leftPos = pos1;
+        rightPos = pos2;
+    } else {
+        rightPos = pos1;
+        leftPos = pos2;
+    }
+
+    int32_t halfWindow = windowSize_ >> 1;
+    int32_t txpLen = ref.RefLength;
+
+    // We must be at least halfWindow bases into the transcript
+    // and no fewer than halfWindow bases from the end
+    if (leftPos < halfWindow) { return salmon::math::LOG_1; }
+    if (leftPos + halfWindow >= txpLen) { return salmon::math::LOG_1; }
+    if (rightPos < halfWindow) { return salmon::math::LOG_1; }
+    if (rightPos + halfWindow >= txpLen) { return salmon::math::LOG_1; }
+
+    double biasLeft = biasFactor(ref, leftPos, libFormat,
+                                 biasLeftForeground_, biasLeftBackground_);
+    double biasRight = biasFactor(ref, rightPos, libFormat,
+                                  biasRightForeground_, biasRightBackground_);
+
+    // This is in log space, so it's actually log(leftBias * rightBias).  Is
+    // multiplying the biases the right thing to do here, or should we average
+    // them (i.e. log([leftBias + rightBias]/2))?
+    return biasLeft + biasRight;
+}
+
+double SequenceBiasModel::biasFactor(Transcript& ref, int32_t pos,
+                                     LibraryFormat libFormat) {
+
+    int32_t halfWindow = windowSize_ >> 1;
+    int32_t txpLen = ref.RefLength;
+
+    // We must be at least halfWindow bases into the transcript
+    // and no fewer than halfWindow bases from the end
+    if (pos < halfWindow) { return salmon::math::LOG_1; }
+    if (pos + halfWindow >= txpLen) { return salmon::math::LOG_1; }
+
+    double bias = biasFactor(ref, pos, libFormat,
+                             biasLeftForeground_, biasLeftBackground_);
+    return bias;
+}
+
+std::string SequenceBiasModel::toString() {
+    fmt::MemoryWriter srep;
+    srep << "{";
+    srep << "\"fg\": {";
+    for (size_t i = 0; i < windowSize_; ++i) {
+        srep << "\"" << i << "\":{";
+        for (size_t r = 0; r < numBases(); ++r) {
+            switch (r) {
+                case 0:
+                    srep << "\"A\":";
+                    break;
+                case 1:
+                    srep << "\"C\":";
+                    break;
+                case 2:
+                    srep << "\"G\":";
+                    break;
+                case 3:
+                    srep << "\"T\":";
+                    break;
+            }
+            srep << std::exp(biasLeftForeground_(i,r));
+            if (r < numBases()-1) {
+                srep << ", ";
+            } else {
+                srep << "} ";
+            }
+        }
+        if (i < windowSize_ -1) {
+            srep << ",";
+        }
+    }
+
+    srep << "}, ";
+    srep << "\"bg\": {";
+    for (size_t i = 0; i < windowSize_; ++i) {
+        srep << "\"" << i << "\":{";
+        for (size_t r = 0; r < numBases(); ++r) {
+            switch (r) {
+                case 0:
+                    srep << "\"A\":";
+                    break;
+                case 1:
+                    srep << "\"C\":";
+                    break;
+                case 2:
+                    srep << "\"G\":";
+                    break;
+                case 3:
+                    srep << "\"T\":";
+                    break;
+            }
+            srep << std::exp(biasLeftBackground_(i,r));
+            if (r < numBases()-1) {
+                srep << ", ";
+            } else {
+                srep << "} ";
+            }
+        }
+        if (i < windowSize_ -1) {
+            srep << ",";
+        }
+    }
+    srep << "}}";
+
+    return srep.str();
+}
diff --git a/src/StadenUtils.cpp b/src/StadenUtils.cpp
new file mode 100644
index 0000000..0b90118
--- /dev/null
+++ b/src/StadenUtils.cpp
@@ -0,0 +1,17 @@
+#include "StadenUtils.hpp"
+
+namespace staden {
+    namespace utils{
+
+        bam_seq_t* bam_init() {
+            return reinterpret_cast<bam_seq_t*>(calloc(1, sizeof(bam_seq_t)));
+        }
+
+        void bam_destroy(bam_seq_t* b) {
+            if (b == 0) { return; }
+            free(b);
+        }
+
+    }
+}
+
diff --git a/src/TestUtils.cpp b/src/TestUtils.cpp
new file mode 100644
index 0000000..7c4f6f0
--- /dev/null
+++ b/src/TestUtils.cpp
@@ -0,0 +1,72 @@
+/**
+>HEADER
+    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+
+    This file is part of Salmon.
+
+    Salmon is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Salmon is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
+<HEADER
+**/
+
+
+#include <vector>
+#include <fstream>
+#include <memory>
+#include <boost/range/irange.hpp>
+
+#include "utils.hpp"
+#include "matrix_tools.hpp"
+
+int main(int argc, char* argv[]) {
+	using std::vector;
+	using std::string;
+	using std::ifstream;
+
+	ifstream ifile(argv[1]);
+	auto t2g = utils::readTranscriptToGeneMap(ifile);
+	std::cerr << "read " << t2g.numTranscripts() << " transcripts\n";
+	std::cerr << "which mapped to " << t2g.numGenes() << " genes\n";
+    ifile.close();
+
+
+    /*
+    *  A = [0 1 0 0 0 1 1]
+	*      [0 0 1 1 0 1 1]
+	*      [0 0 0 1 1 0 0]
+	*  counts = [ 0 4 4 8 5 3 2 ]
+	*/
+    std::vector<std::vector<double>> A{{0,1,0,0,0,1,1},
+                                       {0,0,1,1,0,1,1},
+                                       {0,0,0,1,1,0,0}};
+    std::vector<double> counts{0,4,4,8,5,3,2};
+
+    std::unique_ptr<std::vector<std::vector<double>>> collapsedA{nullptr};
+    std::unique_ptr<std::vector<double>> collapsedCounts{nullptr};
+    matrix_tools::collapseIntoCategories(A, counts, collapsedA,collapsedCounts);
+
+    for (size_t i = 0; i < collapsedA->size(); ++i ){
+    	std::cerr << "[";
+    	for ( size_t j = 0; j < (*collapsedA)[0].size(); ++j ) {
+    		std::cerr << (*collapsedA)[i][j] << ", ";
+    	}
+    	std::cerr << "]\n";
+    }
+    std::cerr << "\n[";
+    for ( auto c : *collapsedCounts ) { std::cerr << c << ", "; }
+    std::cerr << "]\n";
+
+    for ( auto i : boost::irange(0, 500) ) {
+        std::cerr << i << ", ";
+    }
+}
\ No newline at end of file
diff --git a/src/TranscriptGroup.cpp b/src/TranscriptGroup.cpp
new file mode 100644
index 0000000..0870a05
--- /dev/null
+++ b/src/TranscriptGroup.cpp
@@ -0,0 +1,55 @@
+#include <vector>
+#include <cstdint>
+
+#include "TranscriptGroup.hpp"
+#include "SalmonMath.hpp"
+
+
+TranscriptGroup::TranscriptGroup() : hash(0) {}
+
+TranscriptGroup::TranscriptGroup(std::vector<uint32_t> txpsIn) : txps(txpsIn),
+    valid(true) {
+        size_t seed{0};
+        for (auto e : txps) {
+            boost::hash_combine(seed, e);
+        }
+        hash = seed;
+    }
+
+TranscriptGroup::TranscriptGroup(
+        std::vector<uint32_t> txpsIn,
+	    size_t hashIn) : txps(txpsIn), hash(hashIn), valid(true) {}
+
+TranscriptGroup::TranscriptGroup(const TranscriptGroup& other){
+    txps = other.txps;
+    hash = other.hash;
+    valid = other.valid;
+}
+
+TranscriptGroup& TranscriptGroup::operator=(const TranscriptGroup& other){
+    txps = other.txps;
+    hash = other.hash;
+    valid = other.valid;
+    return *this;
+}
+
+TranscriptGroup::TranscriptGroup(TranscriptGroup&& other) {
+    txps = std::move(other.txps);
+    hash = other.hash;
+    valid = other.valid;
+}
+
+void TranscriptGroup::setValid(bool b) const { valid = b; }
+
+TranscriptGroup& TranscriptGroup::operator=(TranscriptGroup&& other) {
+    txps = std::move(other.txps);
+    hash = other.hash;
+    valid = other.valid;
+    return *this;
+}
+
+bool operator==(const TranscriptGroup& lhs, const TranscriptGroup& rhs) {
+    return lhs.txps == rhs.txps;
+};
+
+
diff --git a/src/VersionChecker.cpp b/src/VersionChecker.cpp
new file mode 100644
index 0000000..694230e
--- /dev/null
+++ b/src/VersionChecker.cpp
@@ -0,0 +1,186 @@
+// based off of
+// async_client.cpp
+// ~~~~~~~~~~~~~~~~
+//
+// Copyright (c) 2003-2012 Christopher M. Kohlhoff (chris at kohlhoff dot com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+
+#include "VersionChecker.hpp"
+#include "SalmonConfig.hpp"
+
+using boost::asio::ip::tcp;
+
+
+VersionChecker::VersionChecker(boost::asio::io_service& io_service,
+  const std::string& server, const std::string& path)
+  : resolver_(io_service),
+  socket_(io_service),
+  deadline_(io_service)
+{
+    // Form the request. We specify the "Connection: close" header so that the
+    // server will close the socket after transmitting the response. This will
+    // allow us to treat all data up until the EOF as the content.
+    std::ostream request_stream(&request_);
+    request_stream << "GET " << path << " HTTP/1.0\r\n";
+    request_stream << "Host: " << server << "\r\n";
+    request_stream << "Accept: */*\r\n";
+    request_stream << "Connection: close\r\n\r\n";
+
+    deadline_.async_wait(boost::bind(&VersionChecker::cancel_upgrade_check, this, boost::asio::placeholders::error));
+    deadline_.expires_from_now(boost::posix_time::seconds(7));
+
+    // Start an asynchronous resolve to translate the server and service names
+    // into a list of endpoints.
+    tcp::resolver::query query(server, "http");
+    resolver_.async_resolve(query,
+        boost::bind(&VersionChecker::handle_resolve, this,
+          boost::asio::placeholders::error,
+          boost::asio::placeholders::iterator));
+}
+
+std::string VersionChecker::message() {
+  return messageStream_.str();
+}
+
+
+void VersionChecker::cancel_upgrade_check(const boost::system::error_code& err) {
+  if ( err != boost::asio::error::operation_aborted ) {
+    deadline_.cancel();
+    messageStream_ << "Could not resolve upgrade information in the alotted time.\n";
+    messageStream_ << "Check for upgrades manually at https://combine-lab.github.io/salmon\n";
+    socket_.close();
+  }
+}
+
+void VersionChecker::handle_resolve(const boost::system::error_code& err,
+      tcp::resolver::iterator endpoint_iterator) {
+  if (!err) {
+    // Attempt a connection to each endpoint in the list until we
+    // successfully establish a connection.
+    boost::asio::async_connect(socket_, endpoint_iterator,
+      boost::bind(&VersionChecker::handle_connect, this,
+        boost::asio::placeholders::error));
+  } else {
+    deadline_.cancel();
+    cancel_upgrade_check(err);
+  }
+}
+
+void VersionChecker::handle_connect(const boost::system::error_code& err) {
+  if (!err) {
+    // The connection was successful. Send the request.
+    boost::asio::async_write(socket_, request_,
+      boost::bind(&VersionChecker::handle_write_request, this,
+        boost::asio::placeholders::error));
+  } else {
+    deadline_.cancel();
+    cancel_upgrade_check(err);
+  }
+}
+
+void VersionChecker::handle_write_request(const boost::system::error_code& err) {
+  if (!err) {
+    // Read the response status line. The response_ streambuf will
+    // automatically grow to accommodate the entire line. The growth may be
+    // limited by passing a maximum size to the streambuf constructor.
+    boost::asio::async_read_until(socket_, response_, "\r\n",
+      boost::bind(&VersionChecker::handle_read_status_line, this,
+        boost::asio::placeholders::error));
+  } else {
+    deadline_.cancel();
+    cancel_upgrade_check(err);
+  }
+}
+
+void VersionChecker::handle_read_status_line(const boost::system::error_code& err) {
+  if (!err) {
+    // Check that response is OK.
+    std::istream response_stream(&response_);
+    std::string http_version;
+    response_stream >> http_version;
+    unsigned int status_code;
+    response_stream >> status_code;
+    std::string status_message;
+    std::getline(response_stream, status_message);
+    if (!response_stream || http_version.substr(0, 5) != "HTTP/") {
+      deadline_.cancel();
+      cancel_upgrade_check(err);
+      return;
+    }
+    if (status_code != 200) {
+      deadline_.cancel();
+      cancel_upgrade_check(err);
+      return;
+    }
+
+    // Read the response headers, which are terminated by a blank line.
+    boost::asio::async_read_until(socket_, response_, "\r\n\r\n",
+      boost::bind(&VersionChecker::handle_read_headers, this,
+        boost::asio::placeholders::error));
+  } else {
+    deadline_.cancel();
+    cancel_upgrade_check(err);
+  }
+}
+
+void VersionChecker::handle_read_headers(const boost::system::error_code& err) {
+  if (!err) {
+    deadline_.cancel();
+    // Process the response headers.
+    std::istream response_stream(&response_);
+    std::string header;
+    while (std::getline(response_stream, header) && header != "\r") {}
+
+    // Write whatever content we already have to output.
+    if (response_.size() > 0)
+      messageStream_ << &response_;
+
+    // Start reading remaining data until EOF.
+    boost::asio::async_read(socket_, response_,
+      boost::asio::transfer_at_least(1),
+      boost::bind(&VersionChecker::handle_read_content, this,
+        boost::asio::placeholders::error));
+  } else {
+    deadline_.cancel();
+    cancel_upgrade_check(err);
+  }
+}
+
+void VersionChecker::handle_read_content(const boost::system::error_code& err) {
+  if (!err) {
+    // Write all of the data that has been read so far.
+    messageStream_ << &response_;
+
+    // Continue reading remaining data until EOF.
+    boost::asio::async_read(socket_, response_,
+      boost::asio::transfer_at_least(1),
+      boost::bind(&VersionChecker::handle_read_content, this,
+        boost::asio::placeholders::error));
+  } else if (err != boost::asio::error::eof) {
+    deadline_.cancel();
+    cancel_upgrade_check(err);
+  }
+}
+
+std::string getVersionMessage() {
+  std::string baseSite{"combine-lab.github.io"};
+  std::string path{"/salmon/version_info/"};
+  path += salmon::version;
+
+  std::stringstream ss;
+  try {
+    boost::asio::io_service io_service;
+    VersionChecker c(io_service, baseSite, path);
+    io_service.run();
+    ss << "Version Info: " << c.message();
+  }
+  catch (std::exception& e)
+  {
+    ss << "Version Info Exception: " << e.what() << "\n";
+  }
+
+  return ss.str();
+}
diff --git a/src/bwt_gen.c b/src/bwt_gen.c
new file mode 100644
index 0000000..76f28c9
--- /dev/null
+++ b/src/bwt_gen.c
@@ -0,0 +1,1632 @@
+/*
+
+   BWTConstruct.c		BWT-Index Construction
+
+   This module constructs BWT and auxiliary data structures.
+
+   Copyright (C) 2004, Wong Chi Kwong.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; either version 2
+   of the License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include <errno.h>
+#include "QSufSort.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+#  include "malloc_wrap.h"
+#endif
+
+typedef uint64_t bgint_t;
+typedef int64_t sbgint_t;
+
+#define ALPHABET_SIZE				4
+#define BIT_PER_CHAR				2
+#define CHAR_PER_WORD				16
+#define CHAR_PER_BYTE				4
+
+#define BITS_IN_WORD 32
+#define BITS_IN_BYTE 8
+#define BYTES_IN_WORD 4
+
+#define ALL_ONE_MASK 0xFFFFFFFF
+#define DNA_OCC_CNT_TABLE_SIZE_IN_WORD	65536
+
+#define BITS_PER_OCC_VALUE			16
+#define OCC_VALUE_PER_WORD			2
+#define OCC_INTERVAL				256
+#define OCC_INTERVAL_MAJOR			65536
+
+#define TRUE    1
+#define FALSE   0
+
+#define BWTINC_INSERT_SORT_NUM_ITEM 7
+
+#define MIN_AVAILABLE_WORD 0x10000
+
+#define average(value1, value2)					( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 )
+#define min(value1, value2)						( ((value1) < (value2)) ? (value1) : (value2) )
+#define max(value1, value2)						( ((value1) > (value2)) ? (value1) : (value2) )
+#define med3(a, b, c)							( a<b ? (b<c ? b : a<c ? c : a) : (b>c ? b : a>c ? c : a))
+#define swap(a, b, t);							t = a; a = b; b = t;
+#define truncateLeft(value, offset)				( (value) << (offset) >> (offset) )
+#define truncateRight(value, offset)			( (value) >> (offset) << (offset) )
+#define DNA_OCC_SUM_EXCEPTION(sum)			((sum & 0xfefefeff) == 0)
+
+typedef struct BWT {
+	bgint_t textLength;					// length of the text
+	bgint_t inverseSa0;					// SA-1[0]
+	bgint_t *cumulativeFreq;			// cumulative frequency
+	unsigned int *bwtCode;				// BWT code
+	unsigned int *occValue;				// Occurrence values stored explicitly
+	bgint_t *occValueMajor;				// Occurrence values stored explicitly
+	unsigned int *decodeTable;			// For decoding BWT by table lookup
+	bgint_t bwtSizeInWord;				// Temporary variable to hold the memory allocated
+	bgint_t occSizeInWord;				// Temporary variable to hold the memory allocated
+	bgint_t occMajorSizeInWord;			// Temporary variable to hold the memory allocated
+} BWT;
+
+typedef struct BWTInc {
+	BWT *bwt;
+	unsigned int numberOfIterationDone;
+	bgint_t *cumulativeCountInCurrentBuild;
+	bgint_t availableWord;
+	bgint_t buildSize;
+	bgint_t initialMaxBuildSize;
+	bgint_t incMaxBuildSize;
+	unsigned int firstCharInLastIteration;
+	unsigned int *workingMemory;
+	unsigned int *packedText;
+	unsigned char *textBuffer;
+	unsigned int *packedShift;
+} BWTInc;
+
+static bgint_t TextLengthFromBytePacked(bgint_t bytePackedLength, unsigned int bitPerChar,
+											 unsigned int lastByteLength)
+{
+	return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength;
+}
+
+static void initializeVAL(unsigned int *startAddr, const bgint_t length, const unsigned int initValue)
+{
+	bgint_t i;
+	for (i=0; i<length; i++) startAddr[i] = initValue;
+}
+
+static void initializeVAL_bg(bgint_t *startAddr, const bgint_t length, const bgint_t initValue)
+{
+	bgint_t i;
+	for (i=0; i<length; i++) startAddr[i] = initValue;
+}
+
+static void GenerateDNAOccCountTable(unsigned int *dnaDecodeTable)
+{
+	unsigned int i, j, c, t;
+
+	for (i=0; i<DNA_OCC_CNT_TABLE_SIZE_IN_WORD; i++) {
+		dnaDecodeTable[i] = 0;
+		c = i;
+		for (j=0; j<8; j++) {
+			t = c & 0x00000003;
+			dnaDecodeTable[i] += 1 << (t * 8);
+			c >>= 2;
+		}
+	}
+
+}
+// for BWTIncCreate()
+static bgint_t BWTOccValueMajorSizeInWord(const bgint_t numChar)
+{
+	bgint_t numOfOccValue;
+	unsigned numOfOccIntervalPerMajor;
+	numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
+	numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
+	return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE;
+}
+// for BWTIncCreate()
+static bgint_t BWTOccValueMinorSizeInWord(const bgint_t numChar)
+{
+	bgint_t numOfOccValue;
+	numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;		// Value at both end for bi-directional encoding
+	return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE;
+}
+// for BWTIncCreate()
+static bgint_t BWTResidentSizeInWord(const bgint_t numChar) {
+
+	bgint_t numCharRoundUpToOccInterval;
+
+	// The $ in BWT at the position of inverseSa0 is not encoded
+	numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL;
+
+	return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
+
+}
+
+static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc)
+{
+	bgint_t maxBuildSize;
+
+	if (bwtInc->bwt->textLength == 0) {
+		// initial build
+		// Minus 2 because n+1 entries of seq and rank needed for n char
+		maxBuildSize = (bwtInc->availableWord - (2 + OCC_INTERVAL / CHAR_PER_WORD) * (sizeof(bgint_t) / 4))
+							/ (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD / (sizeof(bgint_t) / 4);
+		if (bwtInc->initialMaxBuildSize > 0) {
+			bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize);
+		} else {
+			bwtInc->buildSize = maxBuildSize;
+		}
+	} else {
+		// Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char
+		// Minus numberOfIterationDone because bwt slightly shift to left in each iteration
+		maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord
+							 - (3 + bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) * (sizeof(bgint_t) / 4)) 
+							 / 3 / (sizeof(bgint_t) / 4);
+		if (maxBuildSize < CHAR_PER_WORD) {
+			fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
+			exit(1);
+		}
+		if (bwtInc->incMaxBuildSize > 0) {
+            bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize);
+		} else {
+			bwtInc->buildSize = maxBuildSize;
+		}
+		if (bwtInc->buildSize < CHAR_PER_WORD)
+			bwtInc->buildSize = CHAR_PER_WORD;
+	}
+
+	if (bwtInc->buildSize < CHAR_PER_WORD) {
+		fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
+		exit(1);
+	}
+
+	bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
+
+	bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4);
+	bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4));
+}
+
+// for ceilLog2()
+unsigned int leadingZero(const unsigned int input)
+{
+	unsigned int l;
+	const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+											 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+											 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+											 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+											 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+											 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+											 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+											 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+	if (input & 0xFFFF0000) {
+		if (input & 0xFF000000) {
+			l = leadingZero8bit[input >> 24];
+		} else {
+			l = 8 + leadingZero8bit[input >> 16];
+		}
+	} else {
+		if (input & 0x0000FF00) {
+			l = 16 + leadingZero8bit[input >> 8];
+		} else {
+			l = 24 + leadingZero8bit[input];
+		}
+	}
+	return l;
+
+}
+// for BitPerBytePackedChar()
+static unsigned int ceilLog2(const unsigned int input)
+{
+	if (input <= 1) return 0;
+	return BITS_IN_WORD - leadingZero(input - 1);
+
+}
+// for ConvertBytePackedToWordPacked()
+static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize)
+{
+	unsigned int bitPerChar;
+	bitPerChar = ceilLog2(alphabetSize);
+	// Return the largest number of bit that does not affect packing efficiency
+	if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar)
+		bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar);
+	return bitPerChar;
+}
+// for ConvertBytePackedToWordPacked()
+static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize)
+{
+	return ceilLog2(alphabetSize);
+}
+
+static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize,
+										  const bgint_t textLength)
+{
+	bgint_t i;
+	unsigned int j, k, c;
+	unsigned int bitPerBytePackedChar;
+	unsigned int bitPerWordPackedChar;
+	unsigned int charPerWord;
+	unsigned int charPerByte;
+	unsigned int bytePerIteration;
+	bgint_t byteProcessed = 0;
+	bgint_t wordProcessed = 0;
+	unsigned int mask, shift;
+	
+	unsigned int buffer[BITS_IN_WORD];
+
+	bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize);
+	bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize);
+	charPerByte = BITS_IN_BYTE / bitPerBytePackedChar;
+	charPerWord = BITS_IN_WORD / bitPerWordPackedChar;
+
+	bytePerIteration = charPerWord / charPerByte;
+	mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar);
+	shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar;
+
+	while ((wordProcessed + 1) * charPerWord < textLength) {
+
+		k = 0;
+		for (i=0; i<bytePerIteration; i++) {
+			c = (unsigned int)input[byteProcessed] << shift;
+			for (j=0; j<charPerByte; j++) {
+				buffer[k] = c & mask;
+				c <<= bitPerBytePackedChar;
+				k++;
+			}
+			byteProcessed++;
+		}
+
+		c = 0;
+		for (i=0; i<charPerWord; i++) {
+			c |= buffer[i] >> bitPerWordPackedChar * i;
+		}
+		output[wordProcessed] = c;
+		wordProcessed++;
+
+	}
+
+	k = 0;
+	for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) {
+		c = (unsigned int)input[byteProcessed] << shift;
+		for (j=0; j<charPerByte; j++) {
+			buffer[k] = c & mask;
+			c <<= bitPerBytePackedChar;
+			k++;
+		}
+		byteProcessed++;
+	}
+
+	c = 0;
+	for (i=0; i<textLength - wordProcessed * charPerWord; i++) {
+		c |= buffer[i] >> bitPerWordPackedChar * i;
+	}
+	output[wordProcessed] = c;
+}
+
+BWT *BWTCreate(const bgint_t textLength, unsigned int *decodeTable)
+{
+	BWT *bwt;
+
+	bwt = (BWT*)calloc(1, sizeof(BWT));
+
+	bwt->textLength = 0;
+
+	bwt->cumulativeFreq = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t));
+	initializeVAL_bg(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0);
+
+	bwt->bwtSizeInWord = 0;
+
+	// Generate decode tables
+	if (decodeTable == NULL) {
+		bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int));
+		GenerateDNAOccCountTable(bwt->decodeTable);
+	} else {
+		bwt->decodeTable = decodeTable;
+	}
+
+	bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength);
+	bwt->occValueMajor = (bgint_t*)calloc(bwt->occMajorSizeInWord, sizeof(bgint_t));
+
+	bwt->occSizeInWord = 0;
+	bwt->occValue = NULL;
+
+	return bwt;
+}
+
+BWTInc *BWTIncCreate(const bgint_t textLength, unsigned int initialMaxBuildSize, unsigned int incMaxBuildSize)
+{
+	BWTInc *bwtInc;
+	unsigned int i, n_iter;
+
+	if (textLength < incMaxBuildSize) incMaxBuildSize = textLength;
+	if (textLength < initialMaxBuildSize) initialMaxBuildSize = textLength;
+
+	bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc));
+	bwtInc->numberOfIterationDone = 0;
+	bwtInc->bwt = BWTCreate(textLength, NULL);
+	bwtInc->initialMaxBuildSize = initialMaxBuildSize;
+	bwtInc->incMaxBuildSize = incMaxBuildSize;
+	bwtInc->cumulativeCountInCurrentBuild = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t));
+	initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
+
+	// Build frequently accessed data
+	bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int));
+	for (i=0; i<CHAR_PER_WORD; i++)
+		bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR;
+
+	n_iter = (textLength - initialMaxBuildSize) / incMaxBuildSize + 1;
+	bwtInc->availableWord = BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength) // minimal memory requirement
+		+ OCC_INTERVAL / BIT_PER_CHAR * n_iter * 2 * (sizeof(bgint_t) / 4) // buffer at the end of occ array 
+		+ incMaxBuildSize/5 * 3 * (sizeof(bgint_t) / 4); // space for the 3 temporary arrays in each iteration
+	if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD; // lh3: otherwise segfaul when availableWord is too small
+	fprintf(stderr, "[%s] textLength=%ld, availableWord=%ld\n", __func__, (long)textLength, (long)bwtInc->availableWord);
+	bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD);
+
+	return bwtInc;
+}
+// for BWTIncConstruct()
+static void BWTIncPutPackedTextToRank(const unsigned int *packedText, bgint_t* __restrict rank,
+									  bgint_t* __restrict cumulativeCount, const bgint_t numChar)
+{
+	bgint_t i;
+	unsigned int j;
+	unsigned int c, t;
+	unsigned int packedMask;
+	bgint_t rankIndex;
+	bgint_t lastWord;
+	unsigned int numCharInLastWord;
+
+	lastWord = (numChar - 1) / CHAR_PER_WORD;
+	numCharInLastWord = numChar - lastWord * CHAR_PER_WORD;
+
+	packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR);
+	rankIndex = numChar - 1;
+
+	t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR);
+	for (i=0; i<numCharInLastWord; i++) {
+		c = t & packedMask;
+		cumulativeCount[c+1]++;
+		rank[rankIndex] = c;
+		rankIndex--;
+		t >>= BIT_PER_CHAR;
+	}
+
+	for (i=lastWord; i--;) {	// loop from lastWord - 1 to 0
+		t = packedText[i];
+		for (j=0; j<CHAR_PER_WORD; j++) {
+			c = t & packedMask;
+			cumulativeCount[c+1]++;
+			rank[rankIndex] = c;
+			rankIndex--;
+			t >>= BIT_PER_CHAR;
+		}
+	}
+
+	// Convert occurrence to cumulativeCount
+	cumulativeCount[2] += cumulativeCount[1];
+	cumulativeCount[3] += cumulativeCount[2];
+	cumulativeCount[4] += cumulativeCount[3];
+}
+
+
+static void ForwardDNAAllOccCountNoLimit(const unsigned int*  dna, const bgint_t index,
+										 bgint_t* __restrict occCount, const unsigned int*  dnaDecodeTable)
+{
+	static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
+											   0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
+											   0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
+											   0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };
+
+	bgint_t iteration, i;
+	unsigned int wordToCount, charToCount;
+	unsigned int j, c, sum;
+
+	occCount[0] = 0;
+	occCount[1] = 0;
+	occCount[2] = 0;
+	occCount[3] = 0;
+
+	iteration = index / 256;
+	wordToCount = (index - iteration * 256) / 16;
+	charToCount = index - iteration * 256 - wordToCount * 16;
+
+	for (i=0; i<iteration; i++) {
+
+		sum = 0;
+		for (j=0; j<16; j++) {
+			sum += dnaDecodeTable[*dna >> 16];
+			sum += dnaDecodeTable[*dna & 0x0000FFFF];
+			dna++;
+		}
+		if (!DNA_OCC_SUM_EXCEPTION(sum)) {
+			occCount[0] += sum & 0x000000FF;	sum >>= 8;
+			occCount[1] += sum & 0x000000FF;	sum >>= 8;
+			occCount[2] += sum & 0x000000FF;	sum >>= 8;
+			occCount[3] += sum;
+		} else {
+			// only some or all of the 3 bits are on
+			// in reality, only one of the four cases are possible
+			if (sum == 0x00000100) {
+				occCount[0] += 256;
+			} else if (sum == 0x00010000) {
+				occCount[1] += 256;
+			} else if (sum == 0x01000000) {
+				occCount[2] += 256;
+			} else if (sum == 0x00000000) {
+				occCount[3] += 256;
+			} else {
+				fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n");
+				exit(1);
+			}
+		}
+
+	}
+
+	sum = 0;
+	for (j=0; j<wordToCount; j++) {
+		sum += dnaDecodeTable[*dna >> 16];
+		sum += dnaDecodeTable[*dna & 0x0000FFFF];
+		dna++;
+	}
+
+	if (charToCount > 0) {
+		c = *dna & truncateRightMask[charToCount];	// increase count of 'a' by 16 - c;
+		sum += dnaDecodeTable[c >> 16];
+		sum += dnaDecodeTable[c & 0xFFFF];
+		sum += charToCount - 16;	// decrease count of 'a' by 16 - positionToProcess
+	}
+
+	occCount[0] += sum & 0x000000FF;	sum >>= 8;
+	occCount[1] += sum & 0x000000FF;	sum >>= 8;
+	occCount[2] += sum & 0x000000FF;	sum >>= 8;
+	occCount[3] += sum;
+}
+
+static void BWTIncBuildPackedBwt(const bgint_t *relativeRank, unsigned int* __restrict bwt, const bgint_t numChar,
+								 const bgint_t *cumulativeCount, const unsigned int *packedShift) {
+
+	bgint_t i, r;
+	unsigned int c;
+	bgint_t previousRank, currentRank;
+	bgint_t wordIndex, charIndex;
+	bgint_t inverseSa0;
+
+	inverseSa0 = previousRank = relativeRank[0];
+
+	for (i=1; i<=numChar; i++) {
+		currentRank = relativeRank[i];
+		// previousRank > cumulativeCount[c] because $ is one of the char
+		c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2]) 
+											    + (previousRank > cumulativeCount[3]);
+		// set bwt for currentRank
+		if (c > 0) {
+			// c <> 'a'
+			r = currentRank;
+			if (r > inverseSa0) {
+				// - 1 because $ at inverseSa0 is not encoded			
+				r--;
+			}
+			wordIndex = r / CHAR_PER_WORD;
+			charIndex = r - wordIndex * CHAR_PER_WORD;
+			bwt[wordIndex] |= c << packedShift[charIndex];
+		}
+		previousRank = currentRank;
+	}
+}
+
+static inline bgint_t BWTOccValueExplicit(const BWT *bwt, const bgint_t occIndexExplicit,
+											   const unsigned int character)
+{
+	bgint_t occIndexMajor;
+
+	occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR;
+
+	if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) {
+		return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
+			   (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16);
+
+	} else {
+		return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
+			   (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF);
+	}
+}
+
+
+static unsigned int ForwardDNAOccCount(const unsigned int*  dna, const unsigned int index, const unsigned int character,
+									   const unsigned int*  dnaDecodeTable)
+{
+	static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
+											   0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
+											   0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
+											   0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };
+
+	unsigned int wordToCount, charToCount;
+	unsigned int i, c;
+	unsigned int sum = 0;
+
+	wordToCount = index / 16;
+	charToCount = index - wordToCount * 16;
+
+	for (i=0; i<wordToCount; i++) {
+		sum += dnaDecodeTable[dna[i] >> 16];
+		sum += dnaDecodeTable[dna[i] & 0x0000FFFF];
+	}
+
+	if (charToCount > 0) {
+		c = dna[i] & truncateRightMask[charToCount];	// increase count of 'a' by 16 - c;
+		sum += dnaDecodeTable[c >> 16];
+		sum += dnaDecodeTable[c & 0xFFFF];
+		sum += charToCount - 16;	// decrease count of 'a' by 16 - positionToProcess
+	}
+
+	return (sum >> (character * 8)) & 0x000000FF;
+
+}
+
+static unsigned int BackwardDNAOccCount(const unsigned int*  dna, const unsigned int index, const unsigned int character,
+										const unsigned int*  dnaDecodeTable)
+{
+	static const unsigned int truncateLeftMask[16] =  { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F,
+											   0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF,
+											   0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF,
+											   0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF };
+
+	unsigned int wordToCount, charToCount;
+	unsigned int i, c;
+	unsigned int sum = 0;
+
+	wordToCount = index / 16;
+	charToCount = index - wordToCount * 16;
+
+	dna -= wordToCount + 1;
+
+	if (charToCount > 0) {
+		c = *dna & truncateLeftMask[charToCount];	// increase count of 'a' by 16 - c;
+		sum += dnaDecodeTable[c >> 16];
+		sum += dnaDecodeTable[c & 0xFFFF];
+		sum += charToCount - 16;	// decrease count of 'a' by 16 - positionToProcess
+	}
+	
+	for (i=0; i<wordToCount; i++) {
+		dna++;
+		sum += dnaDecodeTable[*dna >> 16];
+		sum += dnaDecodeTable[*dna & 0x0000FFFF];
+	}
+
+	return (sum >> (character * 8)) & 0x000000FF;
+
+}
+
+bgint_t BWTOccValue(const BWT *bwt, bgint_t index, const unsigned int character)
+{
+	bgint_t occValue;
+	bgint_t occExplicitIndex, occIndex;
+
+	// $ is supposed to be positioned at inverseSa0 but it is not encoded
+	// therefore index is subtracted by 1 for adjustment
+	if (index > bwt->inverseSa0)
+		index--;
+
+	occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL;	// Bidirectional encoding
+	occIndex = occExplicitIndex * OCC_INTERVAL;
+	occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character);
+
+	if (occIndex == index)
+		return occValue;
+
+	if (occIndex < index) {
+		return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable);
+	} else {
+		return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable);
+	}
+}
+
+static bgint_t BWTIncGetAbsoluteRank(BWT *bwt, bgint_t* __restrict absoluteRank, bgint_t* __restrict seq,
+										  const unsigned int *packedText, const bgint_t numChar,
+										  const bgint_t* cumulativeCount, const unsigned int firstCharInLastIteration)
+{
+	bgint_t saIndex;
+	bgint_t lastWord;
+	unsigned int packedMask;
+	bgint_t i;
+	unsigned int c, t, j;
+	bgint_t rankIndex;
+	unsigned int shift;
+	bgint_t seqIndexFromStart[ALPHABET_SIZE];
+	bgint_t seqIndexFromEnd[ALPHABET_SIZE];
+
+	for (i=0; i<ALPHABET_SIZE; i++) {
+		seqIndexFromStart[i] = cumulativeCount[i];
+		seqIndexFromEnd[i] = cumulativeCount[i+1] - 1;
+	}
+
+	shift = BITS_IN_WORD - BIT_PER_CHAR;
+	packedMask = ALL_ONE_MASK >> shift;
+	saIndex = bwt->inverseSa0;
+	rankIndex = numChar - 1;
+
+	lastWord = numChar / CHAR_PER_WORD;
+	for (i=lastWord; i--;) {	// loop from lastWord - 1 to 0
+		t = packedText[i];
+		for (j=0; j<CHAR_PER_WORD; j++) {
+			c = t & packedMask;
+			saIndex = bwt->cumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1;
+			// A counting sort using the first character of suffix is done here
+			// If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0
+			if (saIndex > bwt->inverseSa0) {
+				seq[seqIndexFromEnd[c]] = rankIndex;
+				absoluteRank[seqIndexFromEnd[c]] = saIndex;
+				seqIndexFromEnd[c]--;
+			} else {
+				seq[seqIndexFromStart[c]] = rankIndex;
+				absoluteRank[seqIndexFromStart[c]] = saIndex;
+				seqIndexFromStart[c]++;
+			}
+			rankIndex--;
+			t >>= BIT_PER_CHAR;
+		}
+	}
+
+	absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0;	// representing the substring of all preceding characters
+	seq[seqIndexFromStart[firstCharInLastIteration]] = numChar;
+
+	return seqIndexFromStart[firstCharInLastIteration];
+}
+
+static void BWTIncSortKey(bgint_t* __restrict key, bgint_t* __restrict seq, const bgint_t numItem)
+{
+	#define EQUAL_KEY_THRESHOLD	4	// Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD
+
+	int64_t lowIndex, highIndex, midIndex;
+	int64_t lowPartitionIndex, highPartitionIndex;
+	int64_t lowStack[32], highStack[32];
+	int stackDepth;
+	int64_t i, j;
+	bgint_t tempSeq, tempKey;
+	int64_t numberOfEqualKey;
+
+	if (numItem < 2) return;
+
+	stackDepth = 0;
+
+    lowIndex = 0;
+    highIndex = numItem - 1;
+
+	for (;;) {
+
+		for (;;) {
+
+			// Sort small array of data
+			if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) {	 // Insertion sort on smallest arrays
+				for (i=lowIndex+1; i<=highIndex; i++) {
+					tempSeq = seq[i];
+					tempKey = key[i];
+					for (j = i; j > lowIndex && key[j-1] > tempKey; j--) {
+						seq[j] = seq[j-1];
+						key[j] = key[j-1];
+					}
+					if (j != i) {
+						seq[j] = tempSeq;
+						key[j] = tempKey;
+					}
+				}
+				break;
+			}
+
+			// Choose pivot as median of the lowest, middle, and highest data; sort the three data
+
+			midIndex = average(lowIndex, highIndex);
+			if (key[lowIndex] > key[midIndex]) {
+				tempSeq = seq[lowIndex];
+				tempKey = key[lowIndex];
+				seq[lowIndex] = seq[midIndex];
+				key[lowIndex] = key[midIndex];
+				seq[midIndex] = tempSeq;
+				key[midIndex] = tempKey;
+			}
+			if (key[lowIndex] > key[highIndex]) {
+				tempSeq = seq[lowIndex];
+				tempKey = key[lowIndex];
+				seq[lowIndex] = seq[highIndex];
+				key[lowIndex] = key[highIndex];
+				seq[highIndex] = tempSeq;
+				key[highIndex] = tempKey;
+			}
+			if (key[midIndex] > key[highIndex]) {
+				tempSeq = seq[midIndex];
+				tempKey = key[midIndex];
+				seq[midIndex] = seq[highIndex];
+				key[midIndex] = key[highIndex];
+				seq[highIndex] = tempSeq;
+				key[highIndex] = tempKey;
+			}
+
+			// Partition data
+
+			numberOfEqualKey = 0;
+
+			lowPartitionIndex = lowIndex + 1;
+			highPartitionIndex = highIndex - 1;
+
+			for (;;) {
+				while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) {
+					numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]);
+					lowPartitionIndex++;
+				}
+				while (lowPartitionIndex < highPartitionIndex) {
+					if (key[midIndex] >= key[highPartitionIndex]) {
+						numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]);
+						break;
+					}
+					highPartitionIndex--;
+				}
+				if (lowPartitionIndex >= highPartitionIndex) {
+					break;
+				}
+				tempSeq = seq[lowPartitionIndex];
+				tempKey = key[lowPartitionIndex];
+				seq[lowPartitionIndex] = seq[highPartitionIndex];
+				key[lowPartitionIndex] = key[highPartitionIndex];
+				seq[highPartitionIndex] = tempSeq;
+				key[highPartitionIndex] = tempKey;
+				if (highPartitionIndex == midIndex) {
+					// partition key has been moved
+					midIndex = lowPartitionIndex;
+				}
+				lowPartitionIndex++;
+				highPartitionIndex--;
+			}
+
+			// Adjust the partition index
+			highPartitionIndex = lowPartitionIndex;
+			lowPartitionIndex--;
+
+			// move the partition key to end of low partition
+			tempSeq = seq[midIndex];
+			tempKey = key[midIndex];
+			seq[midIndex] = seq[lowPartitionIndex];
+			key[midIndex] = key[lowPartitionIndex];
+			seq[lowPartitionIndex] = tempSeq;
+			key[lowPartitionIndex] = tempKey;
+
+			if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) {
+
+				// Many keys = partition key; separate the equal key data from the lower partition
+		
+				midIndex = lowIndex;
+
+				for (;;) {
+					while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) {
+						midIndex++;
+					}
+					while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) {
+						lowPartitionIndex--;
+					}
+					if (midIndex >= lowPartitionIndex) {
+						break;
+					}
+					tempSeq = seq[midIndex];
+					tempKey = key[midIndex];
+					seq[midIndex] = seq[lowPartitionIndex - 1];
+					key[midIndex] = key[lowPartitionIndex - 1];
+					seq[lowPartitionIndex - 1] = tempSeq;
+					key[lowPartitionIndex - 1] = tempKey;
+					midIndex++;
+					lowPartitionIndex--;
+				}
+
+			}
+
+			if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) {
+				// put the larger partition to stack
+				lowStack[stackDepth] = lowIndex;
+				highStack[stackDepth] = lowPartitionIndex - 1;
+				stackDepth++;
+				// sort the smaller partition first
+				lowIndex = highPartitionIndex;
+			} else {
+				// put the larger partition to stack
+				lowStack[stackDepth] = highPartitionIndex;
+				highStack[stackDepth] = highIndex;
+				stackDepth++;
+				// sort the smaller partition first
+				if (lowPartitionIndex > lowIndex) {
+					highIndex = lowPartitionIndex - 1;
+				} else {
+					// all keys in the partition equals to the partition key
+					break;
+				}
+			}
+			continue;
+		}
+
+		// Pop a range from stack
+		if (stackDepth > 0) {
+			stackDepth--;
+			lowIndex = lowStack[stackDepth];
+			highIndex = highStack[stackDepth];
+			continue;
+		} else return;
+	}
+}
+
+
+static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __restrict seq,
+									bgint_t* __restrict relativeRank, const bgint_t numItem,
+									bgint_t oldInverseSa0, const bgint_t *cumulativeCount)
+{
+	bgint_t i, c;
+	bgint_t s, r;
+	bgint_t lastRank, lastIndex;
+	bgint_t oldInverseSa0RelativeRank = 0;
+	bgint_t freq;
+
+	lastIndex = numItem;
+	lastRank = sortedRank[numItem];
+	if (lastRank > oldInverseSa0) {
+		sortedRank[numItem]--;	// to prepare for merging; $ is not encoded in bwt
+	}
+	s = seq[numItem];
+	relativeRank[s] = numItem;
+	if (lastRank == oldInverseSa0) {
+		oldInverseSa0RelativeRank = numItem;
+		oldInverseSa0++;	// so that this segment of code is not run again
+		lastRank++;			// so that oldInverseSa0 become a sorted group with 1 item
+	}
+
+	c = ALPHABET_SIZE - 1;
+	freq = cumulativeCount[c];
+
+	for (i=numItem; i--;) {	// from numItem - 1 to 0
+		r = sortedRank[i];
+		if (r > oldInverseSa0)
+			sortedRank[i]--;	// to prepare for merging; $ is not encoded in bwt
+		s = seq[i];
+		if (i < freq) {
+			if (lastIndex >= freq)
+				lastRank++;	// to trigger the group across alphabet boundary to be split
+			c--;
+			freq = cumulativeCount[c];
+		}
+		if (r == lastRank) {
+			relativeRank[s] = lastIndex;
+		} else {
+			if (i == lastIndex - 1) {
+				if (lastIndex < numItem && (sbgint_t)seq[lastIndex + 1] < 0) {
+					seq[lastIndex] = seq[lastIndex + 1] - 1;
+				} else {
+					seq[lastIndex] = (bgint_t)-1;
+				}
+			}
+			lastIndex = i;
+			lastRank = r;
+			relativeRank[s] = i;
+			if (r == oldInverseSa0) {
+				oldInverseSa0RelativeRank = i;
+				oldInverseSa0++;	// so that this segment of code is not run again
+				lastRank++;			// so that oldInverseSa0 become a sorted group with 1 item
+			}
+		}
+	}
+
+}
+
+static void BWTIncBuildBwt(unsigned int* insertBwt, const bgint_t *relativeRank, const bgint_t numChar,
+						   const bgint_t *cumulativeCount)
+{
+	unsigned int c;
+	bgint_t i;
+	bgint_t previousRank, currentRank;
+
+	previousRank = relativeRank[0];
+
+	for (i=1; i<=numChar; i++) {
+		currentRank = relativeRank[i];
+		c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2])
+											  	 + (previousRank >= cumulativeCount[3]);
+		insertBwt[currentRank] = c;
+		previousRank = currentRank;
+	}
+}
+
+static void BWTIncMergeBwt(const bgint_t *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt,
+						   unsigned int* __restrict mergedBwt, const bgint_t numOldBwt, const bgint_t numInsertBwt)
+{
+	unsigned int bitsInWordMinusBitPerChar;
+	bgint_t leftShift, rightShift;
+	bgint_t o;
+	bgint_t oIndex, iIndex, mIndex;
+	bgint_t mWord, mChar, oWord, oChar;
+	bgint_t numInsert;
+
+	bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR;
+
+	oIndex = 0;
+	iIndex = 0;
+	mIndex = 0;
+
+	mWord = 0;
+	mChar = 0;
+
+	mergedBwt[0] = 0;	// this can be cleared as merged Bwt slightly shift to the left in each iteration
+
+	while (oIndex < numOldBwt) {
+
+		// copy from insertBwt
+		while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) {
+			if (sortedRank[iIndex] != 0) {	// special value to indicate that this is for new inverseSa0
+				mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
+				mIndex++;
+				mChar++;
+				if (mChar == CHAR_PER_WORD) {
+					mChar = 0;
+					mWord++;
+					mergedBwt[mWord] = 0;	// no need to worry about crossing mergedBwt boundary
+				}
+			}
+			iIndex++;
+		}
+
+		// Copy from oldBwt to mergedBwt
+		if (iIndex <= numInsertBwt) {
+			o = sortedRank[iIndex];
+		} else {
+			o = numOldBwt;
+		}
+		numInsert = o - oIndex;
+
+		oWord = oIndex / CHAR_PER_WORD;
+		oChar = oIndex - oWord * CHAR_PER_WORD;
+		if (oChar > mChar) {
+			leftShift = (oChar - mChar) * BIT_PER_CHAR;
+			rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR;
+			mergedBwt[mWord] = mergedBwt[mWord]
+								| (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR))
+								| (oldBwt[oWord+1] >> rightShift);
+			oIndex += min(numInsert, CHAR_PER_WORD - mChar);
+			while (o > oIndex) {
+				oWord++;
+				mWord++;
+				mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift);
+				oIndex += CHAR_PER_WORD;
+			}
+		} else if (oChar < mChar) {
+			rightShift = (mChar - oChar) * BIT_PER_CHAR;
+			leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR;
+			mergedBwt[mWord] = mergedBwt[mWord] 
+								| (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR));
+			oIndex += min(numInsert, CHAR_PER_WORD - mChar);
+			while (o > oIndex) {
+				oWord++;
+				mWord++;
+				mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift);
+				oIndex += CHAR_PER_WORD;
+			}
+		} else { // oChar == mChar
+			mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR);
+			oIndex += min(numInsert, CHAR_PER_WORD - mChar);
+			while (o > oIndex) {
+				oWord++;
+				mWord++;
+				mergedBwt[mWord] = oldBwt[oWord];
+				oIndex += CHAR_PER_WORD;
+			}
+		}
+		oIndex = o;
+		mIndex += numInsert;
+
+		// Clear the trailing garbage in mergedBwt
+		mWord = mIndex / CHAR_PER_WORD;
+		mChar = mIndex - mWord * CHAR_PER_WORD;
+		if (mChar == 0) {
+			mergedBwt[mWord] = 0;
+		} else {
+			mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR));
+		}
+
+	}
+
+	// copy from insertBwt
+	while (iIndex <= numInsertBwt) {
+		if (sortedRank[iIndex] != 0) {
+			mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
+			mIndex++;
+			mChar++;
+			if (mChar == CHAR_PER_WORD) {
+				mChar = 0;
+				mWord++;
+				mergedBwt[mWord] = 0;	// no need to worry about crossing mergedBwt boundary
+			}
+		}
+		iIndex++;
+	}
+}
+
+void BWTClearTrailingBwtCode(BWT *bwt)
+{
+	bgint_t bwtResidentSizeInWord;
+	bgint_t wordIndex, offset;
+	bgint_t i;
+
+	bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength);
+
+	wordIndex = bwt->textLength / CHAR_PER_WORD;
+	offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR;
+	if (offset > 0) {
+		bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset);
+	} else {
+		if (wordIndex < bwtResidentSizeInWord) {
+			bwt->bwtCode[wordIndex] = 0;
+		}
+	}
+
+	for (i=wordIndex+1; i<bwtResidentSizeInWord; i++) {
+		bwt->bwtCode[i] = 0;
+	}
+}
+
+
+void BWTGenerateOccValueFromBwt(const unsigned int*  bwt, unsigned int* __restrict occValue,
+								bgint_t* __restrict occValueMajor,
+								const bgint_t textLength, const unsigned int*  decodeTable)
+{
+	bgint_t numberOfOccValueMajor, numberOfOccValue;
+	unsigned int wordBetweenOccValue;
+	bgint_t numberOfOccIntervalPerMajor;
+	unsigned int c;
+	bgint_t i, j;
+	bgint_t occMajorIndex;
+	bgint_t occIndex, bwtIndex;
+	bgint_t sum; // perhaps unsigned is big enough
+	bgint_t tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE];
+
+	wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD;
+
+	// Calculate occValue
+	numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;				// Value at both end for bi-directional encoding
+	numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
+	numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor;
+
+	tempOccValue0[0] = 0;
+	tempOccValue0[1] = 0;
+	tempOccValue0[2] = 0;
+	tempOccValue0[3] = 0;
+	occValueMajor[0] = 0;
+	occValueMajor[1] = 0;
+	occValueMajor[2] = 0;
+	occValueMajor[3] = 0;
+
+	occIndex = 0;
+	bwtIndex = 0;
+	for (occMajorIndex=1; occMajorIndex<numberOfOccValueMajor; occMajorIndex++) {
+
+		for (i=0; i<numberOfOccIntervalPerMajor/2; i++) {
+
+			sum = 0;
+			tempOccValue1[0] = tempOccValue0[0];
+			tempOccValue1[1] = tempOccValue0[1];
+			tempOccValue1[2] = tempOccValue0[2];
+			tempOccValue1[3] = tempOccValue0[3];
+
+			for (j=0; j<wordBetweenOccValue; j++) {
+				c = bwt[bwtIndex];
+				sum += decodeTable[c >> 16];
+				sum += decodeTable[c & 0x0000FFFF];
+				bwtIndex++;
+			}
+			if (!DNA_OCC_SUM_EXCEPTION(sum)) {
+				tempOccValue1[0] += (sum & 0x000000FF);	sum >>= 8;
+				tempOccValue1[1] += (sum & 0x000000FF);	sum >>= 8;
+				tempOccValue1[2] += (sum & 0x000000FF);	sum >>= 8;
+				tempOccValue1[3] += sum;
+			} else {
+				if (sum == 0x00000100) {
+					tempOccValue1[0] += 256;
+				} else if (sum == 0x00010000) {
+					tempOccValue1[1] += 256;
+				} else if (sum == 0x01000000) {
+					tempOccValue1[2] += 256;
+				} else {
+					tempOccValue1[3] += 256;
+				}
+			}
+			occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
+			occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
+			occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
+			occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
+			tempOccValue0[0] = tempOccValue1[0];
+			tempOccValue0[1] = tempOccValue1[1];
+			tempOccValue0[2] = tempOccValue1[2];
+			tempOccValue0[3] = tempOccValue1[3];
+			sum = 0;
+
+			occIndex++;
+
+			for (j=0; j<wordBetweenOccValue; j++) {
+				c = bwt[bwtIndex];
+				sum += decodeTable[c >> 16];
+				sum += decodeTable[c & 0x0000FFFF];
+				bwtIndex++;
+			}
+			if (!DNA_OCC_SUM_EXCEPTION(sum)) {
+				tempOccValue0[0] += (sum & 0x000000FF);	sum >>= 8;
+				tempOccValue0[1] += (sum & 0x000000FF);	sum >>= 8;
+				tempOccValue0[2] += (sum & 0x000000FF);	sum >>= 8;
+				tempOccValue0[3] += sum;
+			} else {
+				if (sum == 0x00000100) {
+					tempOccValue0[0] += 256;
+				} else if (sum == 0x00010000) {
+					tempOccValue0[1] += 256;
+				} else if (sum == 0x01000000) {
+					tempOccValue0[2] += 256;
+				} else {
+					tempOccValue0[3] += 256;
+				}
+			}
+		}
+
+		occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0];
+		occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1];
+		occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2];
+		occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3];
+		tempOccValue0[0] = 0;
+		tempOccValue0[1] = 0;
+		tempOccValue0[2] = 0;
+		tempOccValue0[3] = 0;
+
+	}
+
+	while (occIndex < (numberOfOccValue-1)/2) {
+		sum = 0;
+		tempOccValue1[0] = tempOccValue0[0];
+		tempOccValue1[1] = tempOccValue0[1];
+		tempOccValue1[2] = tempOccValue0[2];
+		tempOccValue1[3] = tempOccValue0[3];
+		for (j=0; j<wordBetweenOccValue; j++) {
+			c = bwt[bwtIndex];
+			sum += decodeTable[c >> 16];
+			sum += decodeTable[c & 0x0000FFFF];
+			bwtIndex++;
+		}
+		if (!DNA_OCC_SUM_EXCEPTION(sum)) {
+			tempOccValue1[0] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue1[1] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue1[2] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue1[3] += sum;
+		} else {
+			if (sum == 0x00000100) {
+				tempOccValue1[0] += 256;
+			} else if (sum == 0x00010000) {
+				tempOccValue1[1] += 256;
+			} else if (sum == 0x01000000) {
+				tempOccValue1[2] += 256;
+			} else {
+				tempOccValue1[3] += 256;
+			}
+		}
+		occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
+		occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
+		occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
+		occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
+		tempOccValue0[0] = tempOccValue1[0];
+		tempOccValue0[1] = tempOccValue1[1];
+		tempOccValue0[2] = tempOccValue1[2];
+		tempOccValue0[3] = tempOccValue1[3];
+		sum = 0;
+		occIndex++;
+
+		for (j=0; j<wordBetweenOccValue; j++) {
+			c = bwt[bwtIndex];
+			sum += decodeTable[c >> 16];
+			sum += decodeTable[c & 0x0000FFFF];
+			bwtIndex++;
+		}
+		if (!DNA_OCC_SUM_EXCEPTION(sum)) {
+			tempOccValue0[0] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue0[1] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue0[2] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue0[3] += sum;
+		} else {
+			if (sum == 0x00000100) {
+				tempOccValue0[0] += 256;
+			} else if (sum == 0x00010000) {
+				tempOccValue0[1] += 256;
+			} else if (sum == 0x01000000) {
+				tempOccValue0[2] += 256;
+			} else {
+				tempOccValue0[3] += 256;
+			}
+		}
+	}
+
+	sum = 0;
+	tempOccValue1[0] = tempOccValue0[0];
+	tempOccValue1[1] = tempOccValue0[1];
+	tempOccValue1[2] = tempOccValue0[2];
+	tempOccValue1[3] = tempOccValue0[3];
+
+	if (occIndex * 2 < numberOfOccValue - 1) {
+		for (j=0; j<wordBetweenOccValue; j++) {
+			c = bwt[bwtIndex];
+			sum += decodeTable[c >> 16];
+			sum += decodeTable[c & 0x0000FFFF];
+			bwtIndex++;
+		}
+		if (!DNA_OCC_SUM_EXCEPTION(sum)) {
+			tempOccValue1[0] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue1[1] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue1[2] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue1[3] += sum;
+		} else {
+			if (sum == 0x00000100) {
+				tempOccValue1[0] += 256;
+			} else if (sum == 0x00010000) {
+				tempOccValue1[1] += 256;
+			} else if (sum == 0x01000000) {
+				tempOccValue1[2] += 256;
+			} else {
+				tempOccValue1[3] += 256;
+			}
+		}
+	}
+
+	occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
+	occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
+	occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
+	occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
+
+}
+
+static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar)
+{
+	unsigned int i;
+	bgint_t mergedBwtSizeInWord, mergedOccSizeInWord;
+	unsigned int firstCharInThisIteration;
+
+	bgint_t *relativeRank, *seq, *sortedRank;
+	unsigned int *insertBwt, *mergedBwt;
+	bgint_t newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0;
+
+	mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar);
+	mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar);
+
+	initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
+
+	if (bwtInc->bwt->textLength == 0) {		// Initial build
+
+		// Set address
+		seq = (bgint_t*)bwtInc->workingMemory;
+		relativeRank = seq + bwtInc->buildSize + 1;
+		// mergedBwt and packedTex may share memory
+		mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord;	// build in place
+
+		assert((void*)(relativeRank + bwtInc->buildSize + 1) <= (void*)bwtInc->packedText);
+		assert((void*)(relativeRank + bwtInc->buildSize + 1) <= (void*)mergedBwt);
+
+		// ->packedText is not used any more and may be overwritten by mergedBwt
+		BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar);
+
+		firstCharInThisIteration = relativeRank[0];
+		relativeRank[numChar] = 0;
+
+		// Sort suffix
+		QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)ALPHABET_SIZE - 1, 0, FALSE);
+		newInverseSa0 = relativeRank[0];
+
+		// Clear BWT area
+		initializeVAL(insertBwt, mergedBwtSizeInWord, 0);
+
+		// Build BWT
+		BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift);
+
+		// so that the cumulativeCount is not deducted
+		bwtInc->firstCharInLastIteration = ALPHABET_SIZE;
+
+	} else {		// Incremental build
+		// Set address
+		sortedRank = (bgint_t*)bwtInc->workingMemory;
+		seq = sortedRank + bwtInc->buildSize + 1;
+		insertBwt = (unsigned*)seq; // insertBwt and seq share memory
+		// relativeRank and ->packedText may share memory
+		relativeRank = seq + bwtInc->buildSize + 1;
+
+		assert((void*)relativeRank <= (void*)bwtInc->packedText);
+
+		// Store the first character of this iteration
+		firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR);
+
+		// Count occurrence of input text
+		ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable);
+		// Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration
+		bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++;
+		bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1];
+		bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2];
+		bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3];
+
+		// Get rank of new suffix among processed suffix
+		// The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group
+		// ->packedText is not used any more and will be overwritten by relativeRank
+		oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText, 
+														  numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration);
+
+		// Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group)
+		for (i=0; i<ALPHABET_SIZE; i++) {
+			if (bwtInc->cumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank ||
+				bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) {
+				BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]);
+			} else {
+				if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) {
+					BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]);
+				}
+				if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) {
+					BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1);
+				}
+			}
+		}
+
+		// build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt
+		// the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups)
+		BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild);
+		assert(relativeRank[numChar] == oldInverseSa0RelativeRank);
+
+		// Sort suffix
+		QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)numChar, 1, TRUE);
+
+		newInverseSa0RelativeRank = relativeRank[0];
+		newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank;
+
+		sortedRank[newInverseSa0RelativeRank] = 0;	// a special value so that this is skipped in the merged bwt
+
+		// Build BWT; seq is overwritten by insertBwt
+		BWTIncBuildBwt(insertBwt, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild);
+
+		// Merge BWT; relativeRank may be overwritten by mergedBwt
+		mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord 
+				    - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR * (sizeof(bgint_t) / 4); // minus numberOfIteration * occInterval to create a buffer for merging
+		assert(mergedBwt >= insertBwt + numChar);
+		BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar);
+	}
+
+	// Build auxiliary structure and update info and pointers in BWT
+	bwtInc->bwt->textLength += numChar;
+	bwtInc->bwt->bwtCode = mergedBwt;
+	bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord;
+	bwtInc->bwt->occSizeInWord = mergedOccSizeInWord;
+	assert(mergedBwt >= bwtInc->workingMemory + mergedOccSizeInWord);
+
+	bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord;
+
+	BWTClearTrailingBwtCode(bwtInc->bwt);
+	BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor,
+							   bwtInc->bwt->textLength, bwtInc->bwt->decodeTable);
+
+	bwtInc->bwt->inverseSa0 = newInverseSa0;
+	
+	bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0);
+	bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1);
+	bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2);
+	bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3);
+
+	bwtInc->firstCharInLastIteration = firstCharInThisIteration;
+
+	// Set build size and text address for the next build
+	BWTIncSetBuildSizeAndTextAddr(bwtInc);
+	bwtInc->numberOfIterationDone++;
+
+}
+
+BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxBuildSize, bgint_t incMaxBuildSize)
+{
+
+	FILE *packedFile;
+	bgint_t packedFileLen;
+	bgint_t totalTextLength;
+	bgint_t textToLoad, textSizeInByte;
+	bgint_t processedTextLength;
+	unsigned char lastByteLength;
+
+	BWTInc *bwtInc;
+
+	packedFile = (FILE*)fopen(inputFileName, "rb");
+
+	if (packedFile == NULL) {
+		fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open %s : %s\n",
+				inputFileName, strerror(errno));
+		exit(1);
+	}
+
+	if (fseek(packedFile, -1, SEEK_END) != 0) {
+		fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n",
+				inputFileName, strerror(errno));
+		exit(1);
+	}
+	packedFileLen = ftell(packedFile);
+	if (packedFileLen == -1) {
+		fprintf(stderr, "BWTIncConstructFromPacked() : Can't ftell on %s : %s\n",
+				inputFileName, strerror(errno));
+		exit(1);
+	}
+	if (fread(&lastByteLength, sizeof(unsigned char), 1, packedFile) != 1) {
+		fprintf(stderr,
+				"BWTIncConstructFromPacked() : Can't read from %s : %s\n",
+				inputFileName,
+				ferror(packedFile)? strerror(errno) : "Unexpected end of file");
+		exit(1);
+	}
+	totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength);
+
+	bwtInc = BWTIncCreate(totalTextLength, initialMaxBuildSize, incMaxBuildSize);
+
+	BWTIncSetBuildSizeAndTextAddr(bwtInc);
+
+	if (bwtInc->buildSize > totalTextLength) {
+		textToLoad = totalTextLength;
+	} else {
+		textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD);
+	}
+	textSizeInByte = textToLoad / CHAR_PER_BYTE;	// excluded the odd byte
+
+	if (fseek(packedFile, -((long)textSizeInByte + 2), SEEK_CUR) != 0) {
+		fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n",
+				inputFileName, strerror(errno));
+		exit(1);
+	}
+	if (fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile) != textSizeInByte + 1) {
+		fprintf(stderr,
+				"BWTIncConstructFromPacked() : Can't read from %s : %s\n",
+				inputFileName,
+				ferror(packedFile)? strerror(errno) : "Unexpected end of file");
+		exit(1);
+	}
+	if (fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR) != 0) {
+		fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n",
+				inputFileName, strerror(errno));
+		exit(1);
+	}
+
+	ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
+	BWTIncConstruct(bwtInc, textToLoad);
+
+	processedTextLength = textToLoad;
+
+	while (processedTextLength < totalTextLength) {
+		textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
+		if (textToLoad > totalTextLength - processedTextLength) {
+			textToLoad = totalTextLength - processedTextLength;
+		}
+		textSizeInByte = textToLoad / CHAR_PER_BYTE;
+		if (fseek(packedFile, -((long)textSizeInByte), SEEK_CUR) != 0) {
+			fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n",
+					inputFileName, strerror(errno));
+			exit(1);
+		}
+		if (fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile) != textSizeInByte) {
+			fprintf(stderr,
+				"BWTIncConstructFromPacked() : Can't read from %s : %s\n",
+				inputFileName,
+				ferror(packedFile)? strerror(errno) : "Unexpected end of file");
+			exit(1);
+		}
+		if (fseek(packedFile, -((long)textSizeInByte), SEEK_CUR) != 0) {
+			fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n",
+					inputFileName, strerror(errno));
+			exit(1);
+		}
+		ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
+		BWTIncConstruct(bwtInc, textToLoad);
+		processedTextLength += textToLoad;
+		if (bwtInc->numberOfIterationDone % 10 == 0) {
+			fprintf(stderr, "[BWTIncConstructFromPacked] %lu iterations done. %lu characters processed.\n",
+					(long)bwtInc->numberOfIterationDone, (long)processedTextLength);
+		}
+	}
+	return bwtInc;
+}
+
+void BWTFree(BWT *bwt)
+{
+	if (bwt == 0) return;
+	free(bwt->cumulativeFreq);
+	free(bwt->bwtCode);
+	free(bwt->occValue);
+	free(bwt->occValueMajor);
+	free(bwt->decodeTable);
+	free(bwt);
+}
+
+void BWTIncFree(BWTInc *bwtInc)
+{
+	if (bwtInc == 0) return;
+	free(bwtInc->bwt);
+	free(bwtInc->workingMemory);
+	free(bwtInc);
+}
+
+static bgint_t BWTFileSizeInWord(const bgint_t numChar)
+{
+	// The $ in BWT at the position of inverseSa0 is not encoded
+	return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
+}
+
+void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName)
+{
+	FILE *bwtFile;
+/*	FILE *occValueFile; */
+	bgint_t bwtLength;
+
+	bwtFile = (FILE*)fopen(bwtFileName, "wb");
+	if (bwtFile == NULL) {
+		fprintf(stderr,
+				"BWTSaveBwtCodeAndOcc(): Cannot open %s for writing: %s\n",
+				bwtFileName, strerror(errno));
+		exit(1);
+	}
+
+	bwtLength = BWTFileSizeInWord(bwt->textLength);
+
+	if (fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile) != 1
+		|| fwrite(bwt->cumulativeFreq + 1,
+				  sizeof(bgint_t), ALPHABET_SIZE, bwtFile) != ALPHABET_SIZE
+		|| fwrite(bwt->bwtCode,
+				  sizeof(unsigned int), bwtLength, bwtFile) != bwtLength) {
+		fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Error writing to %s : %s\n",
+				bwtFileName, strerror(errno));
+		exit(1);
+	}
+	if (fclose(bwtFile) != 0) {
+		fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Error on closing %s : %s\n",
+				bwtFileName, strerror(errno));
+		exit(1);
+	}
+}
+
+void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size)
+{
+	BWTInc *bwtInc;
+	bwtInc = BWTIncConstructFromPacked(fn_pac, block_size, block_size);
+	printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone);
+	BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0);
+	BWTIncFree(bwtInc);
+}
+
+void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
+{
+	bwt_bwtgen2(fn_pac, fn_bwt, 10000000);
+}
+
+int bwt_bwtgen_main(int argc, char *argv[])
+{
+	if (argc < 3) {
+		fprintf(stderr, "Usage: bwtgen <in.pac> <out.bwt>\n");
+		return 1;
+	}
+	bwt_bwtgen(argv[1], argv[2]);
+	return 0;
+}
+
+#ifdef MAIN_BWT_GEN
+
+int main(int argc, char *argv[])
+{
+	return bwt_bwtgen_main(argc, argv);
+}
+
+#endif
diff --git a/src/bwtindex.c b/src/bwtindex.c
new file mode 100644
index 0000000..3a9630a
--- /dev/null
+++ b/src/bwtindex.c
@@ -0,0 +1,295 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3 at sanger.ac.uk> */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <time.h>
+#include <zlib.h>
+#include "bntseq.h"
+#include "bwt.h"
+#include "utils.h"
+
+#ifdef _DIVBWT
+#include "divsufsort.h"
+#endif
+
+#ifdef USE_MALLOC_WRAPPERS
+#  include "malloc_wrap.h"
+#endif
+
+
+int is_bwt(ubyte_t *T, int n);
+
+int64_t bwa_seq_len(const char *fn_pac)
+{
+	FILE *fp;
+	int64_t pac_len;
+	ubyte_t c;
+	fp = xopen(fn_pac, "rb");
+	err_fseek(fp, -1, SEEK_END);
+	pac_len = err_ftell(fp);
+	err_fread_noeof(&c, 1, 1, fp);
+	err_fclose(fp);
+	return (pac_len - 1) * 4 + (int)c;
+}
+
+bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
+{
+	bwt_t *bwt;
+	ubyte_t *buf, *buf2;
+	int i, pac_size;
+	FILE *fp;
+
+	// initialization
+	bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
+	bwt->seq_len = bwa_seq_len(fn_pac);
+	bwt->bwt_size = (bwt->seq_len + 15) >> 4;
+	fp = xopen(fn_pac, "rb");
+
+	// prepare sequence
+	pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
+	buf2 = (ubyte_t*)calloc(pac_size, 1);
+	err_fread_noeof(buf2, 1, pac_size, fp);
+	err_fclose(fp);
+	memset(bwt->L2, 0, 5 * 4);
+	buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
+	for (i = 0; i < bwt->seq_len; ++i) {
+		buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
+		++bwt->L2[1+buf[i]];
+	}
+	for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
+	free(buf2);
+
+	// Burrows-Wheeler Transform
+	if (use_is) {
+		bwt->primary = is_bwt(buf, bwt->seq_len);
+	} else {
+#ifdef _DIVBWT
+		bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
+#else
+		err_fatal_simple("libdivsufsort is not compiled in.");
+#endif
+	}
+	bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
+	for (i = 0; i < bwt->seq_len; ++i)
+		bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
+	free(buf);
+	return bwt;
+}
+
+int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required!
+{
+	bwt_t *bwt;
+	int c, use_is = 1;
+	while ((c = getopt(argc, argv, "d")) >= 0) {
+		switch (c) {
+		case 'd': use_is = 0; break;
+		default: return 1;
+		}
+	}
+	if (optind + 2 > argc) {
+		fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
+		return 1;
+	}
+	bwt = bwt_pac2bwt(argv[optind], use_is);
+	bwt_dump_bwt(argv[optind+1], bwt);
+	bwt_destroy(bwt);
+	return 0;
+}
+
+#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
+
+void bwt_bwtupdate_core(bwt_t *bwt)
+{
+	bwtint_t i, k, c[4], n_occ;
+	uint32_t *buf;
+
+	n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
+	bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size
+	buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
+	c[0] = c[1] = c[2] = c[3] = 0;
+	for (i = k = 0; i < bwt->seq_len; ++i) {
+		if (i % OCC_INTERVAL == 0) {
+			memcpy(buf + k, c, sizeof(bwtint_t) * 4);
+			k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4)
+		}
+		if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2
+		++c[bwt_B00(bwt, i)];
+	}
+	// the last element
+	memcpy(buf + k, c, sizeof(bwtint_t) * 4);
+	xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
+	// update bwt
+	free(bwt->bwt); bwt->bwt = buf;
+}
+
+int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command
+{
+	bwt_t *bwt;
+	if (argc < 2) {
+		fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
+		return 1;
+	}
+	bwt = bwt_restore_bwt(argv[1]);
+	bwt_bwtupdate_core(bwt);
+	bwt_dump_bwt(argv[1], bwt);
+	bwt_destroy(bwt);
+	return 0;
+}
+
+int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
+{
+	bwt_t *bwt;
+	int c, sa_intv = 32;
+	while ((c = getopt(argc, argv, "i:")) >= 0) {
+		switch (c) {
+		case 'i': sa_intv = atoi(optarg); break;
+		default: return 1;
+		}
+	}
+	if (optind + 2 > argc) {
+		fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
+		return 1;
+	}
+	bwt = bwt_restore_bwt(argv[optind]);
+	bwt_cal_sa(bwt, sa_intv);
+	bwt_dump_sa(argv[optind+1], bwt);
+	bwt_destroy(bwt);
+	return 0;
+}
+
+int bwa_index(int argc, char *argv[]) // the "index" command
+{
+	extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
+
+	char *prefix = 0, *str, *str2, *str3;
+	int c, algo_type = 0, is_64 = 0, block_size = 10000000;
+	int sa_sample_interval = 32;
+	clock_t t;
+	int64_t l_pac;
+
+	while ((c = getopt(argc, argv, "6a:p:b:s:")) >= 0) {
+		switch (c) {
+		case 'a': // if -a is not set, algo_type will be determined later
+			if (strcmp(optarg, "div") == 0) algo_type = 1;
+			else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2;
+			else if (strcmp(optarg, "is") == 0) algo_type = 3;
+			else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
+			break;
+		case 'p': prefix = strdup(optarg); break;
+		case '6': is_64 = 1; break;
+		case 's': sa_sample_interval = atoi(optarg); break;
+		case 'b':
+			block_size = strtol(optarg, &str, 10);
+			if (*str == 'G' || *str == 'g') block_size *= 1024 * 1024 * 1024;
+			else if (*str == 'M' || *str == 'm') block_size *= 1024 * 1024;
+			else if (*str == 'K' || *str == 'k') block_size *= 1024;
+			break;
+		default: return 1;
+		}
+	}
+
+	if (optind + 1 > argc) {
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage:   bwa index [options] <in.fasta>\n\n");
+		fprintf(stderr, "Options: -a STR    BWT construction algorithm: bwtsw or is [auto]\n");
+		fprintf(stderr, "         -p STR    prefix of the index [same as fasta name]\n");
+		fprintf(stderr, "         -b INT    block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
+		fprintf(stderr, "         -6        index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
+		fprintf(stderr, "\n");
+		fprintf(stderr,	"Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
+		fprintf(stderr, "         `-a div' do not work not for long genomes.\n\n");
+		return 1;
+	}
+	if (prefix == 0) {
+		prefix = malloc(strlen(argv[optind]) + 4);
+		strcpy(prefix, argv[optind]);
+		if (is_64) strcat(prefix, ".64");
+	}
+	str  = (char*)calloc(strlen(prefix) + 10, 1);
+	str2 = (char*)calloc(strlen(prefix) + 10, 1);
+	str3 = (char*)calloc(strlen(prefix) + 10, 1);
+
+	{ // nucleotide indexing
+		gzFile fp = xzopen(argv[optind], "r");
+		t = clock();
+		fprintf(stderr, "[bwa_index] Pack FASTA... ");
+		l_pac = bns_fasta2bntseq(fp, prefix, 0);
+		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+		err_gzclose(fp);
+	}
+	if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
+	{
+		strcpy(str, prefix); strcat(str, ".pac");
+		strcpy(str2, prefix); strcat(str2, ".bwt");
+		t = clock();
+		fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
+		if (algo_type == 2) bwt_bwtgen2(str, str2, block_size);
+		else if (algo_type == 1 || algo_type == 3) {
+			bwt_t *bwt;
+			bwt = bwt_pac2bwt(str, algo_type == 3);
+			bwt_dump_bwt(str2, bwt);
+			bwt_destroy(bwt);
+		}
+		fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+	}
+	{
+		bwt_t *bwt;
+		strcpy(str, prefix); strcat(str, ".bwt");
+		t = clock();
+		fprintf(stderr, "[bwa_index] Update BWT... ");
+		bwt = bwt_restore_bwt(str);
+		bwt_bwtupdate_core(bwt);
+		bwt_dump_bwt(str, bwt);
+		bwt_destroy(bwt);
+		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+	}
+	{
+		gzFile fp = xzopen(argv[optind], "r");
+		t = clock();
+		fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
+		l_pac = bns_fasta2bntseq(fp, prefix, 1);
+		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+		err_gzclose(fp);
+	}
+	{
+		bwt_t *bwt;
+		strcpy(str, prefix); strcat(str, ".bwt");
+		strcpy(str3, prefix); strcat(str3, ".sa");
+		t = clock();
+		fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
+		bwt = bwt_restore_bwt(str);
+		bwt_cal_sa(bwt, sa_sample_interval);
+		bwt_dump_sa(str3, bwt);
+		bwt_destroy(bwt);
+		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+	}
+	free(str3); free(str2); free(str); free(prefix);
+	return 0;
+}
diff --git a/src/cokus.cpp b/src/cokus.cpp
new file mode 100755
index 0000000..103e7d3
--- /dev/null
+++ b/src/cokus.cpp
@@ -0,0 +1,196 @@
+// This is the Mersenne Twister random number generator MT19937, which
+// generates pseudorandom integers uniformly distributed in 0..(2^32 - 1)
+// starting from any odd seed in 0..(2^32 - 1).  This version is a recode
+// by Shawn Cokus (Cokus at math.washington.edu) on March 8, 1998 of a version by
+// Takuji Nishimura (who had suggestions from Topher Cooper and Marc Rieffel in
+// July-August 1997).
+//
+// Effectiveness of the recoding (on Goedel2.math.washington.edu, a DEC Alpha
+// running OSF/1) using GCC -O3 as a compiler: before recoding: 51.6 sec. to
+// generate 300 million random numbers; after recoding: 24.0 sec. for the same
+// (i.e., 46.5% of original time), so speed is now about 12.5 million random
+// number generations per second on this machine.
+//
+// According to the URL <http://www.math.keio.ac.jp/~matumoto/emt.html>
+// (and paraphrasing a bit in places), the Mersenne Twister is ``designed
+// with consideration of the flaws of various existing generators,'' has
+// a period of 2^19937 - 1, gives a sequence that is 623-dimensionally
+// equidistributed, and ``has passed many stringent tests, including the
+// die-hard test of G. Marsaglia and the load test of P. Hellekalek and
+// S. Wegenkittl.''  It is efficient in memory usage (typically using 2506
+// to 5012 bytes of static data, depending on data type sizes, and the code
+// is quite short as well).  It generates random numbers in batches of 624
+// at a time, so the caching and pipelining of modern systems is exploited.
+// It is also divide- and mod-free.
+//
+// This library is free software; you can redistribute it and/or modify it
+// under the terms of the GNU Library General Public License as published by
+// the Free Software Foundation (either version 2 of the License or, at your
+// option, any later version).  This library is distributed in the hope that
+// it will be useful, but WITHOUT ANY WARRANTY, without even the implied
+// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+// the GNU Library General Public License for more details.  You should have
+// received a copy of the GNU Library General Public License along with this
+// library; if not, write to the Free Software Foundation, Inc., 59 Temple
+// Place, Suite 330, Boston, MA 02111-1307, USA.
+//
+// The code as Shawn received it included the following notice:
+//
+//   Copyright (C) 1997 Makoto Matsumoto and Takuji Nishimura.  When
+//   you use this, send an e-mail to <matumoto at math.keio.ac.jp> with
+//   an appropriate reference to your work.
+//
+// It would be nice to CC: <Cokus at math.washington.edu> when you write.
+//
+
+//#include <stdio.h>
+//#include <stdlib.h>
+
+//
+// uint32 must be an unsigned integer type capable of holding at least 32
+// bits; exactly 32 should be fastest, but 64 is better on an Alpha with
+// GCC at -O3 optimization so try your options and see whats best for you
+//
+
+typedef unsigned long uint32;
+
+#define N              (624)                 // length of state vector
+#define M              (397)                 // a period parameter
+#define K              (0x9908B0DFU)         // a magic constant
+#define hiBit(u)       ((u) & 0x80000000U)   // mask all but highest   bit of u
+#define loBit(u)       ((u) & 0x00000001U)   // mask all but lowest    bit of u
+#define loBits(u)      ((u) & 0x7FFFFFFFU)   // mask     the highest   bit of u
+#define mixBits(u, v)  (hiBit(u)|loBits(v))  // move hi bit of u to hi bit of v
+
+static uint32   state[N+1];     // state vector + 1 extra to not violate ANSI C
+static uint32   *next;          // next random value is computed from here
+static int      left = -1;      // can *next++ this many times before reloading
+
+
+void seedMT(uint32 seed)
+ {
+    //
+    // We initialize state[0..(N-1)] via the generator
+    //
+    //   x_new = (69069 * x_old) mod 2^32
+    //
+    // from Line 15 of Table 1, p. 106, Sec. 3.3.4 of Knuths
+    // _The Art of Computer Programming_, Volume 2, 3rd ed.
+    //
+    // Notes (SJC): I do not know what the initial state requirements
+    // of the Mersenne Twister are, but it seems this seeding generator
+    // could be better.  It achieves the maximum period for its modulus
+    // (2^30) iff x_initial is odd (p. 20-21, Sec. 3.2.1.2, Knuth); if
+    // x_initial can be even, you have sequences like 0, 0, 0, ...;
+    // 2^31, 2^31, 2^31, ...; 2^30, 2^30, 2^30, ...; 2^29, 2^29 + 2^31,
+    // 2^29, 2^29 + 2^31, ..., etc. so I force seed to be odd below.
+    //
+    // Even if x_initial is odd, if x_initial is 1 mod 4 then
+    //
+    //   the          lowest bit of x is always 1,
+    //   the  next-to-lowest bit of x is always 0,
+    //   the 2nd-from-lowest bit of x alternates      ... 0 1 0 1 0 1 0 1 ... ,
+    //   the 3rd-from-lowest bit of x 4-cycles        ... 0 1 1 0 0 1 1 0 ... ,
+    //   the 4th-from-lowest bit of x has the 8-cycle ... 0 0 0 1 1 1 1 0 ... ,
+    //    ...
+    //
+    // and if x_initial is 3 mod 4 then
+    //
+    //   the          lowest bit of x is always 1,
+    //   the  next-to-lowest bit of x is always 1,
+    //   the 2nd-from-lowest bit of x alternates      ... 0 1 0 1 0 1 0 1 ... ,
+    //   the 3rd-from-lowest bit of x 4-cycles        ... 0 0 1 1 0 0 1 1 ... ,
+    //   the 4th-from-lowest bit of x has the 8-cycle ... 0 0 1 1 1 1 0 0 ... ,
+    //    ...
+    //
+    // The generators potency (min. s>=0 with (69069-1)^s = 0 mod 2^32) is
+    // 16, which seems to be alright by p. 25, Sec. 3.2.1.3 of Knuth.  It
+    // also does well in the dimension 2..5 spectral tests, but it could be
+    // better in dimension 6 (Line 15, Table 1, p. 106, Sec. 3.3.4, Knuth).
+    //
+    // Note that the random number user does not see the values generated
+    // here directly since reloadMT() will always munge them first, so maybe
+    // none of all of this matters.  In fact, the seed values made here could
+    // even be extra-special desirable if the Mersenne Twister theory says
+    // so-- thats why the only change I made is to restrict to odd seeds.
+    //
+
+    register uint32 x = (seed | 1U) & 0xFFFFFFFFU, *s = state;
+    register int    j;
+
+    for(left=0, *s++=x, j=N; --j;
+        *s++ = (x*=69069U) & 0xFFFFFFFFU);
+ }
+
+
+uint32 reloadMT(void)
+ {
+    register uint32 *p0=state, *p2=state+2, *pM=state+M, s0, s1;
+    register int    j;
+
+    if(left < -1)
+        seedMT(4357U);
+
+    left=N-1, next=state+1;
+
+    for(s0=state[0], s1=state[1], j=N-M+1; --j; s0=s1, s1=*p2++)
+        *p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U);
+
+    for(pM=state, j=M; --j; s0=s1, s1=*p2++)
+        *p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U);
+
+    s1=state[0], *p0 = *pM ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U);
+    s1 ^= (s1 >> 11);
+    s1 ^= (s1 <<  7) & 0x9D2C5680U;
+    s1 ^= (s1 << 15) & 0xEFC60000U;
+    return(s1 ^ (s1 >> 18));
+ }
+
+
+uint32 randomMT(void)
+ {
+    uint32 y;
+
+    if(--left < 0)
+        return(reloadMT());
+
+    y  = *next++;
+    y ^= (y >> 11);
+    y ^= (y <<  7) & 0x9D2C5680U;
+    y ^= (y << 15) & 0xEFC60000U;
+    y ^= (y >> 18);
+    return(y);
+ }
+
+/*
+ #define uint32 unsigned long
+#define SMALL_INT char
+#define SMALL_INT_CLASS mxCHAR_CLASS
+void seedMT(uint32 seed);
+uint32 randomMT(void);
+
+#include "stdio.h"
+#include "math.h"
+
+int main(void)
+ {
+    int j;
+
+    // you can seed with any uint32, but the best are odds in 0..(2^32 - 1)
+
+    seedMT(4357U);
+    uint32 MAX=pow(2,32)-1;
+    // print the first 2,002 random numbers seven to a line as an example
+
+    for(j=0; j<2002; j++)
+        printf(" %10lu%s", (unsigned long) randomMT(), (j%7)==6 ? "\n" : "");
+    
+    for(j=0; j<2002; j++)
+        printf(" %f%s", ((double)randomMT()/(double)MAX), (j%7)==6 ? "\n" : "");
+
+    
+    return(1);
+ }
+*/
+
+
diff --git a/src/format.cc b/src/format.cc
new file mode 100644
index 0000000..9fd32de
--- /dev/null
+++ b/src/format.cc
@@ -0,0 +1,1213 @@
+/*
+ Formatting library for C++
+
+ Copyright (c) 2012 - 2015, Victor Zverovich
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+    list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "format.h"
+
+#include <string.h>
+
+#include <cctype>
+#include <cerrno>
+#include <climits>
+#include <cmath>
+#include <cstdarg>
+
+#ifdef _WIN32
+# ifdef __MINGW32__
+#  include <cstring>
+# endif
+# if defined(NOMINMAX) || defined(FMT_WIN_MINMAX)
+#  include <windows.h>
+# else
+#  define NOMINMAX
+#  include <windows.h>
+#  undef NOMINMAX
+# endif
+#endif
+
+using fmt::internal::Arg;
+
+// Check if exceptions are disabled.
+#if __GNUC__ && !__EXCEPTIONS
+# define FMT_EXCEPTIONS 0
+#endif
+#if _MSC_VER && !_HAS_EXCEPTIONS
+# define FMT_EXCEPTIONS 0
+#endif
+#ifndef FMT_EXCEPTIONS
+# define FMT_EXCEPTIONS 1
+#endif
+
+#if FMT_EXCEPTIONS
+# define FMT_TRY try
+# define FMT_CATCH(x) catch (x)
+#else
+# define FMT_TRY if (true)
+# define FMT_CATCH(x) if (false)
+#endif
+
+#ifndef FMT_THROW
+# if FMT_EXCEPTIONS
+#  define FMT_THROW(x) throw x
+# else
+#  define FMT_THROW(x) assert(false)
+# endif
+#endif
+
+#ifdef FMT_HEADER_ONLY
+# define FMT_FUNC inline
+#else
+# define FMT_FUNC
+#endif
+
+#if _MSC_VER
+# pragma warning(push)
+# pragma warning(disable: 4127)  // conditional expression is constant
+# pragma warning(disable: 4702)  // unreachable code
+// Disable deprecation warning for strerror. The latter is not called but
+// MSVC fails to detect it.
+# pragma warning(disable: 4996)
+#endif
+
+// Dummy implementations of strerror_r and strerror_s called if corresponding
+// system functions are not available.
+static inline fmt::internal::None<> strerror_r(int, char *, ...) {
+  return fmt::internal::None<>();
+}
+static inline fmt::internal::None<> strerror_s(char *, std::size_t, ...) {
+  return fmt::internal::None<>();
+}
+
+namespace {
+
+#ifndef _MSC_VER
+# define FMT_SNPRINTF snprintf
+#else  // _MSC_VER
+inline int fmt_snprintf(char *buffer, size_t size, const char *format, ...) {
+  va_list args;
+  va_start(args, format);
+  int result = vsnprintf_s(buffer, size, _TRUNCATE, format, args);
+  va_end(args);
+  return result;
+}
+# define FMT_SNPRINTF fmt_snprintf
+#endif  // _MSC_VER
+
+#if defined(_WIN32) && defined(__MINGW32__) && !defined(__NO_ISOCEXT)
+# define FMT_SWPRINTF snwprintf
+#else
+# define FMT_SWPRINTF swprintf
+#endif // defined(_WIN32) && defined(__MINGW32__) && !defined(__NO_ISOCEXT)
+
+// Checks if a value fits in int - used to avoid warnings about comparing
+// signed and unsigned integers.
+template <bool IsSigned>
+struct IntChecker {
+  template <typename T>
+  static bool fits_in_int(T value) {
+    unsigned max = INT_MAX;
+    return value <= max;
+  }
+};
+
+template <>
+struct IntChecker<true> {
+  template <typename T>
+  static bool fits_in_int(T value) {
+    return value >= INT_MIN && value <= INT_MAX;
+  }
+};
+
+const char RESET_COLOR[] = "\x1b[0m";
+
+typedef void (*FormatFunc)(fmt::Writer &, int, fmt::StringRef);
+
+// Portable thread-safe version of strerror.
+// Sets buffer to point to a string describing the error code.
+// This can be either a pointer to a string stored in buffer,
+// or a pointer to some static immutable string.
+// Returns one of the following values:
+//   0      - success
+//   ERANGE - buffer is not large enough to store the error message
+//   other  - failure
+// Buffer should be at least of size 1.
+int safe_strerror(
+    int error_code, char *&buffer, std::size_t buffer_size) FMT_NOEXCEPT {
+  assert(buffer != 0 && buffer_size != 0);
+
+  class StrError {
+   private:
+    int error_code_;
+    char *&buffer_;
+    std::size_t buffer_size_;
+
+    // A noop assignment operator to avoid bogus warnings.
+    void operator=(const StrError &) {}
+
+    // Handle the result of XSI-compliant version of strerror_r.
+    int handle(int result) {
+      // glibc versions before 2.13 return result in errno.
+      return result == -1 ? errno : result;
+    }
+
+    // Handle the result of GNU-specific version of strerror_r.
+    int handle(char *message) {
+      // If the buffer is full then the message is probably truncated.
+      if (message == buffer_ && strlen(buffer_) == buffer_size_ - 1)
+        return ERANGE;
+      buffer_ = message;
+      return 0;
+    }
+
+    // Handle the case when strerror_r is not available.
+    int handle(fmt::internal::None<>) {
+      return fallback(strerror_s(buffer_, buffer_size_, error_code_));
+    }
+
+    // Fallback to strerror_s when strerror_r is not available.
+    int fallback(int result) {
+      // If the buffer is full then the message is probably truncated.
+      return result == 0 && strlen(buffer_) == buffer_size_ - 1 ?
+            ERANGE : result;
+    }
+
+    // Fallback to strerror if strerror_r and strerror_s are not available.
+    int fallback(fmt::internal::None<>) {
+      errno = 0;
+      buffer_ = strerror(error_code_);
+      return errno;
+    }
+
+   public:
+    StrError(int error_code, char *&buffer, std::size_t buffer_size)
+      : error_code_(error_code), buffer_(buffer), buffer_size_(buffer_size) {}
+
+    int run() { return handle(strerror_r(error_code_, buffer_, buffer_size_)); }
+  };
+  return StrError(error_code, buffer, buffer_size).run();
+}
+
+void format_error_code(fmt::Writer &out, int error_code,
+                       fmt::StringRef message) FMT_NOEXCEPT {
+  // Report error code making sure that the output fits into
+  // INLINE_BUFFER_SIZE to avoid dynamic memory allocation and potential
+  // bad_alloc.
+  out.clear();
+  static const char SEP[] = ": ";
+  static const char ERROR_STR[] = "error ";
+  fmt::internal::IntTraits<int>::MainType ec_value = error_code;
+  // Subtract 2 to account for terminating null characters in SEP and ERROR_STR.
+  std::size_t error_code_size = sizeof(SEP) + sizeof(ERROR_STR) - 2;
+  error_code_size += fmt::internal::count_digits(ec_value);
+  if (message.size() <= fmt::internal::INLINE_BUFFER_SIZE - error_code_size)
+    out << message << SEP;
+  out << ERROR_STR << error_code;
+  assert(out.size() <= fmt::internal::INLINE_BUFFER_SIZE);
+}
+
+void report_error(FormatFunc func,
+    int error_code, fmt::StringRef message) FMT_NOEXCEPT {
+  fmt::MemoryWriter full_message;
+  func(full_message, error_code, message);
+  // Use Writer::data instead of Writer::c_str to avoid potential memory
+  // allocation.
+  std::fwrite(full_message.data(), full_message.size(), 1, stderr);
+  std::fputc('\n', stderr);
+}
+
+// IsZeroInt::visit(arg) returns true iff arg is a zero integer.
+class IsZeroInt : public fmt::internal::ArgVisitor<IsZeroInt, bool> {
+ public:
+  template <typename T>
+  bool visit_any_int(T value) { return value == 0; }
+};
+
+// Parses an unsigned integer advancing s to the end of the parsed input.
+// This function assumes that the first character of s is a digit.
+template <typename Char>
+int parse_nonnegative_int(const Char *&s) {
+  assert('0' <= *s && *s <= '9');
+  unsigned value = 0;
+  do {
+    unsigned new_value = value * 10 + (*s++ - '0');
+    // Check if value wrapped around.
+    if (new_value < value) {
+      value = UINT_MAX;
+      break;
+    }
+    value = new_value;
+  } while ('0' <= *s && *s <= '9');
+  if (value > INT_MAX)
+    FMT_THROW(fmt::FormatError("number is too big"));
+  return value;
+}
+
+inline void require_numeric_argument(const Arg &arg, char spec) {
+  if (arg.type > Arg::LAST_NUMERIC_TYPE) {
+    std::string message =
+        fmt::format("format specifier '{}' requires numeric argument", spec);
+    FMT_THROW(fmt::FormatError(message));
+  }
+}
+
+template <typename Char>
+void check_sign(const Char *&s, const Arg &arg) {
+  char sign = static_cast<char>(*s);
+  require_numeric_argument(arg, sign);
+  if (arg.type == Arg::UINT || arg.type == Arg::ULONG_LONG) {
+    FMT_THROW(fmt::FormatError(fmt::format(
+      "format specifier '{}' requires signed argument", sign)));
+  }
+  ++s;
+}
+
+// Checks if an argument is a valid printf width specifier and sets
+// left alignment if it is negative.
+class WidthHandler : public fmt::internal::ArgVisitor<WidthHandler, unsigned> {
+ private:
+  fmt::FormatSpec &spec_;
+
+  FMT_DISALLOW_COPY_AND_ASSIGN(WidthHandler);
+
+ public:
+  explicit WidthHandler(fmt::FormatSpec &spec) : spec_(spec) {}
+
+  void report_unhandled_arg() {
+    FMT_THROW(fmt::FormatError("width is not integer"));
+  }
+
+  template <typename T>
+  unsigned visit_any_int(T value) {
+    typedef typename fmt::internal::IntTraits<T>::MainType UnsignedType;
+    UnsignedType width = value;
+    if (fmt::internal::is_negative(value)) {
+      spec_.align_ = fmt::ALIGN_LEFT;
+      width = 0 - width;
+    }
+    if (width > INT_MAX)
+      FMT_THROW(fmt::FormatError("number is too big"));
+    return static_cast<unsigned>(width);
+  }
+};
+
+class PrecisionHandler :
+    public fmt::internal::ArgVisitor<PrecisionHandler, int> {
+ public:
+  void report_unhandled_arg() {
+    FMT_THROW(fmt::FormatError("precision is not integer"));
+  }
+
+  template <typename T>
+  int visit_any_int(T value) {
+    if (!IntChecker<std::numeric_limits<T>::is_signed>::fits_in_int(value))
+      FMT_THROW(fmt::FormatError("number is too big"));
+    return static_cast<int>(value);
+  }
+};
+
+// Converts an integer argument to an integral type T for printf.
+template <typename T>
+class ArgConverter : public fmt::internal::ArgVisitor<ArgConverter<T>, void> {
+ private:
+  fmt::internal::Arg &arg_;
+  wchar_t type_;
+
+  FMT_DISALLOW_COPY_AND_ASSIGN(ArgConverter);
+
+ public:
+  ArgConverter(fmt::internal::Arg &arg, wchar_t type)
+    : arg_(arg), type_(type) {}
+
+  template <typename U>
+  void visit_any_int(U value) {
+    bool is_signed = type_ == 'd' || type_ == 'i';
+    using fmt::internal::Arg;
+    if (sizeof(T) <= sizeof(int)) {
+      // Extra casts are used to silence warnings.
+      if (is_signed) {
+        arg_.type = Arg::INT;
+        arg_.int_value = static_cast<int>(static_cast<T>(value));
+      } else {
+        arg_.type = Arg::UINT;
+        arg_.uint_value = static_cast<unsigned>(
+            static_cast<typename fmt::internal::MakeUnsigned<T>::Type>(value));
+      }
+    } else {
+      if (is_signed) {
+        arg_.type = Arg::LONG_LONG;
+        arg_.long_long_value =
+            static_cast<typename fmt::internal::MakeUnsigned<U>::Type>(value);
+      } else {
+        arg_.type = Arg::ULONG_LONG;
+        arg_.ulong_long_value =
+            static_cast<typename fmt::internal::MakeUnsigned<U>::Type>(value);
+      }
+    }
+  }
+};
+
+// Converts an integer argument to char for printf.
+class CharConverter : public fmt::internal::ArgVisitor<CharConverter, void> {
+ private:
+  fmt::internal::Arg &arg_;
+
+  FMT_DISALLOW_COPY_AND_ASSIGN(CharConverter);
+
+ public:
+  explicit CharConverter(fmt::internal::Arg &arg) : arg_(arg) {}
+
+  template <typename T>
+  void visit_any_int(T value) {
+    arg_.type = Arg::CHAR;
+    arg_.int_value = static_cast<char>(value);
+  }
+};
+
+// This function template is used to prevent compile errors when handling
+// incompatible string arguments, e.g. handling a wide string in a narrow
+// string formatter.
+template <typename Char>
+Arg::StringValue<Char> ignore_incompatible_str(Arg::StringValue<wchar_t>);
+
+template <>
+inline Arg::StringValue<char> ignore_incompatible_str(
+    Arg::StringValue<wchar_t>) { return Arg::StringValue<char>(); }
+
+template <>
+inline Arg::StringValue<wchar_t> ignore_incompatible_str(
+    Arg::StringValue<wchar_t> s) { return s; }
+}  // namespace
+
+FMT_FUNC void fmt::SystemError::init(
+    int err_code, StringRef format_str, ArgList args) {
+  error_code_ = err_code;
+  MemoryWriter w;
+  internal::format_system_error(w, err_code, format(format_str, args));
+  std::runtime_error &base = *this;
+  base = std::runtime_error(w.str());
+}
+
+template <typename T>
+int fmt::internal::CharTraits<char>::format_float(
+    char *buffer, std::size_t size, const char *format,
+    unsigned width, int precision, T value) {
+  if (width == 0) {
+    return precision < 0 ?
+        FMT_SNPRINTF(buffer, size, format, value) :
+        FMT_SNPRINTF(buffer, size, format, precision, value);
+  }
+  return precision < 0 ?
+      FMT_SNPRINTF(buffer, size, format, width, value) :
+      FMT_SNPRINTF(buffer, size, format, width, precision, value);
+}
+
+template <typename T>
+int fmt::internal::CharTraits<wchar_t>::format_float(
+    wchar_t *buffer, std::size_t size, const wchar_t *format,
+    unsigned width, int precision, T value) {
+  if (width == 0) {
+    return precision < 0 ?
+        FMT_SWPRINTF(buffer, size, format, value) :
+        FMT_SWPRINTF(buffer, size, format, precision, value);
+  }
+  return precision < 0 ?
+      FMT_SWPRINTF(buffer, size, format, width, value) :
+      FMT_SWPRINTF(buffer, size, format, width, precision, value);
+}
+
+template <typename T>
+const char fmt::internal::BasicData<T>::DIGITS[] =
+    "0001020304050607080910111213141516171819"
+    "2021222324252627282930313233343536373839"
+    "4041424344454647484950515253545556575859"
+    "6061626364656667686970717273747576777879"
+    "8081828384858687888990919293949596979899";
+
+#define FMT_POWERS_OF_10(factor) \
+  factor * 10, \
+  factor * 100, \
+  factor * 1000, \
+  factor * 10000, \
+  factor * 100000, \
+  factor * 1000000, \
+  factor * 10000000, \
+  factor * 100000000, \
+  factor * 1000000000
+
+template <typename T>
+const uint32_t fmt::internal::BasicData<T>::POWERS_OF_10_32[] = {
+  0, FMT_POWERS_OF_10(1)
+};
+
+template <typename T>
+const uint64_t fmt::internal::BasicData<T>::POWERS_OF_10_64[] = {
+  0,
+  FMT_POWERS_OF_10(1),
+  FMT_POWERS_OF_10(fmt::ULongLong(1000000000)),
+  // Multiply several constants instead of using a single long long constant
+  // to avoid warnings about C++98 not supporting long long.
+  fmt::ULongLong(1000000000) * fmt::ULongLong(1000000000) * 10
+};
+
+FMT_FUNC void fmt::internal::report_unknown_type(char code, const char *type) {
+  (void)type;
+  if (std::isprint(static_cast<unsigned char>(code))) {
+    FMT_THROW(fmt::FormatError(
+        fmt::format("unknown format code '{}' for {}", code, type)));
+  }
+  FMT_THROW(fmt::FormatError(
+      fmt::format("unknown format code '\\x{:02x}' for {}",
+        static_cast<unsigned>(code), type)));
+}
+
+#ifdef _WIN32
+
+FMT_FUNC fmt::internal::UTF8ToUTF16::UTF8ToUTF16(fmt::StringRef s) {
+  int length = MultiByteToWideChar(
+      CP_UTF8, MB_ERR_INVALID_CHARS, s.c_str(), -1, 0, 0);
+  static const char ERROR_MSG[] = "cannot convert string from UTF-8 to UTF-16";
+  if (length == 0)
+    FMT_THROW(WindowsError(GetLastError(), ERROR_MSG));
+  buffer_.resize(length);
+  length = MultiByteToWideChar(
+    CP_UTF8, MB_ERR_INVALID_CHARS, s.c_str(), -1, &buffer_[0], length);
+  if (length == 0)
+    FMT_THROW(WindowsError(GetLastError(), ERROR_MSG));
+}
+
+FMT_FUNC fmt::internal::UTF16ToUTF8::UTF16ToUTF8(fmt::WStringRef s) {
+  if (int error_code = convert(s)) {
+    FMT_THROW(WindowsError(error_code,
+        "cannot convert string from UTF-16 to UTF-8"));
+  }
+}
+
+FMT_FUNC int fmt::internal::UTF16ToUTF8::convert(fmt::WStringRef s) {
+  int length = WideCharToMultiByte(CP_UTF8, 0, s.c_str(), -1, 0, 0, 0, 0);
+  if (length == 0)
+    return GetLastError();
+  buffer_.resize(length);
+  length = WideCharToMultiByte(
+    CP_UTF8, 0, s.c_str(), -1, &buffer_[0], length, 0, 0);
+  if (length == 0)
+    return GetLastError();
+  return 0;
+}
+
+FMT_FUNC void fmt::WindowsError::init(
+    int err_code, StringRef format_str, ArgList args) {
+  error_code_ = err_code;
+  MemoryWriter w;
+  internal::format_windows_error(w, err_code, format(format_str, args));
+  std::runtime_error &base = *this;
+  base = std::runtime_error(w.str());
+}
+
+#endif
+
+FMT_FUNC void fmt::internal::format_system_error(
+    fmt::Writer &out, int error_code,
+    fmt::StringRef message) FMT_NOEXCEPT {
+  FMT_TRY {
+    MemoryBuffer<char, INLINE_BUFFER_SIZE> buffer;
+    buffer.resize(INLINE_BUFFER_SIZE);
+    for (;;) {
+      char *system_message = &buffer[0];
+      int result = safe_strerror(error_code, system_message, buffer.size());
+      if (result == 0) {
+        out << message << ": " << system_message;
+        return;
+      }
+      if (result != ERANGE)
+        break;  // Can't get error message, report error code instead.
+      buffer.resize(buffer.size() * 2);
+    }
+  } FMT_CATCH(...) {}
+  format_error_code(out, error_code, message);
+}
+
+#ifdef _WIN32
+FMT_FUNC void fmt::internal::format_windows_error(
+    fmt::Writer &out, int error_code,
+    fmt::StringRef message) FMT_NOEXCEPT {
+  class String {
+   private:
+    LPWSTR str_;
+
+   public:
+    String() : str_() {}
+    ~String() { LocalFree(str_); }
+    LPWSTR *ptr() { return &str_; }
+    LPCWSTR c_str() const { return str_; }
+  };
+  FMT_TRY {
+    String system_message;
+    if (FormatMessageW(FORMAT_MESSAGE_ALLOCATE_BUFFER |
+        FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, 0,
+        error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+        reinterpret_cast<LPWSTR>(system_message.ptr()), 0, 0)) {
+      UTF16ToUTF8 utf8_message;
+      if (utf8_message.convert(system_message.c_str()) == ERROR_SUCCESS) {
+        out << message << ": " << utf8_message;
+        return;
+      }
+    }
+  } FMT_CATCH(...) {}
+  format_error_code(out, error_code, message);
+}
+#endif
+
+// An argument formatter.
+template <typename Char>
+class fmt::internal::ArgFormatter :
+    public fmt::internal::ArgVisitor<fmt::internal::ArgFormatter<Char>, void> {
+ private:
+  fmt::BasicFormatter<Char> &formatter_;
+  fmt::BasicWriter<Char> &writer_;
+  fmt::FormatSpec &spec_;
+  const Char *format_;
+
+  FMT_DISALLOW_COPY_AND_ASSIGN(ArgFormatter);
+
+ public:
+  ArgFormatter(
+      fmt::BasicFormatter<Char> &f,fmt::FormatSpec &s, const Char *fmt)
+  : formatter_(f), writer_(f.writer()), spec_(s), format_(fmt) {}
+
+  template <typename T>
+  void visit_any_int(T value) { writer_.write_int(value, spec_); }
+
+  template <typename T>
+  void visit_any_double(T value) { writer_.write_double(value, spec_); }
+
+  void visit_char(int value) {
+    if (spec_.type_ && spec_.type_ != 'c') {
+      spec_.flags_ |= CHAR_FLAG;
+      writer_.write_int(value, spec_);
+      return;
+    }
+    if (spec_.align_ == ALIGN_NUMERIC || spec_.flags_ != 0)
+      FMT_THROW(FormatError("invalid format specifier for char"));
+    typedef typename fmt::BasicWriter<Char>::CharPtr CharPtr;
+    Char fill = static_cast<Char>(spec_.fill());
+    if (spec_.precision_ == 0) {
+      std::fill_n(writer_.grow_buffer(spec_.width_), spec_.width_, fill);
+      return;
+    }
+    CharPtr out = CharPtr();
+    if (spec_.width_ > 1) {
+      out = writer_.grow_buffer(spec_.width_);
+      if (spec_.align_ == fmt::ALIGN_RIGHT) {
+        std::fill_n(out, spec_.width_ - 1, fill);
+        out += spec_.width_ - 1;
+      } else if (spec_.align_ == fmt::ALIGN_CENTER) {
+        out = writer_.fill_padding(out, spec_.width_, 1, fill);
+      } else {
+        std::fill_n(out + 1, spec_.width_ - 1, fill);
+      }
+    } else {
+      out = writer_.grow_buffer(1);
+    }
+    *out = static_cast<Char>(value);
+  }
+
+  void visit_string(Arg::StringValue<char> value) {
+    writer_.write_str(value, spec_);
+  }
+  void visit_wstring(Arg::StringValue<wchar_t> value) {
+    writer_.write_str(ignore_incompatible_str<Char>(value), spec_);
+  }
+
+  void visit_pointer(const void *value) {
+    if (spec_.type_ && spec_.type_ != 'p')
+      fmt::internal::report_unknown_type(spec_.type_, "pointer");
+    spec_.flags_ = fmt::HASH_FLAG;
+    spec_.type_ = 'x';
+    writer_.write_int(reinterpret_cast<uintptr_t>(value), spec_);
+  }
+
+  void visit_custom(Arg::CustomValue c) {
+    c.format(&formatter_, c.value, &format_);
+  }
+};
+
+template <typename Char>
+void fmt::internal::FixedBuffer<Char>::grow(std::size_t) {
+  FMT_THROW(std::runtime_error("buffer overflow"));
+}
+
+template <typename Char>
+template <typename StrChar>
+void fmt::BasicWriter<Char>::write_str(
+    const Arg::StringValue<StrChar> &s, const FormatSpec &spec) {
+  // Check if StrChar is convertible to Char.
+  internal::CharTraits<Char>::convert(StrChar());
+  if (spec.type_ && spec.type_ != 's')
+    internal::report_unknown_type(spec.type_, "string");
+  const StrChar *str_value = s.value;
+  std::size_t str_size = s.size;
+  if (str_size == 0) {
+    if (!str_value)
+      FMT_THROW(FormatError("string pointer is null"));
+    if (*str_value)
+      str_size = std::char_traits<StrChar>::length(str_value);
+  }
+  std::size_t precision = spec.precision_;
+  if (spec.precision_ >= 0 && precision < str_size)
+    str_size = spec.precision_;
+  write_str(str_value, str_size, spec);
+}
+
+template <typename Char>
+inline Arg fmt::BasicFormatter<Char>::parse_arg_index(const Char *&s) {
+  const char *error = 0;
+  Arg arg = *s < '0' || *s > '9' ?
+        next_arg(error) : get_arg(parse_nonnegative_int(s), error);
+  if (error) {
+    FMT_THROW(FormatError(
+                *s != '}' && *s != ':' ? "invalid format string" : error));
+  }
+  return arg;
+}
+
+FMT_FUNC Arg fmt::internal::FormatterBase::do_get_arg(
+    unsigned arg_index, const char *&error) {
+  Arg arg = args_[arg_index];
+  if (arg.type == Arg::NONE)
+    error = "argument index out of range";
+  return arg;
+}
+
+inline Arg fmt::internal::FormatterBase::next_arg(const char *&error) {
+  if (next_arg_index_ >= 0)
+    return do_get_arg(next_arg_index_++, error);
+  error = "cannot switch from manual to automatic argument indexing";
+  return Arg();
+}
+
+inline Arg fmt::internal::FormatterBase::get_arg(
+    unsigned arg_index, const char *&error) {
+  if (next_arg_index_ <= 0) {
+    next_arg_index_ = -1;
+    return do_get_arg(arg_index, error);
+  }
+  error = "cannot switch from automatic to manual argument indexing";
+  return Arg();
+}
+
+template <typename Char>
+void fmt::internal::PrintfFormatter<Char>::parse_flags(
+    FormatSpec &spec, const Char *&s) {
+  for (;;) {
+    switch (*s++) {
+      case '-':
+        spec.align_ = ALIGN_LEFT;
+        break;
+      case '+':
+        spec.flags_ |= SIGN_FLAG | PLUS_FLAG;
+        break;
+      case '0':
+        spec.fill_ = '0';
+        break;
+      case ' ':
+        spec.flags_ |= SIGN_FLAG;
+        break;
+      case '#':
+        spec.flags_ |= HASH_FLAG;
+        break;
+      default:
+        --s;
+        return;
+    }
+  }
+}
+
+template <typename Char>
+Arg fmt::internal::PrintfFormatter<Char>::get_arg(
+    const Char *s, unsigned arg_index) {
+  (void)s;
+  const char *error = 0;
+  Arg arg = arg_index == UINT_MAX ?
+    next_arg(error) : FormatterBase::get_arg(arg_index - 1, error);
+  if (error)
+    FMT_THROW(FormatError(!*s ? "invalid format string" : error));
+  return arg;
+}
+
+template <typename Char>
+unsigned fmt::internal::PrintfFormatter<Char>::parse_header(
+  const Char *&s, FormatSpec &spec) {
+  unsigned arg_index = UINT_MAX;
+  Char c = *s;
+  if (c >= '0' && c <= '9') {
+    // Parse an argument index (if followed by '$') or a width possibly
+    // preceded with '0' flag(s).
+    unsigned value = parse_nonnegative_int(s);
+    if (*s == '$') {  // value is an argument index
+      ++s;
+      arg_index = value;
+    } else {
+      if (c == '0')
+        spec.fill_ = '0';
+      if (value != 0) {
+        // Nonzero value means that we parsed width and don't need to
+        // parse it or flags again, so return now.
+        spec.width_ = value;
+        return arg_index;
+      }
+    }
+  }
+  parse_flags(spec, s);
+  // Parse width.
+  if (*s >= '0' && *s <= '9') {
+    spec.width_ = parse_nonnegative_int(s);
+  } else if (*s == '*') {
+    ++s;
+    spec.width_ = WidthHandler(spec).visit(get_arg(s));
+  }
+  return arg_index;
+}
+
+template <typename Char>
+void fmt::internal::PrintfFormatter<Char>::format(
+    BasicWriter<Char> &writer, BasicStringRef<Char> format_str,
+    const ArgList &args) {
+  const Char *start = format_str.c_str();
+  set_args(args);
+  const Char *s = start;
+  while (*s) {
+    Char c = *s++;
+    if (c != '%') continue;
+    if (*s == c) {
+      write(writer, start, s);
+      start = ++s;
+      continue;
+    }
+    write(writer, start, s - 1);
+
+    FormatSpec spec;
+    spec.align_ = ALIGN_RIGHT;
+
+    // Parse argument index, flags and width.
+    unsigned arg_index = parse_header(s, spec);
+
+    // Parse precision.
+    if (*s == '.') {
+      ++s;
+      if ('0' <= *s && *s <= '9') {
+        spec.precision_ = parse_nonnegative_int(s);
+      } else if (*s == '*') {
+        ++s;
+        spec.precision_ = PrecisionHandler().visit(get_arg(s));
+      }
+    }
+
+    Arg arg = get_arg(s, arg_index);
+    if (spec.flag(HASH_FLAG) && IsZeroInt().visit(arg))
+      spec.flags_ &= ~HASH_FLAG;
+    if (spec.fill_ == '0') {
+      if (arg.type <= Arg::LAST_NUMERIC_TYPE)
+        spec.align_ = ALIGN_NUMERIC;
+      else
+        spec.fill_ = ' ';  // Ignore '0' flag for non-numeric types.
+    }
+
+    // Parse length and convert the argument to the required type.
+    switch (*s++) {
+    case 'h':
+      if (*s == 'h')
+        ArgConverter<signed char>(arg, *++s).visit(arg);
+      else
+        ArgConverter<short>(arg, *s).visit(arg);
+      break;
+    case 'l':
+      if (*s == 'l')
+        ArgConverter<fmt::LongLong>(arg, *++s).visit(arg);
+      else
+        ArgConverter<long>(arg, *s).visit(arg);
+      break;
+    case 'j':
+      ArgConverter<intmax_t>(arg, *s).visit(arg);
+      break;
+    case 'z':
+      ArgConverter<size_t>(arg, *s).visit(arg);
+      break;
+    case 't':
+      ArgConverter<ptrdiff_t>(arg, *s).visit(arg);
+      break;
+    case 'L':
+      // printf produces garbage when 'L' is omitted for long double, no
+      // need to do the same.
+      break;
+    default:
+      --s;
+      ArgConverter<int>(arg, *s).visit(arg);
+    }
+
+    // Parse type.
+    if (!*s)
+      FMT_THROW(FormatError("invalid format string"));
+    spec.type_ = static_cast<char>(*s++);
+    if (arg.type <= Arg::LAST_INTEGER_TYPE) {
+      // Normalize type.
+      switch (spec.type_) {
+      case 'i': case 'u':
+        spec.type_ = 'd';
+        break;
+      case 'c':
+        // TODO: handle wchar_t
+        CharConverter(arg).visit(arg);
+        break;
+      }
+    }
+
+    start = s;
+
+    // Format argument.
+    switch (arg.type) {
+    case Arg::INT:
+      writer.write_int(arg.int_value, spec);
+      break;
+    case Arg::UINT:
+      writer.write_int(arg.uint_value, spec);
+      break;
+    case Arg::LONG_LONG:
+      writer.write_int(arg.long_long_value, spec);
+      break;
+    case Arg::ULONG_LONG:
+      writer.write_int(arg.ulong_long_value, spec);
+      break;
+    case Arg::CHAR: {
+      if (spec.type_ && spec.type_ != 'c')
+        writer.write_int(arg.int_value, spec);
+      typedef typename BasicWriter<Char>::CharPtr CharPtr;
+      CharPtr out = CharPtr();
+      if (spec.width_ > 1) {
+        Char fill = ' ';
+        out = writer.grow_buffer(spec.width_);
+        if (spec.align_ != ALIGN_LEFT) {
+          std::fill_n(out, spec.width_ - 1, fill);
+          out += spec.width_ - 1;
+        } else {
+          std::fill_n(out + 1, spec.width_ - 1, fill);
+        }
+      } else {
+        out = writer.grow_buffer(1);
+      }
+      *out = static_cast<Char>(arg.int_value);
+      break;
+    }
+    case Arg::DOUBLE:
+      writer.write_double(arg.double_value, spec);
+      break;
+    case Arg::LONG_DOUBLE:
+      writer.write_double(arg.long_double_value, spec);
+      break;
+    case Arg::CSTRING:
+      arg.string.size = 0;
+      writer.write_str(arg.string, spec);
+      break;
+    case Arg::STRING:
+      writer.write_str(arg.string, spec);
+      break;
+    case Arg::WSTRING:
+      writer.write_str(ignore_incompatible_str<Char>(arg.wstring), spec);
+      break;
+    case Arg::POINTER:
+      if (spec.type_ && spec.type_ != 'p')
+        internal::report_unknown_type(spec.type_, "pointer");
+      spec.flags_= HASH_FLAG;
+      spec.type_ = 'x';
+      writer.write_int(reinterpret_cast<uintptr_t>(arg.pointer), spec);
+      break;
+    case Arg::CUSTOM: {
+      if (spec.type_)
+        internal::report_unknown_type(spec.type_, "object");
+      const void *str_format = "s";
+      arg.custom.format(&writer, arg.custom.value, &str_format);
+      break;
+    }
+    default:
+      assert(false);
+      break;
+    }
+  }
+  write(writer, start, s);
+}
+
+template <typename Char>
+const Char *fmt::BasicFormatter<Char>::format(
+    const Char *&format_str, const Arg &arg) {
+  const Char *s = format_str;
+  FormatSpec spec;
+  if (*s == ':') {
+    if (arg.type == Arg::CUSTOM) {
+      arg.custom.format(this, arg.custom.value, &s);
+      return s;
+    }
+    ++s;
+    // Parse fill and alignment.
+    if (Char c = *s) {
+      const Char *p = s + 1;
+      spec.align_ = ALIGN_DEFAULT;
+      do {
+        switch (*p) {
+          case '<':
+            spec.align_ = ALIGN_LEFT;
+            break;
+          case '>':
+            spec.align_ = ALIGN_RIGHT;
+            break;
+          case '=':
+            spec.align_ = ALIGN_NUMERIC;
+            break;
+          case '^':
+            spec.align_ = ALIGN_CENTER;
+            break;
+        }
+        if (spec.align_ != ALIGN_DEFAULT) {
+          if (p != s) {
+            if (c == '}') break;
+            if (c == '{')
+              FMT_THROW(FormatError("invalid fill character '{'"));
+            s += 2;
+            spec.fill_ = c;
+          } else ++s;
+          if (spec.align_ == ALIGN_NUMERIC)
+            require_numeric_argument(arg, '=');
+          break;
+        }
+      } while (--p >= s);
+    }
+
+    // Parse sign.
+    switch (*s) {
+      case '+':
+        check_sign(s, arg);
+        spec.flags_ |= SIGN_FLAG | PLUS_FLAG;
+        break;
+      case '-':
+        check_sign(s, arg);
+        spec.flags_ |= MINUS_FLAG;
+        break;
+      case ' ':
+        check_sign(s, arg);
+        spec.flags_ |= SIGN_FLAG;
+        break;
+    }
+
+    if (*s == '#') {
+      require_numeric_argument(arg, '#');
+      spec.flags_ |= HASH_FLAG;
+      ++s;
+    }
+
+    // Parse width and zero flag.
+    if ('0' <= *s && *s <= '9') {
+      if (*s == '0') {
+        require_numeric_argument(arg, '0');
+        spec.align_ = ALIGN_NUMERIC;
+        spec.fill_ = '0';
+      }
+      // Zero may be parsed again as a part of the width, but it is simpler
+      // and more efficient than checking if the next char is a digit.
+      spec.width_ = parse_nonnegative_int(s);
+    }
+
+    // Parse precision.
+    if (*s == '.') {
+      ++s;
+      spec.precision_ = 0;
+      if ('0' <= *s && *s <= '9') {
+        spec.precision_ = parse_nonnegative_int(s);
+      } else if (*s == '{') {
+        ++s;
+        const Arg &precision_arg = parse_arg_index(s);
+        if (*s++ != '}')
+          FMT_THROW(FormatError("invalid format string"));
+        ULongLong value = 0;
+        switch (precision_arg.type) {
+          case Arg::INT:
+            if (precision_arg.int_value < 0)
+              FMT_THROW(FormatError("negative precision"));
+            value = precision_arg.int_value;
+            break;
+          case Arg::UINT:
+            value = precision_arg.uint_value;
+            break;
+          case Arg::LONG_LONG:
+            if (precision_arg.long_long_value < 0)
+              FMT_THROW(FormatError("negative precision"));
+            value = precision_arg.long_long_value;
+            break;
+          case Arg::ULONG_LONG:
+            value = precision_arg.ulong_long_value;
+            break;
+          default:
+            FMT_THROW(FormatError("precision is not integer"));
+        }
+        if (value > INT_MAX)
+          FMT_THROW(FormatError("number is too big"));
+        spec.precision_ = static_cast<int>(value);
+      } else {
+        FMT_THROW(FormatError("missing precision specifier"));
+      }
+      if (arg.type < Arg::LAST_INTEGER_TYPE || arg.type == Arg::POINTER) {
+        FMT_THROW(FormatError(
+            fmt::format("precision not allowed in {} format specifier",
+            arg.type == Arg::POINTER ? "pointer" : "integer")));
+      }
+    }
+
+    // Parse type.
+    if (*s != '}' && *s)
+      spec.type_ = static_cast<char>(*s++);
+  }
+
+  if (*s++ != '}')
+    FMT_THROW(FormatError("missing '}' in format string"));
+  start_ = s;
+
+  // Format argument.
+  internal::ArgFormatter<Char>(*this, spec, s - 1).visit(arg);
+  return s;
+}
+
+template <typename Char>
+void fmt::BasicFormatter<Char>::format(
+    BasicStringRef<Char> format_str, const ArgList &args) {
+  const Char *s = start_ = format_str.c_str();
+  set_args(args);
+  while (*s) {
+    Char c = *s++;
+    if (c != '{' && c != '}') continue;
+    if (*s == c) {
+      write(writer_, start_, s);
+      start_ = ++s;
+      continue;
+    }
+    if (c == '}')
+      FMT_THROW(FormatError("unmatched '}' in format string"));
+    write(writer_, start_, s - 1);
+    Arg arg = parse_arg_index(s);
+    s = format(s, arg);
+  }
+  write(writer_, start_, s);
+}
+
+FMT_FUNC void fmt::report_system_error(
+    int error_code, fmt::StringRef message) FMT_NOEXCEPT {
+  report_error(internal::format_system_error, error_code, message);
+}
+
+#ifdef _WIN32
+FMT_FUNC void fmt::report_windows_error(
+    int error_code, fmt::StringRef message) FMT_NOEXCEPT {
+  report_error(internal::format_windows_error, error_code, message);
+}
+#endif
+
+FMT_FUNC void fmt::print(std::FILE *f, StringRef format_str, ArgList args) {
+  MemoryWriter w;
+  w.write(format_str, args);
+  std::fwrite(w.data(), 1, w.size(), f);
+}
+
+FMT_FUNC void fmt::print(StringRef format_str, ArgList args) {
+  print(stdout, format_str, args);
+}
+
+FMT_FUNC void fmt::print(std::ostream &os, StringRef format_str, ArgList args) {
+  MemoryWriter w;
+  w.write(format_str, args);
+  os.write(w.data(), w.size());
+}
+
+FMT_FUNC void fmt::print_colored(Color c, StringRef format, ArgList args) {
+  char escape[] = "\x1b[30m";
+  escape[3] = '0' + static_cast<char>(c);
+  std::fputs(escape, stdout);
+  print(format, args);
+  std::fputs(RESET_COLOR, stdout);
+}
+
+FMT_FUNC int fmt::fprintf(std::FILE *f, StringRef format, ArgList args) {
+  MemoryWriter w;
+  printf(w, format, args);
+  std::size_t size = w.size();
+  return std::fwrite(w.data(), 1, size, f) < size ? -1 : static_cast<int>(size);
+}
+
+#ifndef FMT_HEADER_ONLY
+
+// Explicit instantiations for char.
+
+template void fmt::internal::FixedBuffer<char>::grow(std::size_t);
+
+template const char *fmt::BasicFormatter<char>::format(
+    const char *&format_str, const fmt::internal::Arg &arg);
+
+template void fmt::BasicFormatter<char>::format(
+  BasicStringRef<char> format, const ArgList &args);
+
+template void fmt::internal::PrintfFormatter<char>::format(
+  BasicWriter<char> &writer, BasicStringRef<char> format, const ArgList &args);
+
+template int fmt::internal::CharTraits<char>::format_float(
+    char *buffer, std::size_t size, const char *format,
+    unsigned width, int precision, double value);
+
+template int fmt::internal::CharTraits<char>::format_float(
+    char *buffer, std::size_t size, const char *format,
+    unsigned width, int precision, long double value);
+
+// Explicit instantiations for wchar_t.
+
+template void fmt::internal::FixedBuffer<wchar_t>::grow(std::size_t);
+
+template const wchar_t *fmt::BasicFormatter<wchar_t>::format(
+    const wchar_t *&format_str, const fmt::internal::Arg &arg);
+
+template void fmt::BasicFormatter<wchar_t>::format(
+    BasicStringRef<wchar_t> format, const ArgList &args);
+
+template void fmt::internal::PrintfFormatter<wchar_t>::format(
+    BasicWriter<wchar_t> &writer, BasicStringRef<wchar_t> format,
+    const ArgList &args);
+
+template int fmt::internal::CharTraits<wchar_t>::format_float(
+    wchar_t *buffer, std::size_t size, const wchar_t *format,
+    unsigned width, int precision, double value);
+
+template int fmt::internal::CharTraits<wchar_t>::format_float(
+    wchar_t *buffer, std::size_t size, const wchar_t *format,
+    unsigned width, int precision, long double value);
+
+#endif  // FMT_HEADER_ONLY
+
+#if _MSC_VER
+# pragma warning(pop)
+#endif
diff --git a/src/is.c b/src/is.c
new file mode 100644
index 0000000..46f1772
--- /dev/null
+++ b/src/is.c
@@ -0,0 +1,223 @@
+/*
+ * sais.c for sais-lite
+ * Copyright (c) 2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+
+#ifdef USE_MALLOC_WRAPPERS
+#  include "malloc_wrap.h"
+#endif
+
+typedef unsigned char ubyte_t;
+#define chr(i) (cs == sizeof(int) ? ((const int *)T)[i]:((const unsigned char *)T)[i])
+
+/* find the start or end of each bucket */
+static void getCounts(const unsigned char *T, int *C, int n, int k, int cs)
+{
+	int i;
+	for (i = 0; i < k; ++i) C[i] = 0;
+	for (i = 0; i < n; ++i) ++C[chr(i)];
+}
+static void getBuckets(const int *C, int *B, int k, int end)
+{
+	int i, sum = 0;
+	if (end) {
+		for (i = 0; i < k; ++i) {
+			sum += C[i];
+			B[i] = sum;
+		}
+	} else {
+		for (i = 0; i < k; ++i) {
+			sum += C[i];
+			B[i] = sum - C[i];
+		}
+	}
+}
+
+/* compute SA */
+static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs)
+{
+	int *b, i, j;
+	int  c0, c1;
+	/* compute SAl */
+	if (C == B) getCounts(T, C, n, k, cs);
+	getBuckets(C, B, k, 0);	/* find starts of buckets */
+	j = n - 1;
+	b = SA + B[c1 = chr(j)];
+	*b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j;
+	for (i = 0; i < n; ++i) {
+		j = SA[i], SA[i] = ~j;
+		if (0 < j) {
+			--j;
+			if ((c0 = chr(j)) != c1) {
+				B[c1] = b - SA;
+				b = SA + B[c1 = c0];
+			}
+			*b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j;
+		}
+	}
+	/* compute SAs */
+	if (C == B) getCounts(T, C, n, k, cs);
+	getBuckets(C, B, k, 1);	/* find ends of buckets */
+	for (i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) {
+		if (0 < (j = SA[i])) {
+			--j;
+			if ((c0 = chr(j)) != c1) {
+				B[c1] = b - SA;
+				b = SA + B[c1 = c0];
+			}
+			*--b = ((j == 0) || (chr(j - 1) > c1)) ? ~j : j;
+		} else SA[i] = ~j;
+	}
+}
+
+/*
+ * find the suffix array SA of T[0..n-1] in {0..k-1}^n use a working
+ * space (excluding T and SA) of at most 2n+O(1) for a constant alphabet
+ */
+static int sais_main(const unsigned char *T, int *SA, int fs, int n, int k, int cs)
+{
+	int *C, *B, *RA;
+	int  i, j, c, m, p, q, plen, qlen, name;
+	int  c0, c1;
+	int  diff;
+
+	/* stage 1: reduce the problem by at least 1/2 sort all the
+	 * S-substrings */
+	if (k <= fs) {
+		C = SA + n;
+		B = (k <= (fs - k)) ? C + k : C;
+	} else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2;
+	getCounts(T, C, n, k, cs);
+	getBuckets(C, B, k, 1);	/* find ends of buckets */
+	for (i = 0; i < n; ++i) SA[i] = 0;
+	for (i = n - 2, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
+		if ((c0 = chr(i)) < (c1 + c)) c = 1;
+		else if (c != 0) SA[--B[c1]] = i + 1, c = 0;
+	}
+	induceSA(T, SA, C, B, n, k, cs);
+	if (fs < k) free(C);
+	/* compact all the sorted substrings into the first m items of SA
+	 * 2*m must be not larger than n (proveable) */
+	for (i = 0, m = 0; i < n; ++i) {
+		p = SA[i];
+		if ((0 < p) && (chr(p - 1) > (c0 = chr(p)))) {
+			for (j = p + 1; (j < n) && (c0 == (c1 = chr(j))); ++j);
+			if ((j < n) && (c0 < c1)) SA[m++] = p;
+		}
+	}
+	for (i = m; i < n; ++i) SA[i] = 0;	/* init the name array buffer */
+	/* store the length of all substrings */
+	for (i = n - 2, j = n, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
+		if ((c0 = chr(i)) < (c1 + c)) c = 1;
+		else if (c != 0) {
+			SA[m + ((i + 1) >> 1)] = j - i - 1;
+			j = i + 1;
+			c = 0;
+		}
+	}
+	/* find the lexicographic names of all substrings */
+	for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) {
+		p = SA[i], plen = SA[m + (p >> 1)], diff = 1;
+		if (plen == qlen) {
+			for (j = 0; (j < plen) && (chr(p + j) == chr(q + j)); j++);
+			if (j == plen) diff = 0;
+		}
+		if (diff != 0) ++name, q = p, qlen = plen;
+		SA[m + (p >> 1)] = name;
+	}
+
+	/* stage 2: solve the reduced problem recurse if names are not yet
+	 * unique */
+	if (name < m) {
+		RA = SA + n + fs - m;
+		for (i = n - 1, j = m - 1; m <= i; --i) {
+			if (SA[i] != 0) RA[j--] = SA[i] - 1;
+		}
+		if (sais_main((unsigned char *) RA, SA, fs + n - m * 2, m, name, sizeof(int)) != 0) return -2;
+		for (i = n - 2, j = m - 1, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
+			if ((c0 = chr(i)) < (c1 + c)) c = 1;
+			else if (c != 0) RA[j--] = i + 1, c = 0; /* get p1 */
+		}
+		for (i = 0; i < m; ++i) SA[i] = RA[SA[i]]; /* get index */
+	}
+	/* stage 3: induce the result for the original problem */
+	if (k <= fs) {
+		C = SA + n;
+		B = (k <= (fs - k)) ? C + k : C;
+	} else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2;
+	/* put all left-most S characters into their buckets */
+	getCounts(T, C, n, k, cs);
+	getBuckets(C, B, k, 1);	/* find ends of buckets */
+	for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */
+	for (i = m - 1; 0 <= i; --i) {
+		j = SA[i], SA[i] = 0;
+		SA[--B[chr(j)]] = j;
+	}
+	induceSA(T, SA, C, B, n, k, cs);
+	if (fs < k) free(C);
+	return 0;
+}
+
+/**
+ * Constructs the suffix array of a given string.
+ * @param T[0..n-1] The input string.
+ * @param SA[0..n] The output array of suffixes.
+ * @param n The length of the given string.
+ * @return 0 if no error occurred
+ */
+int is_sa(const ubyte_t *T, int *SA, int n)
+{
+	if ((T == NULL) || (SA == NULL) || (n < 0)) return -1;
+	SA[0] = n;
+	if (n <= 1) {
+		if (n == 1) SA[1] = 0;
+		return 0;
+	}
+	return sais_main(T, SA+1, 0, n, 256, 1);
+}
+
+/**
+ * Constructs the burrows-wheeler transformed string of a given string.
+ * @param T[0..n-1] The input string.
+ * @param n The length of the given string.
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+int is_bwt(ubyte_t *T, int n)
+{
+	int *SA, i, primary = 0;
+	SA = (int*)calloc(n+1, sizeof(int));
+
+	if (is_sa(T, SA, n)) return -1;
+
+	for (i = 0; i <= n; ++i) {
+		if (SA[i] == 0) primary = i;
+		else SA[i] = T[SA[i] - 1];
+	}
+	for (i = 0; i < primary; ++i) T[i] = SA[i];
+	for (; i < n; ++i) T[i] = SA[i + 1];
+	free(SA);
+	return primary;
+}
diff --git a/src/merge_files.cc b/src/merge_files.cc
new file mode 100644
index 0000000..6203c60
--- /dev/null
+++ b/src/merge_files.cc
@@ -0,0 +1,149 @@
+/*  This file is part of Jellyfish.
+
+    Jellyfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Jellyfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Jellyfish.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "merge_files.hpp"
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <memory>
+#include <string>
+
+#include <jellyfish/err.hpp>
+#include <jellyfish/misc.hpp>
+#include <jellyfish/mer_heap.hpp>
+#include <jellyfish/jellyfish.hpp>
+#include <jellyfish/rectangular_binary_matrix.hpp>
+#include <jellyfish/cpp_array.hpp>
+
+using jellyfish::file_header;
+using jellyfish::RectangularBinaryMatrix;
+using jellyfish::mer_dna;
+using jellyfish::cpp_array;
+typedef std::auto_ptr<binary_reader> binary_reader_ptr;
+typedef std::auto_ptr<text_reader> text_reader_ptr;
+
+struct file_info {
+  std::ifstream is;
+  file_header   header;
+
+  file_info(const char* path) :
+  is(path),
+  header(is)
+  { }
+};
+typedef std::auto_ptr<RectangularBinaryMatrix> matrix_ptr;
+
+template<typename reader_type, typename writer_type>
+void do_merge(cpp_array<file_info>& files, std::ostream& out, writer_type& writer,
+              uint64_t min, uint64_t max) {
+  cpp_array<reader_type> readers(files.size());
+  typedef jellyfish::mer_heap::heap<mer_dna, reader_type> heap_type;
+  typedef typename heap_type::const_item_t heap_item;
+  heap_type heap(files.size());
+
+  for(size_t i = 0; i < files.size(); ++i) {
+    readers.init(i, files[i].is, &files[i].header);
+    if(readers[i].next())
+      heap.push(readers[i]);
+  }
+
+  heap_item head = heap.head();
+  mer_dna   key;
+  while(heap.is_not_empty()) {
+    key = head->key_;
+    uint64_t sum = 0;
+    do {
+      sum += head->val_;
+      heap.pop();
+      if(head->it_->next())
+        heap.push(*head->it_);
+      head = heap.head();
+    } while(head->key_ == key && heap.is_not_empty());
+    if(sum >= min && sum <= max)
+      writer.write(out, key, sum);
+  }
+}
+
+// Merge files. Throws an error if unsuccessful.
+void merge_files(std::vector<const char*> input_files,
+                 const char* out_file,
+                 file_header& out_header,
+                 uint64_t min, uint64_t max) {
+  unsigned int key_len            = 0;
+  size_t       max_reprobe_offset = 0;
+  size_t       size               = 0;
+  unsigned int out_counter_len    = std::numeric_limits<unsigned int>::max();
+  std::string  format;
+  matrix_ptr   matrix;
+
+  cpp_array<file_info> files(input_files.size());
+
+  // create an iterator for each hash file
+  for(size_t i = 0; i < files.size(); i++) {
+    files.init(i, input_files[i]);
+    if(!files[i].is.good())
+      eraise(MergeError) << "Failed to open input file '" << input_files[i] << "'";
+
+    file_header& h = files[i].header;
+    if(i == 0) {
+      key_len            = h.key_len();
+      max_reprobe_offset = h.max_reprobe_offset();
+      size               = h.size();
+      matrix.reset(new RectangularBinaryMatrix(h.matrix()));
+      out_header.size(size);
+      out_header.key_len(key_len);
+      format = h.format();
+      out_header.matrix(*matrix);
+      out_header.max_reprobe(h.max_reprobe());
+      size_t reprobes[h.max_reprobe() + 1];
+      h.get_reprobes(reprobes);
+      out_header.set_reprobes(reprobes);
+      out_counter_len = std::min(out_counter_len, h.counter_len());
+    } else {
+      if(format != h.format())
+        eraise(MergeError) << "Can't merge files with different formats (" << format << ", " << h.format() << ")";
+      if(h.key_len() != key_len)
+        eraise(MergeError) << "Can't merge hashes of different key lengths (" << key_len << ", " << h.key_len() << ")";
+      if(h.max_reprobe_offset() != max_reprobe_offset)
+        eraise(MergeError) << "Can't merge hashes with different reprobing strategies";
+      if(h.size() != size)
+        eraise(MergeError) << "Can't merge hash with different size (" << size << ", " << h.size() << ")";
+      if(h.matrix() != *matrix)
+        eraise(MergeError) << "Can't merge hash with different hash function";
+    }
+  }
+  mer_dna::k(key_len / 2);
+
+  std::ofstream out(out_file);
+  if(!out.good())
+    eraise(MergeError) << "Can't open out file '" << out_file << "'";
+  out_header.format(format);
+
+  if(!format.compare(binary_dumper::format)) {
+    out_header.counter_len(out_counter_len);
+    out_header.write(out);
+    binary_writer writer(out_counter_len, key_len);
+    do_merge<binary_reader, binary_writer>(files, out, writer, min, max);
+  } else if(!format.compare(text_dumper::format)) {
+    out_header.write(out);
+    text_writer writer;
+    do_merge<text_reader, text_writer>(files, out, writer, min, max);
+  } else {
+    eraise(MergeError) << "Unknown format '" << format << "'";
+  }
+  out.close();
+}
diff --git a/src/posix.cc b/src/posix.cc
new file mode 100644
index 0000000..0efb5af
--- /dev/null
+++ b/src/posix.cc
@@ -0,0 +1,252 @@
+/*
+ A C++ interface to POSIX functions.
+
+ Copyright (c) 2014 - 2015, Victor Zverovich
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+    list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Disable bogus MSVC warnings.
+#ifndef _CRT_SECURE_NO_WARNINGS
+# define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include "posix.h"
+
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#ifndef _WIN32
+# include <unistd.h>
+#else
+# include <windows.h>
+# include <io.h>
+
+# define O_CREAT _O_CREAT
+# define O_TRUNC _O_TRUNC
+
+# ifndef S_IRUSR
+#  define S_IRUSR _S_IREAD
+# endif
+
+# ifndef S_IWUSR
+#  define S_IWUSR _S_IWRITE
+# endif
+
+# ifdef __MINGW32__
+#  define _SH_DENYNO 0x40
+#  undef fileno
+# endif
+
+#endif  // _WIN32
+
+namespace {
+#ifdef _WIN32
+// Return type of read and write functions.
+typedef int RWResult;
+
+// On Windows the count argument to read and write is unsigned, so convert
+// it from size_t preventing integer overflow.
+inline unsigned convert_rwcount(std::size_t count) {
+  return count <= UINT_MAX ? static_cast<unsigned>(count) : UINT_MAX;
+}
+#else
+// Return type of read and write functions.
+typedef ssize_t RWResult;
+
+inline std::size_t convert_rwcount(std::size_t count) { return count; }
+#endif
+}
+
+fmt::BufferedFile::~BufferedFile() FMT_NOEXCEPT {
+  if (file_ && FMT_SYSTEM(fclose(file_)) != 0)
+    fmt::report_system_error(errno, "cannot close file");
+}
+
+fmt::BufferedFile::BufferedFile(fmt::StringRef filename, fmt::StringRef mode) {
+  FMT_RETRY_VAL(file_, FMT_SYSTEM(fopen(filename.c_str(), mode.c_str())), 0);
+  if (!file_)
+    throw SystemError(errno, "cannot open file {}", filename);
+}
+
+void fmt::BufferedFile::close() {
+  if (!file_)
+    return;
+  int result = FMT_SYSTEM(fclose(file_));
+  file_ = 0;
+  if (result != 0)
+    throw SystemError(errno, "cannot close file");
+}
+
+// A macro used to prevent expansion of fileno on broken versions of MinGW.
+#define FMT_ARGS
+
+int fmt::BufferedFile::fileno() const {
+  int fd = FMT_POSIX_CALL(fileno FMT_ARGS(file_));
+  if (fd == -1)
+    throw SystemError(errno, "cannot get file descriptor");
+  return fd;
+}
+
+fmt::File::File(fmt::StringRef path, int oflag) {
+  int mode = S_IRUSR | S_IWUSR;
+#if defined(_WIN32) && !defined(__MINGW32__)
+  fd_ = -1;
+  FMT_POSIX_CALL(sopen_s(&fd_, path.c_str(), oflag, _SH_DENYNO, mode));
+#else
+  FMT_RETRY(fd_, FMT_POSIX_CALL(open(path.c_str(), oflag, mode)));
+#endif
+  if (fd_ == -1)
+    throw SystemError(errno, "cannot open file {}", path);
+}
+
+fmt::File::~File() FMT_NOEXCEPT {
+  // Don't retry close in case of EINTR!
+  // See http://linux.derkeiler.com/Mailing-Lists/Kernel/2005-09/3000.html
+  if (fd_ != -1 && FMT_POSIX_CALL(close(fd_)) != 0)
+    fmt::report_system_error(errno, "cannot close file");
+}
+
+void fmt::File::close() {
+  if (fd_ == -1)
+    return;
+  // Don't retry close in case of EINTR!
+  // See http://linux.derkeiler.com/Mailing-Lists/Kernel/2005-09/3000.html
+  int result = FMT_POSIX_CALL(close(fd_));
+  fd_ = -1;
+  if (result != 0)
+    throw SystemError(errno, "cannot close file");
+}
+
+fmt::LongLong fmt::File::size() const {
+#ifdef _WIN32
+  // Use GetFileSize instead of GetFileSizeEx for the case when _WIN32_WINNT
+  // is less than 0x0500 as is the case with some default MinGW builds.
+  // Both functions support large file sizes.
+  DWORD size_upper = 0;
+  HANDLE handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd_));
+  DWORD size_lower = FMT_SYSTEM(GetFileSize(handle, &size_upper));
+  if (size_lower == INVALID_FILE_SIZE) {
+    DWORD error = GetLastError();
+    if (error != NO_ERROR)
+      throw WindowsError(GetLastError(), "cannot get file size");
+  }
+  fmt::ULongLong size = size_upper;
+  return (size << sizeof(DWORD) * CHAR_BIT) | size_lower;
+#else
+  typedef struct stat Stat;
+  Stat file_stat = Stat();
+  if (FMT_POSIX_CALL(fstat(fd_, &file_stat)) == -1)
+    throw SystemError(errno, "cannot get file attributes");
+  FMT_STATIC_ASSERT(sizeof(fmt::LongLong) >= sizeof(file_stat.st_size),
+      "return type of File::size is not large enough");
+  return file_stat.st_size;
+#endif
+}
+
+std::size_t fmt::File::read(void *buffer, std::size_t count) {
+  RWResult result = 0;
+  FMT_RETRY(result, FMT_POSIX_CALL(read(fd_, buffer, convert_rwcount(count))));
+  if (result < 0)
+    throw SystemError(errno, "cannot read from file");
+  return result;
+}
+
+std::size_t fmt::File::write(const void *buffer, std::size_t count) {
+  RWResult result = 0;
+  FMT_RETRY(result, FMT_POSIX_CALL(write(fd_, buffer, convert_rwcount(count))));
+  if (result < 0)
+    throw SystemError(errno, "cannot write to file");
+  return result;
+}
+
+fmt::File fmt::File::dup(int fd) {
+  // Don't retry as dup doesn't return EINTR.
+  // http://pubs.opengroup.org/onlinepubs/009695399/functions/dup.html
+  int new_fd = FMT_POSIX_CALL(dup(fd));
+  if (new_fd == -1)
+    throw SystemError(errno, "cannot duplicate file descriptor {}", fd);
+  return File(new_fd);
+}
+
+void fmt::File::dup2(int fd) {
+  int result = 0;
+  FMT_RETRY(result, FMT_POSIX_CALL(dup2(fd_, fd)));
+  if (result == -1) {
+    throw SystemError(errno,
+      "cannot duplicate file descriptor {} to {}", fd_, fd);
+  }
+}
+
+void fmt::File::dup2(int fd, ErrorCode &ec) FMT_NOEXCEPT {
+  int result = 0;
+  FMT_RETRY(result, FMT_POSIX_CALL(dup2(fd_, fd)));
+  if (result == -1)
+    ec = ErrorCode(errno);
+}
+
+void fmt::File::pipe(File &read_end, File &write_end) {
+  // Close the descriptors first to make sure that assignments don't throw
+  // and there are no leaks.
+  read_end.close();
+  write_end.close();
+  int fds[2] = {};
+#ifdef _WIN32
+  // Make the default pipe capacity same as on Linux 2.6.11+.
+  enum { DEFAULT_CAPACITY = 65536 };
+  int result = FMT_POSIX_CALL(pipe(fds, DEFAULT_CAPACITY, _O_BINARY));
+#else
+  // Don't retry as the pipe function doesn't return EINTR.
+  // http://pubs.opengroup.org/onlinepubs/009696799/functions/pipe.html
+  int result = FMT_POSIX_CALL(pipe(fds));
+#endif
+  if (result != 0)
+    throw SystemError(errno, "cannot create pipe");
+  // The following assignments don't throw because read_fd and write_fd
+  // are closed.
+  read_end = File(fds[0]);
+  write_end = File(fds[1]);
+}
+
+fmt::BufferedFile fmt::File::fdopen(const char *mode) {
+  // Don't retry as fdopen doesn't return EINTR.
+  FILE *f = FMT_POSIX_CALL(fdopen(fd_, mode));
+  if (!f)
+    throw SystemError(errno, "cannot associate stream with file descriptor");
+  BufferedFile file(f);
+  fd_ = -1;
+  return file;
+}
+
+long fmt::getpagesize() {
+#ifdef _WIN32
+  SYSTEM_INFO si;
+  GetSystemInfo(&si);
+  return si.dwPageSize;
+#else
+  long size = FMT_POSIX_CALL(sysconf(_SC_PAGESIZE));
+  if (size < 0)
+    throw SystemError(errno, "cannot get memory page size");
+  return size;
+#endif
+}
diff --git a/src/tags b/src/tags
new file mode 100644
index 0000000..16335a8
--- /dev/null
+++ b/src/tags
@@ -0,0 +1,429 @@
+Alignment	SalmonQuantify.cpp	126
+Alignment::Alignment	SalmonQuantify.cpp	/^        Alignment(TranscriptID transcriptIDIn, uin/
+Alignment::kmerCount	SalmonQuantify.cpp	132
+Alignment::logProb	SalmonQuantify.cpp	133
+Alignment::transcriptID	SalmonQuantify.cpp	/^        inline TranscriptID transcriptID() { retur/
+Alignment::transcriptID_	SalmonQuantify.cpp	136
+AlignmentBatch	TestBAMParser.cpp	54
+BamTools::BamAlignment	MultithreadedBAMParser.cpp	39
+BamTools::BamReader	MultithreadedBAMParser.cpp	38
+ClusterForest	TestBAMParser.cpp	163
+ClusterForest::ClusterForest	TestBAMParser.cpp	/^    ClusterForest(size_t numTranscripts, std::vect/
+ClusterForest::boost::disjoint_sets	TestBAMParser.cpp	255
+ClusterForest::clusterMutex_	TestBAMParser.cpp	257
+ClusterForest::clusters_	TestBAMParser.cpp	256
+ClusterForest::disjointSets_	TestBAMParser.cpp	255
+ClusterForest::getClusters	TestBAMParser.cpp	/^    std::vector<TranscriptCluster*> getClusters() /
+ClusterForest::mergeClusters	TestBAMParser.cpp	/^    void mergeClusters(typename AlignmentBatch<Fra/
+ClusterForest::parent_	TestBAMParser.cpp	254
+ClusterForest::rank_	TestBAMParser.cpp	253
+ClusterForest::updateCluster	TestBAMParser.cpp	/^    void updateCluster(size_t memberTranscript, si/
+ContainingTranscript	BuildLUT.cpp	74
+ContainingTranscript::kmerID	BuildLUT.cpp	75
+ContainingTranscript::transcriptID	BuildLUT.cpp	76
+DEFAULT_BOOTSTRAP	PerformBiasCorrection.cpp	62
+DEFAULT_BOOTSTRAP	PerformBiasCorrection_old.cpp	57
+DEFAULT_COMPUTE_IMPORTANCE	PerformBiasCorrection.cpp	63
+DEFAULT_COMPUTE_IMPORTANCE	PerformBiasCorrection_old.cpp	58
+DEFAULT_LEARN_RATE	PerformBiasCorrection.cpp	59
+DEFAULT_LEARN_RATE	PerformBiasCorrection_old.cpp	54
+DEFAULT_LOSS	PerformBiasCorrection.cpp	58
+DEFAULT_LOSS	PerformBiasCorrection_old.cpp	53
+DEFAULT_MAX_DEPTH	PerformBiasCorrection.cpp	55
+DEFAULT_MAX_DEPTH	PerformBiasCorrection_old.cpp	50
+DEFAULT_MAX_FEATURES_RATIO	PerformBiasCorrection.cpp	53
+DEFAULT_MAX_FEATURES_RATIO	PerformBiasCorrection_old.cpp	48
+DEFAULT_MIN_SAMPLE_LEAF	PerformBiasCorrection.cpp	54
+DEFAULT_MIN_SAMPLE_LEAF	PerformBiasCorrection_old.cpp	49
+DEFAULT_N_JOBS	PerformBiasCorrection.cpp	52
+DEFAULT_N_JOBS	PerformBiasCorrection_old.cpp	47
+DEFAULT_N_TREES	PerformBiasCorrection.cpp	51
+DEFAULT_N_TREES	PerformBiasCorrection_old.cpp	46
+DEFAULT_OOB	PerformBiasCorrection.cpp	60
+DEFAULT_OOB	PerformBiasCorrection_old.cpp	55
+DEFAULT_SPLIT_CRITERION	PerformBiasCorrection.cpp	57
+DEFAULT_SPLIT_CRITERION	PerformBiasCorrection_old.cpp	52
+DEFAULT_SUBSAMPLE	PerformBiasCorrection.cpp	56
+DEFAULT_SUBSAMPLE	PerformBiasCorrection_old.cpp	51
+DEFAULT_VERBOSE	PerformBiasCorrection.cpp	61
+DEFAULT_VERBOSE	PerformBiasCorrection_old.cpp	56
+ErrorModel::ErrorModel	ErrorModel.cpp	/^ErrorModel::ErrorModel(double alpha) :$/
+ErrorModel::burnedIn	ErrorModel.cpp	/^bool ErrorModel::burnedIn() { return burnedIn_; }$/
+ErrorModel::burnedIn	ErrorModel.cpp	/^void ErrorModel::burnedIn(bool burnedIn) { burnedI/
+ErrorModel::logLikelihood	ErrorModel.cpp	/^double ErrorModel::logLikelihood(bam1_t* read, Tra/
+ErrorModel::logLikelihood	ErrorModel.cpp	/^double ErrorModel::logLikelihood(const ReadPair& h/
+ErrorModel::logLikelihood	ErrorModel.cpp	/^double ErrorModel::logLikelihood(const UnpairedRea/
+ErrorModel::update	ErrorModel.cpp	/^void ErrorModel::update(const UnpairedRead& hit, T/
+ErrorModel::update	ErrorModel.cpp	/^void ErrorModel::update(bam1_t* read, Transcript& /
+ErrorModel::update	ErrorModel.cpp	/^void ErrorModel::update(const ReadPair& hit, Trans/
+ExpressionRecord	SailfishUtils.cpp	544
+ExpressionRecord::ExpressionRecord	SailfishUtils.cpp	/^        ExpressionRecord() : name(""), length(0), /
+ExpressionRecord::length	SailfishUtils.cpp	548
+ExpressionRecord::name	SailfishUtils.cpp	547
+ExpressionRecord::quants	SailfishUtils.cpp	549
+ExpressionResults	PerformBiasCorrection.cpp	104
+ExpressionResults	PerformBiasCorrection_old.cpp	101
+ExpressionResults::comments	PerformBiasCorrection.cpp	105
+ExpressionResults::comments	PerformBiasCorrection_old.cpp	102
+ExpressionResults::expressions	PerformBiasCorrection.cpp	106
+ExpressionResults::expressions	PerformBiasCorrection_old.cpp	103
+ExpressionResults::std::unordered_map	PerformBiasCorrection.cpp	106
+ExpressionResults::std::unordered_map	PerformBiasCorrection_old.cpp	103
+FASTAParser::populateTargets	FASTAParser.cpp	/^void FASTAParser::populateTargets(std::vector<Tran/
+FragmentLengthDistribution::FragmentLengthDistribution	FragmentLengthDistribution.cpp	/^FragmentLengthDistribution::FragmentLengthDistribu/
+FragmentLengthDistribution::addVal	FragmentLengthDistribution.cpp	/^void FragmentLengthDistribution::addVal(size_t len/
+FragmentLengthDistribution::cmf	FragmentLengthDistribution.cpp	/^double FragmentLengthDistribution::cmf(size_t len)/
+FragmentLengthDistribution::cmf	FragmentLengthDistribution.cpp	/^vector<double> FragmentLengthDistribution::cmf() c/
+FragmentLengthDistribution::maxVal	FragmentLengthDistribution.cpp	/^size_t FragmentLengthDistribution::maxVal() const /
+FragmentLengthDistribution::mean	FragmentLengthDistribution.cpp	/^double FragmentLengthDistribution::mean() const {$/
+FragmentLengthDistribution::minVal	FragmentLengthDistribution.cpp	/^size_t FragmentLengthDistribution::minVal() const /
+FragmentLengthDistribution::pmf	FragmentLengthDistribution.cpp	/^double FragmentLengthDistribution::pmf(size_t len)/
+FragmentLengthDistribution::totMass	FragmentLengthDistribution.cpp	/^double FragmentLengthDistribution::totMass() const/
+FragmentList::FragmentList	FragmentList.cpp	/^FragmentList::FragmentList(){$/
+FragmentList::addFragMatch	FragmentList.cpp	/^    void FragmentList::addFragMatch(uint32_t refSt/
+FragmentList::addFragMatch	FragmentList.cpp	/^    void FragmentList::addFragMatch(uint32_t refSt/
+FragmentList::addFragMatchRC	FragmentList.cpp	/^    void FragmentList::addFragMatchRC(uint32_t ref/
+FragmentList::addFragMatchRC	FragmentList.cpp	/^    void FragmentList::addFragMatchRC(uint32_t ref/
+FragmentList::computeBestChain	FragmentList.cpp	/^    void FragmentList::computeBestChain() {$/
+FragmentList::computeBestChain_	FragmentList.cpp	/^void FragmentList::computeBestChain_(Container* fr/
+FragmentList::freeFragList	FragmentList.cpp	/^void FragmentList::freeFragList(Container* frags) /
+FragmentList::~FragmentList	FragmentList.cpp	/^FragmentList::~FragmentList() {$/
+HeptamerIndex::HeptamerIndex	HeptamerIndex.cpp	/^HeptamerIndex::HeptamerIndex() :$/
+HeptamerIndex::incHeptamer	HeptamerIndex.cpp	/^HeptamerIndex::incHeptamer(uint64_t heptamer) {$/
+HeptamerIndex::index	HeptamerIndex.cpp	/^std::size_t HeptamerIndex::index(uint64_t heptamer/
+IndexPair	PartitionRefiner.cpp	84
+IndexPair::newIndex	PartitionRefiner.cpp	84
+IndexPair::oldIndex	PartitionRefiner.cpp	84
+K	cokus.cpp	59
+KSEQ_INIT	FASTAParser.cpp	/^KSEQ_INIT(int, read)$/
+Kmer	ComputeBiasFeatures.cpp	46
+Kmer	PerformBiasCorrection.cpp	66
+Kmer	PerformBiasCorrection_old.cpp	61
+KmerID	BuildLUT.cpp	69
+KmerIDMap	AnalyzeKmerTradeoff.cpp	59
+KmerIDMap	SalmonQuantify.cpp	113
+KmerVote	SalmonQuantify.cpp	289
+KmerVote::KmerVote	SalmonQuantify.cpp	/^        KmerVote(uint32_t vp, uint32_t rp, uint32_/
+KmerVote::readPos	SalmonQuantify.cpp	293
+KmerVote::voteLen	SalmonQuantify.cpp	294
+KmerVote::votePos	SalmonQuantify.cpp	292
+LIN	FragmentList.cpp	21
+LUTTools	LookUpTableUtils.cpp	47
+LUTTools::dumpKmerEquivClasses	LookUpTableUtils.cpp	/^void dumpKmerEquivClasses($/
+Length	BuildLUT.cpp	71
+LibraryFormat::LibraryFormat	LibraryFormat.cpp	/^LibraryFormat::LibraryFormat(ReadType type_in, Rea/
+LibraryFormat::check	LibraryFormat.cpp	/^bool LibraryFormat::check() {$/
+M	cokus.cpp	58
+MAnalyzeKmerTradeoff.cpp	AnalyzeKmerTradeoff.cpp	/^int main(int argc, char* argv[]) {$/
+MBiasCorrectionDriver.cpp	BiasCorrectionDriver.cpp	/^int main(int argc, char* argv[]) {$/
+MBuildTranscriptMap.cpp	BuildTranscriptMap.cpp	/^int main(int argc, char* argv[] ) {$/
+MPerformBiasCorrection_old.cpp	PerformBiasCorrection_old.cpp	/^int main(int argc, char* argv[]) {$/
+MSailfish.cpp	Sailfish.cpp	/^int main( int argc, char* argv[] ) {$/
+MSalmon.cpp	Salmon.cpp	/^int main( int argc, char* argv[] ) {$/
+MTestBAMParser.cpp	TestBAMParser.cpp	/^int main(int argc, char* argv[]) {$/
+MTestHeptamerIndex.cpp	TestHeptamerIndex.cpp	/^int main(int argc, char* argv[]) {$/
+MTestUtils.cpp	TestUtils.cpp	/^int main(int argc, char* argv[]) {$/
+MatchFragment	SalmonQuantify.cpp	297
+MatchFragment::MatchFragment	SalmonQuantify.cpp	/^        MatchFragment(uint32_t refStart_, uint32_t/
+MatchFragment::length	SalmonQuantify.cpp	302
+MatchFragment::queryStart	SalmonQuantify.cpp	302
+MatchFragment::refStart	SalmonQuantify.cpp	302
+MatchFragment::score	SalmonQuantify.cpp	304
+MatchFragment::weight	SalmonQuantify.cpp	303
+MerDirection	IndexedCounter.cpp	67
+MerDirection::BOTH	IndexedCounter.cpp	67
+MerDirection::FORWARD	IndexedCounter.cpp	67
+MerDirection::REVERSE	IndexedCounter.cpp	67
+MerIteratorType	JellyfishMerCounter.cpp	151
+MiniBatchQueue	TestBAMParser.cpp	57
+MultithreadedBAMParser::MultithreadedBAMParser	MultithreadedBAMParser.cpp	/^MultithreadedBAMParser::MultithreadedBAMParser(std/
+MultithreadedBAMParser::finishedWithAlignment	MultithreadedBAMParser.cpp	/^void MultithreadedBAMParser::finishedWithAlignment/
+MultithreadedBAMParser::nextAlignment	MultithreadedBAMParser.cpp	/^bool MultithreadedBAMParser::nextAlignment(BamAlig/
+MultithreadedBAMParser::start	MultithreadedBAMParser.cpp	/^bool MultithreadedBAMParser::start() {$/
+MultithreadedBAMParser::~MultithreadedBAMParser	MultithreadedBAMParser.cpp	/^MultithreadedBAMParser::~MultithreadedBAMParser() /
+N	cokus.cpp	57
+NODEBUG	PCA.cpp	23
+OPERATION	JellyfishMerCounter.cpp	150
+OPERATION::COUNT	JellyfishMerCounter.cpp	150
+OPERATION::PRIME	JellyfishMerCounter.cpp	150
+OPERATION::UPDATE	JellyfishMerCounter.cpp	150
+ParserT	SalmonQuantify.cpp	702
+PartitionRefiner::PartitionRefiner	PartitionRefiner.cpp	/^PartitionRefiner::PartitionRefiner(LUTTools::KmerI/
+PartitionRefiner::partitionMembership	PartitionRefiner.cpp	/^const std::vector<LUTTools::KmerID>& PartitionRefi/
+PartitionRefiner::relabel	PartitionRefiner.cpp	/^void PartitionRefiner::relabel() {$/
+PartitionRefiner::splitWith	PartitionRefiner.cpp	/^void PartitionRefiner::splitWith(std::vector<LUTTo/
+PathIterator	JellyfishMerCounter.cpp	151
+Pca::Calculate	PCA.cpp	/^int Pca::Calculate(vector<float> &x,$/
+Pca::Pca	PCA.cpp	/^Pca::Pca(void) {$/
+Pca::cum_prop	PCA.cpp	/^std::vector<float> Pca::cum_prop(void) { return _c/
+Pca::eliminated_columns	PCA.cpp	/^std::vector<unsigned int> Pca::eliminated_columns(/
+Pca::is_center	PCA.cpp	/^bool Pca::is_center(void) { return _is_center; }$/
+Pca::is_scale	PCA.cpp	/^bool Pca::is_scale(void) {  return _is_scale; }$/
+Pca::kaiser	PCA.cpp	/^unsigned int Pca::kaiser(void) { return _kaiser; }/
+Pca::method	PCA.cpp	/^string Pca::method(void) { return _method; }$/
+Pca::ncols	PCA.cpp	/^unsigned int Pca::ncols(void) { return _ncols; }$/
+Pca::nrows	PCA.cpp	/^unsigned int Pca::nrows(void) { return _nrows; }$/
+Pca::prop_of_var	PCA.cpp	/^std::vector<float> Pca::prop_of_var(void) {return /
+Pca::scores	PCA.cpp	/^std::vector<float> Pca::scores(void) { return _sco/
+Pca::sd	PCA.cpp	/^std::vector<float> Pca::sd(void) { return _sd; };$/
+Pca::thresh95	PCA.cpp	/^unsigned int Pca::thresh95(void) { return _thresh9/
+Pca::~Pca	PCA.cpp	/^Pca::~Pca(void) {$/
+PosteriorAbundanceVector	TestBAMParser.cpp	60
+PriorAbundanceVector	TestBAMParser.cpp	59
+ReadLength	BuildLUT.cpp	70
+RefSeq	TestBAMParser.cpp	62
+RefSeq::RefLength	TestBAMParser.cpp	65
+RefSeq::RefName	TestBAMParser.cpp	64
+RefSeq::RefSeq	TestBAMParser.cpp	/^    RefSeq(char* name, uint32_t len) : RefName(nam/
+SOP	FragmentList.cpp	20
+Sailfish::TranscriptFeatures	ComputeBiasFeatures.cpp	47
+Sailfish::TranscriptFeatures	PerformBiasCorrection.cpp	67
+StrandHasher	IndexedCounter.cpp	431
+StrandHasher::operator()	IndexedCounter.cpp	/^              size_t operator()(const ReadStranded/
+StreamingReadParser::StreamingReadParser	StreamingSequenceParser.cpp	/^StreamingReadParser::StreamingReadParser( std::vec/
+StreamingReadParser::finishedWithRead	StreamingSequenceParser.cpp	/^void StreamingReadParser::finishedWithRead(ReadSeq/
+StreamingReadParser::nextRead	StreamingSequenceParser.cpp	/^bool StreamingReadParser::nextRead(ReadSeq*& seq) /
+StreamingReadParser::start	StreamingSequenceParser.cpp	/^bool StreamingReadParser::start() {$/
+StreamingReadParser::~StreamingReadParser	StreamingSequenceParser.cpp	/^StreamingReadParser::~StreamingReadParser() {$/
+TranscriptCluster	TestBAMParser.cpp	76
+TranscriptCluster::TranscriptCluster	TestBAMParser.cpp	/^    TranscriptCluster() : members_(std::list<size_/
+TranscriptCluster::TranscriptCluster	TestBAMParser.cpp	/^    TranscriptCluster(size_t initialMember) : memb/
+TranscriptCluster::active_	TestBAMParser.cpp	/^    TranscriptCluster() : members_(std::list<size_/
+TranscriptCluster::active_	TestBAMParser.cpp	/^                                              logM/
+TranscriptCluster::active_	TestBAMParser.cpp	159
+TranscriptCluster::addMass	TestBAMParser.cpp	/^    void addMass(double logNewMass) { logMass_ = l/
+TranscriptCluster::count_	TestBAMParser.cpp	157
+TranscriptCluster::deactivate	TestBAMParser.cpp	/^    void deactivate() { active_ = false; }$/
+TranscriptCluster::incrementCount	TestBAMParser.cpp	/^    void incrementCount(size_t num) { count_ += nu/
+TranscriptCluster::isActive	TestBAMParser.cpp	/^    bool isActive() { return active_; }$/
+TranscriptCluster::logMass	TestBAMParser.cpp	/^    double logMass() { return logMass_; }$/
+TranscriptCluster::logMass_	TestBAMParser.cpp	/^    TranscriptCluster() : members_(std::list<size_/
+TranscriptCluster::logMass_	TestBAMParser.cpp	/^                                              logM/
+TranscriptCluster::logMass_	TestBAMParser.cpp	158
+TranscriptCluster::members	TestBAMParser.cpp	/^    std::list<size_t>& members() { return members_/
+TranscriptCluster::members_	TestBAMParser.cpp	156
+TranscriptCluster::merge	TestBAMParser.cpp	/^    void merge(TranscriptCluster& other) {$/
+TranscriptCluster::numHits	TestBAMParser.cpp	/^    size_t numHits() { return count_.load(); }$/
+TranscriptCluster::projectToPolytope	TestBAMParser.cpp	/^    void projectToPolytope(std::vector<Transcript>/
+TranscriptFeatures	PerformBiasCorrection_old.cpp	63
+TranscriptFeatures::diNucleotides	PerformBiasCorrection_old.cpp	67
+TranscriptFeatures::gcContent	PerformBiasCorrection_old.cpp	66
+TranscriptFeatures::length	PerformBiasCorrection_old.cpp	65
+TranscriptFeatures::name	PerformBiasCorrection_old.cpp	64
+TranscriptFeatures::std::array	PerformBiasCorrection_old.cpp	67
+TranscriptHitList	SalmonQuantify.cpp	313
+TranscriptHitList::VoteInfo	SalmonQuantify.cpp	337
+TranscriptHitList::VoteInfo::coverage	SalmonQuantify.cpp	338
+TranscriptHitList::VoteInfo::rightmostBase	SalmonQuantify.cpp	339
+TranscriptHitList::addFragMatch	SalmonQuantify.cpp	/^        void addFragMatch(uint32_t tpos, uint32_t /
+TranscriptHitList::addFragMatchRC	SalmonQuantify.cpp	/^        void addFragMatchRC(uint32_t tpos, uint32_/
+TranscriptHitList::bestHitPos	SalmonQuantify.cpp	315
+TranscriptHitList::bestHitScore	SalmonQuantify.cpp	316
+TranscriptHitList::computeBestChain	SalmonQuantify.cpp	/^        bool computeBestChain() {$/
+TranscriptHitList::computeBestLoc_	SalmonQuantify.cpp	/^        void computeBestLoc_(std::vector<KmerVote>/
+TranscriptHitList::rcVotes	SalmonQuantify.cpp	319
+TranscriptHitList::targetID	SalmonQuantify.cpp	321
+TranscriptHitList::totalNumHits	SalmonQuantify.cpp	/^        uint32_t totalNumHits() { return std::max(/
+TranscriptHitList::votes	SalmonQuantify.cpp	318
+TranscriptID	AnalyzeKmerTradeoff.cpp	57
+TranscriptID	BuildLUT.cpp	68
+TranscriptID	SalmonQuantify.cpp	111
+TranscriptIDVector	AnalyzeKmerTradeoff.cpp	58
+TranscriptIDVector	SalmonQuantify.cpp	112
+TranscriptKey	SailfishUtils.cpp	336
+TranscriptKey::DYNAMIC	SailfishUtils.cpp	336
+TranscriptKey::GENE_ID	SailfishUtils.cpp	336
+TranscriptKey::GENE_NAME	SailfishUtils.cpp	336
+TranscriptKeyPair	SailfishUtils.cpp	327
+TranscriptKeyPair::TranscriptKeyPair	SailfishUtils.cpp	/^        TranscriptKeyPair(const char* t, const cha/
+TranscriptKeyPair::key	SailfishUtils.cpp	329
+TranscriptKeyPair::transcript_id	SailfishUtils.cpp	328
+TranscriptList	BuildLUT.cpp	72
+TranscriptResult	PerformBiasCorrection.cpp	95
+TranscriptResult	PerformBiasCorrection_old.cpp	95
+TranscriptResult::approxCount	PerformBiasCorrection.cpp	101
+TranscriptResult::approxKmerCount	PerformBiasCorrection.cpp	100
+TranscriptResult::kpkm	PerformBiasCorrection.cpp	99
+TranscriptResult::length	PerformBiasCorrection.cpp	96
+TranscriptResult::length	PerformBiasCorrection_old.cpp	96
+TranscriptResult::rpkm	PerformBiasCorrection.cpp	98
+TranscriptResult::rpkm	PerformBiasCorrection_old.cpp	98
+TranscriptResult::tpm	PerformBiasCorrection.cpp	97
+TranscriptResult::tpm	PerformBiasCorrection_old.cpp	97
+VersionChecker::VersionChecker	VersionChecker.cpp	/^VersionChecker::VersionChecker(boost::asio::io_ser/
+VersionChecker::cancel_upgrade_check	VersionChecker.cpp	/^void VersionChecker::cancel_upgrade_check(const bo/
+VersionChecker::handle_connect	VersionChecker.cpp	/^void VersionChecker::handle_connect(const boost::s/
+VersionChecker::handle_read_content	VersionChecker.cpp	/^void VersionChecker::handle_read_content(const boo/
+VersionChecker::handle_read_headers	VersionChecker.cpp	/^void VersionChecker::handle_read_headers(const boo/
+VersionChecker::handle_read_status_line	VersionChecker.cpp	/^void VersionChecker::handle_read_status_line(const/
+VersionChecker::handle_resolve	VersionChecker.cpp	/^void VersionChecker::handle_resolve(const boost::s/
+VersionChecker::handle_write_request	VersionChecker.cpp	/^void VersionChecker::handle_write_request(const bo/
+VersionChecker::message	VersionChecker.cpp	/^std::string VersionChecker::message() {$/
+_GNU_SOURCE	IndexedCounter.cpp	48
+aggregateEstimatesToGeneLevel	SailfishUtils.cpp	/^void aggregateEstimatesToGeneLevel(TranscriptGeneM/
+args	JellyfishMerCounter.cpp	73
+as_seconds	JellyfishMerCounter.cpp	/^inline double as_seconds(DtnType dtn) { return dur/
+assert_file_good	PCAUtils.cpp	/^void assert_file_good(const bool& is_file_good, co/
+basesCovered	SalmonQuantify.cpp	/^uint32_t basesCovered(std::vector<uint32_t>& kmerH/
+basesCovered	SalmonQuantify.cpp	/^uint32_t basesCovered(std::vector<uint32_t>& posLe/
+boost::asio::ip::tcp	VersionChecker.cpp	14
+buildLUTs	BuildLUT.cpp	/^int buildLUTs($/
+buildPerfectHashIndex	PerfectHashIndexer.cpp	/^void buildPerfectHashIndex(bool canonical, std::ve/
+buildTLUTIndex	LookUpTableUtils.cpp	/^std::vector<Offset> buildTLUTIndex(const std::stri/
+bwa_pg	SalmonQuantify.cpp	105
+computeBiasFeatures	ComputeBiasFeatures.cpp	/^int computeBiasFeatures($/
+computeBiasFeaturesHelper	ComputeBiasFeatures.cpp	/^bool computeBiasFeaturesHelper(ParserT& parser,$/
+compute_column_means	PCAUtils.cpp	/^arma::Col<double> compute_column_means(const arma:/
+compute_column_rms	PCAUtils.cpp	/^arma::Col<double> compute_column_rms(const arma::M/
+countKmers	IndexedCounter.cpp	/^bool countKmers(std::vector<std::string> readFiles/
+dumpKmerLUT	LookUpTableUtils.cpp	/^void dumpKmerLUT($/
+eigen_pair	PCA.cpp	198
+encode	SailfishUtils.cpp	/^uint64_t encode(uint64_t tid, uint64_t offset) {$/
+enforce_positive_sign_by_column	PCAUtils.cpp	/^void enforce_positive_sign_by_column(arma::Mat<dou/
+extractReadLibraries	SailfishUtils.cpp	/^std::vector<ReadLibrary> extractReadLibraries(boos/
+extract_column_vector	PCAUtils.cpp	/^std::vector<double> extract_column_vector(const ar/
+extract_row_vector	PCAUtils.cpp	/^std::vector<double> extract_row_vector(const arma:/
+fifoRead	StreamingSequenceParser.cpp	/^int fifoRead(int f, void* buf, int n) {$/
+file_vector	JellyfishMerCounter.cpp	85
+file_vector::const_iterator	JellyfishMerCounter.cpp	209
+file_vector::const_iterator	JellyfishMerCounter.cpp	210
+filter	JellyfishMerCounter.cpp	116
+filter::and_res	JellyfishMerCounter.cpp	/^  bool and_res(bool r, const mer_dna& x) const {$/
+filter::filter	JellyfishMerCounter.cpp	/^  filter(filter* prev = 0) : prev_(prev) { }$/
+filter::operator()	JellyfishMerCounter.cpp	/^  virtual bool operator()(const mer_dna& x) { retu/
+filter::prev_	JellyfishMerCounter.cpp	117
+filter::~filter	JellyfishMerCounter.cpp	/^  virtual ~filter() { }$/
+filter_bc	JellyfishMerCounter.cpp	126
+filter_bc::counter_	JellyfishMerCounter.cpp	127
+filter_bc::filter_bc	JellyfishMerCounter.cpp	/^  filter_bc(const mer_dna_bloom_counter& counter, /
+filter_bc::operator()	JellyfishMerCounter.cpp	/^  bool operator()(const mer_dna& m) {$/
+filter_bf	JellyfishMerCounter.cpp	138
+filter_bf::bf_	JellyfishMerCounter.cpp	139
+filter_bf::filter_bf	JellyfishMerCounter.cpp	/^  filter_bf(mer_dna_bloom_filter& bf, filter* prev/
+filter_bf::operator()	JellyfishMerCounter.cpp	/^  bool operator()(const mer_dna& m) {$/
+futureIsReady	IndexedCounter.cpp	/^bool futureIsReady(std::future<R>& f) {$/
+getHitsForFragment	SalmonQuantify.cpp	/^void getHitsForFragment(std::pair<header_sequence_/
+getHitsForFragment	SalmonQuantify.cpp	/^void getHitsForFragment(jellyfish::header_sequence/
+getVersionMessage	VersionChecker.cpp	/^std::string getVersionMessage() {$/
+get_mean	PCAUtils.cpp	/^double get_mean(const std::vector<double>& iter) {/
+get_sigma	PCAUtils.cpp	/^double get_sigma(const std::vector<double>& iter) /
+help	Sailfish.cpp	/^int help(int argc, char* argv[]) {$/
+help	Salmon.cpp	/^int help(int argc, char* argv[]) {$/
+hiBit	cokus.cpp	/^#define hiBit(u)       ((u) & 0x80000000U)   \/\/ ma/
+jellyfish::mer_dna	JellyfishMerCounter.cpp	82
+jellyfish::mer_dna_bloom_counter	JellyfishMerCounter.cpp	83
+jellyfish::mer_dna_bloom_filter	JellyfishMerCounter.cpp	84
+jellyfish_count_main	JellyfishMerCounter.cpp	/^int jellyfish_count_main(int argc, char *argv[])$/
+left	cokus.cpp	67
+loBit	cokus.cpp	/^#define loBit(u)       ((u) & 0x00000001U)   \/\/ ma/
+loBits	cokus.cpp	/^#define loBits(u)      ((u) & 0x7FFFFFFFU)   \/\/ ma/
+load_bloom_filter	JellyfishMerCounter.cpp	/^mer_dna_bloom_counter* load_bloom_filter(const cha/
+mainBuildLUT	BuildLUT.cpp	/^int mainBuildLUT(int argc, char* argv[] ) {$/
+mainCount	IndexedCounter.cpp	/^int mainCount( uint32_t numThreads,$/
+mainIndex	PerfectHashIndexer.cpp	/^int mainIndex( int argc, char *argv[] ) {$/
+mainQuantify	QuantificationDriver.cpp	/^int mainQuantify( int argc, char *argv[] ) {$/
+mainSailfish	Sailfish.cpp	/^int mainSailfish(int argc, char* argv[]) {$/
+make_shuffled_matrix	PCAUtils.cpp	/^arma::Mat<double> make_shuffled_matrix(const arma:/
+manager_pid	JellyfishMerCounter.cpp	230
+mer_counter	JellyfishMerCounter.cpp	209
+mer_counter_base	JellyfishMerCounter.cpp	152
+mer_counter_base::ary_	JellyfishMerCounter.cpp	154
+mer_counter_base::filter_	JellyfishMerCounter.cpp	157
+mer_counter_base::mer_counter_base	JellyfishMerCounter.cpp	/^  mer_counter_base(int nb_threads, mer_hash& ary,$/
+mer_counter_base::nb_threads_	JellyfishMerCounter.cpp	153
+mer_counter_base::op_	JellyfishMerCounter.cpp	158
+mer_counter_base::parser_	JellyfishMerCounter.cpp	156
+mer_counter_base::start	JellyfishMerCounter.cpp	/^  virtual void start(int thid) {$/
+mer_counter_base::streams_	JellyfishMerCounter.cpp	155
+mer_iterator	JellyfishMerCounter.cpp	89
+mer_iterator	JellyfishMerCounter.cpp	209
+mer_qual_counter	JellyfishMerCounter.cpp	210
+mer_qual_iterator	JellyfishMerCounter.cpp	105
+mer_qual_iterator	JellyfishMerCounter.cpp	210
+mer_qual_iterator::mer_qual_iterator	JellyfishMerCounter.cpp	/^  mer_qual_iterator(sequence_qual_parser& parser, /
+mer_qual_iterator::sequence_qual_parser	JellyfishMerCounter.cpp	106
+mer_qual_iterator::super	JellyfishMerCounter.cpp	106
+mixBits	cokus.cpp	/^#define mixBits(u, v)  (hiBit(u)|loBits(v))  \/\/ mo/
+mpdec	PerformBiasCorrection.cpp	68
+mute	FragmentList.cpp	23
+my_mer	BuildSalmonIndex.cpp	58
+my_mer	SalmonQuantify.cpp	114
+next	cokus.cpp	66
+normalize_by_column	PCAUtils.cpp	/^void normalize_by_column(arma::Mat<double>& data, /
+numberOfReadsInFastaFile	SailfishUtils.cpp	/^size_t numberOfReadsInFastaFile(const std::string&/
+offset	SailfishUtils.cpp	/^uint32_t offset(uint64_t enc) {$/
+offset	SalmonQuantify.cpp	/^uint32_t offset(uint64_t enc) {$/
+operator<<	GenomicFeature.cpp	/^std::ostream& operator<< ( std::ostream& out, cons/
+operator<<	GenomicFeature.cpp	/^std::ostream& operator<< ( std::ostream& out, cons/
+operator<<	LibraryFormat.cpp	/^std::ostream& operator<<(std::ostream& os, Library/
+operator>>	GenomicFeature.cpp	/^std::istream& operator>> ( std::istream& in, Genom/
+overlap	SailfishUtils.cpp	/^bool overlap( const S<T> &a, const S<T> &b ) {$/
+paired_parser	SalmonQuantify.cpp	107
+parseExpressionRecord	SailfishUtils.cpp	/^ExpressionRecord parseExpressionRecord(std::string/
+parseFeature	PerformBiasCorrection.cpp	/^TranscriptFeatures parseFeature(std::ifstream& ifs/
+parseFeature	PerformBiasCorrection_old.cpp	/^TranscriptFeatures parseFeature(std::ifstream& ifs/
+parseFeatureFile	PerformBiasCorrection.cpp	/^std::vector<TranscriptFeatures> parseFeatureFile(c/
+parseFeatureFile	PerformBiasCorrection_old.cpp	/^std::vector<TranscriptFeatures> parseFeatureFile(c/
+parseLibraryFormatString	SailfishUtils.cpp	/^LibraryFormat parseLibraryFormatString(std::string/
+parseLibraryFormatString	TestBAMParser.cpp	/^LibraryFormat parseLibraryFormatString(std::string/
+parseSailfishFile	PerformBiasCorrection.cpp	/^ExpressionResults parseSailfishFile(const bfs::pat/
+parseSailfishFile	PerformBiasCorrection_old.cpp	/^ExpressionResults parseSailfishFile(const bfs::pat/
+parseTranscripts	BuildTranscriptMap.cpp	/^bool parseTranscripts(boost::filesystem::path& tpa/
+performBiasCorrection	PerformBiasCorrection.cpp	/^int performBiasCorrection($/
+populateFromTPMs	PerformBiasCorrection.cpp	/^void populateFromTPMs(vector<mpdec>& tpms,$/
+precedes	SalmonQuantify.cpp	/^bool precedes(const MatchFragment& a, const MatchF/
+processMiniBatch	SalmonQuantify.cpp	/^void processMiniBatch($/
+processMiniBatch	TestBAMParser.cpp	/^void processMiniBatch(MiniBatchQueue<AlignmentGrou/
+processReadsMEM	SalmonQuantify.cpp	/^void processReadsMEM(ParserT* parser,$/
+quantifyLibrary	TestBAMParser.cpp	/^void quantifyLibrary(LibraryFormat libFmt,$/
+randomMT	cokus.cpp	/^uint32 randomMT(void)$/
+readKmerEquivClasses	LookUpTableUtils.cpp	/^std::vector<KmerID> readKmerEquivClasses(const std/
+readKmerLUT	LookUpTableUtils.cpp	/^void readKmerLUT($/
+readKmerOrder	SailfishUtils.cpp	/^bool readKmerOrder( const std::string& fname, std:/
+readTranscriptInfo	LookUpTableUtils.cpp	/^std::unique_ptr<TranscriptInfo> readTranscriptInfo/
+readTranscriptToGeneMap	SailfishUtils.cpp	/^TranscriptGeneMap readTranscriptToGeneMap( std::if/
+reloadMT	cokus.cpp	/^uint32 reloadMT(void)$/
+remove_column_means	PCAUtils.cpp	/^void remove_column_means(arma::Mat<double>& data, /
+runIterativeOptimizer	QuantificationDriver.cpp	/^int runIterativeOptimizer(int argc, char* argv[] )/
+runJellyfish	PerfectHashIndexer.cpp	/^int runJellyfish(bool canonical,$/
+runKmerCounter	QuantificationDriver.cpp	/^int runKmerCounter(const std::string& sfCommand,$/
+runSailfishEstimation	QuantificationDriver.cpp	/^int runSailfishEstimation(const std::string& sfCom/
+sailfish	SailfishUtils.cpp	44
+salmon::math::LOG_0	TestBAMParser.cpp	48
+salmon::math::LOG_1	TestBAMParser.cpp	49
+salmon::math::logAdd	TestBAMParser.cpp	50
+salmon::math::logSub	TestBAMParser.cpp	51
+salmon::stringtools::encodeSequenceInSAM	SailfishStringUtils.cpp	/^uint8_t* salmon::stringtools::encodeSequenceInSA/
+salmon::utils	SailfishUtils.cpp	45
+salmon::utils::parseLibraryFormatStringNew	SailfishUtils.cpp	/^LibraryFormat parseLibraryFormatStringNew(std::str/
+salmon::utils::std::string	SailfishUtils.cpp	46
+salmonIndex	BuildSalmonIndex.cpp	/^int salmonIndex(int argc, char* argv[]) {$/
+salmonQuantify	SalmonQuantify.cpp	/^int salmonQuantify(int argc, char *argv[]) {$/
+salmonSwim	Salmon.cpp	/^int salmonSwim(int argc, char* argv[]) {$/
+scaleBy	TestBAMParser.cpp	/^void scaleBy(std::vector<T>& vec, T scale) {$/
+seedMT	cokus.cpp	/^void seedMT(uint32 seed)$/
+sequence_parser	BuildSalmonIndex.cpp	61
+sequence_parser	JellyfishMerCounter.cpp	88
+sequence_parser	JellyfishMerCounter.cpp	89
+sequence_qual_parser	JellyfishMerCounter.cpp	93
+sequence_qual_parser::StreamIterator	JellyfishMerCounter.cpp	96
+sequence_qual_parser::sequence_qual_parser	JellyfishMerCounter.cpp	/^  sequence_qual_parser(uint16_t mer_len, uint32_t /
+sequence_qual_parser::super	JellyfishMerCounter.cpp	97
+signal_handler	JellyfishMerCounter.cpp	/^static void signal_handler(int sig) {$/
+single_parser	SalmonQuantify.cpp	109
+state	cokus.cpp	65
+stats	PCAUtils.cpp	28
+stats::utils	PCAUtils.cpp	29
+stats::utils::make_covariance_matrix	PCAUtils.cpp	/^arma::Mat<double> make_covariance_matrix(const arm/
+std::chrono::duration	JellyfishMerCounter.cpp	76
+std::chrono::duration_cast	JellyfishMerCounter.cpp	77
+std::chrono::system_clock	JellyfishMerCounter.cpp	75
+std::string	QuantificationDriver.cpp	56
+stream_manager	BuildSalmonIndex.cpp	60
+stream_manager	SalmonQuantify.cpp	108
+transcript	SailfishUtils.cpp	/^uint32_t transcript(uint64_t enc) {$/
+transcript	SalmonQuantify.cpp	/^uint32_t transcript(uint64_t enc) {$/
+transcriptGeneMapFromGTF	SailfishUtils.cpp	/^TranscriptGeneMap transcriptGeneMapFromGTF(const s/
+transcriptToGeneMapFromFasta	SailfishUtils.cpp	/^TranscriptGeneMap transcriptToGeneMapFromFasta( co/
+uint32	cokus.cpp	55
+writeTranscriptInfo	LookUpTableUtils.cpp	/^void writeTranscriptInfo (TranscriptInfo *ti, std:/

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/salmon.git