[med-svn] [libpll] 01/01: New upstream version 0.3.1

Wed May 17 14:42:59 UTC 2017

This is an automated email from the git hooks/post-receive script.

tille pushed a commit to annotated tag upstream/0.3.1
in repository libpll.

commit 89cb22dd7a401e45463439ca9156089ecb981095
Author: Andreas Tille <tille at debian.org>
Date:   Wed May 17 16:35:34 2017 +0200

    New upstream version 0.3.1
---
 .travis.yml                 |  42 ++++++++----
 ChangeLog.md                |  11 ++++
 configure.ac                |   2 +-
 man/libpll.3                |   7 +-
 src/Makefile.am             |   3 +-
 src/core_derivatives.c      |  11 ++--
 src/core_derivatives_avx.c  |  10 +--
 src/core_derivatives_avx2.c |  10 +--
 src/core_derivatives_sse.c  |   8 +--
 src/core_pmatrix.c          |   4 +-
 src/hardware.c              | 152 ++++++++++++++++++++++++++++++--------------
 src/init.c                  |  34 ----------
 src/models.c                |  16 +++--
 src/pll.c                   |   5 +-
 src/pll.h                   |  34 +++++++---
 15 files changed, 219 insertions(+), 130 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index d205bfa..5f71157 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,9 +7,27 @@ matrix:
       addons:
         apt:
           sources: ['ubuntu-toolchain-r-test']
+          packages: ['gcc-4.6']
+      env:
+        - COMPILER=gcc-4.6 CONFPARAMS="--disable-avx2"
+
+    - os: linux
+      compiler: gcc
+      addons:
+        apt:
+          sources: ['ubuntu-toolchain-r-test']
+          packages: ['gcc-4.7']
+      env:
+        - COMPILER=gcc-4.7 CONFPARAMS=""
+
+    - os: linux
+      compiler: gcc
+      addons:
+        apt:
+          sources: ['ubuntu-toolchain-r-test']
           packages: ['gcc-4.8']
       env:
-        - COMPILER=gcc-4.8
+        - COMPILER=gcc-4.8 CONFPARAMS=""
 
     - os: linux
       compiler: gcc
@@ -18,16 +36,16 @@ matrix:
           sources: ['ubuntu-toolchain-r-test']
           packages: ['gcc-4.9']
       env:
-        - COMPILER=gcc-4.9
+        - COMPILER=gcc-4.9 CONFPARAMS=""
 
     - os: linux
       compiler: gcc
       addons:
         apt:
           sources: ['ubuntu-toolchain-r-test']
-          packages: ['g++-5']
+          packages: ['gcc-5']
       env:
-        - COMPILER=gcc-5
+        - COMPILER=gcc-5 CONFPARAMS=""
 
     - os: linux
       compiler: gcc
@@ -36,7 +54,7 @@ matrix:
           sources: ['ubuntu-toolchain-r-test']
           packages: ['gcc-6']
       env:
-        - COMPILER=gcc-6
+        - COMPILER=gcc-6 CONFPARAMS=""
 
     - os: linux
       compiler: clang
@@ -45,7 +63,7 @@ matrix:
           sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-precise-3.5']
           packages: ['clang-3.5']
       env:
-        - COMPILER=clang-3.5
+        - COMPILER=clang-3.5 CONFPARAMS=""
 
     - os: linux
       compiler: clang
@@ -54,7 +72,7 @@ matrix:
           sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-precise-3.6']
           packages: ['clang-3.6']
       env:
-        - COMPILER=clang-3.6
+        - COMPILER=clang-3.6 CONFPARAMS=""
 
     - os: linux
       compiler: clang
@@ -63,7 +81,7 @@ matrix:
           sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-precise-3.7']
           packages: ['clang-3.7']
       env:
-        - COMPILER=clang-3.7
+        - COMPILER=clang-3.7 CONFPARAMS=""
 
     - os: linux
       compiler: clang
@@ -72,7 +90,7 @@ matrix:
           sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-precise-3.8']
           packages: ['clang-3.8']
       env:
-        - COMPILER=clang-3.8
+        - COMPILER=clang-3.8 CONFPARAMS=""
 
     - os: linux
       dist: trusty
@@ -82,7 +100,7 @@ matrix:
           sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-trusty-3.9']
           packages: ['clang-3.9']
       env:
-        - COMPILER=clang-3.9
+        - COMPILER=clang-3.9 CONFPARAMS=""
 
     - os: linux
       dist: trusty
@@ -92,6 +110,6 @@ matrix:
           sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-trusty-4.0']
           packages: ['clang-4.0']
       env:
-        - COMPILER=clang-4.0
+        - COMPILER=clang-4.0 CONFPARAMS=""
 
-script: ./autogen.sh && CC=$COMPILER ./configure && make && make check
+script: ./autogen.sh && CC=$COMPILER ./configure $CONFPARAMS && make && make check
diff --git a/ChangeLog.md b/ChangeLog.md
index ed3e43e..e1b325c 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -2,6 +2,17 @@
 All notable changes to `libpll` will be documented in this file.
 This project adheres to [Semantic Versioning](http://semver.org/).
 
+## [0.3.1] - 2017-05-17
+### Added
+ - Checks for older versions of clang and gcc to use assembly instructions
+   for cpu features detection
+ - Include guards for pll.h
+### Fixed
+ - Correct updating of padded eigen-decomposition arrays for models with a
+   number of states not being a power of two
+ - Changed to the usage of builtin functions for cpu features detection
+ - Check for x86intrin.h
+
 ## [0.3.0] - 2017-05-15
 ### Added
  - Run-time detection of cpu features
diff --git a/configure.ac b/configure.ac
index 62ba2db..377b391 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.63])
-AC_INIT([libpll], [0.3.0], [Tomas.Flouri at h-its.org])
+AC_INIT([libpll], [0.3.1], [Tomas.Flouri at h-its.org])
 AM_INIT_AUTOMAKE([subdir-objects])
 AC_LANG([C])
 AC_CONFIG_SRCDIR([src/pll.c])
diff --git a/man/libpll.3 b/man/libpll.3
index 2ff6344..e2f1dc1 100644
--- a/man/libpll.3
+++ b/man/libpll.3
@@ -1,6 +1,6 @@
 .\" -*- coding: utf-8 -*-
 .\" ============================================================================
-.TH libpll 3 "September 9, 2016" "libpll 0.1.0" "Library Functions Manual"
+.TH libpll 3 "May 17, 2017" "libpll 0.3.1" "Library Functions Manual"
 .\" ============================================================================
 .SH NAME
 libpll \(em Phylogenetic Likelihood Library
@@ -586,5 +586,10 @@ for custom printing. Fixed derivatives computation, parsing of branch lengths,
 invariant sites computation, log-likelihood computation for cases where we have
 scaling and patterns, ascertainment bias computation, per-site log-likelihood
 computation, memory leaks. Added run-time detection of hardware.
+.TP
+.BR v0.3.1\~ "released May 17th, 2017"
+Correct updating of padded eigen-decomposition arrays for models with a number
+of states not being a power of two. Added portable hardware detection for clang
+and GCC. 
 .RE
 .LP
diff --git a/src/Makefile.am b/src/Makefile.am
index 23c16ed..977595b 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -37,8 +37,7 @@ fast_parsimony.c \
 stepwise.c \
 random.c \
 phylip.c \
-hardware.c \
-init.c
+hardware.c
 
 libpll_la_CFLAGS = $(AM_CFLAGS)
 
diff --git a/src/core_derivatives.c b/src/core_derivatives.c
index a5fea48..2b6e205 100644
--- a/src/core_derivatives.c
+++ b/src/core_derivatives.c
@@ -146,6 +146,8 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states,
   const double * t_inv_eigenvecs;
   const double * t_freqs;
 
+  unsigned int states_padded = states;
+
 #ifdef HAVE_SSE3
   if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
   {
@@ -251,8 +253,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states,
         righterm = 0;
         for (k = 0; k < states; ++k)
         {
-          lefterm  += t_clvp[k] * t_freqs[k] * t_inv_eigenvecs[k * states + j];
-          righterm += t_eigenvecs[j * states + k] * t_clvc[k];
+          lefterm  += t_clvp[k] * t_freqs[k] *
+                                      t_inv_eigenvecs[k * states_padded + j];
+          righterm += t_eigenvecs[j * states_padded + k] * t_clvc[k];
         }
         sum[j] = lefterm * righterm;
 
@@ -385,8 +388,8 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states,
         for (k = 0; k < states; ++k)
         {
           lefterm += (tipstate & 1) * t_freqs[k]
-              * t_inv_eigenvecs[k * states + j];
-          righterm += t_eigenvecs[j * states + k] * t_clvc[k];
+              * t_inv_eigenvecs[k * states_padded + j];
+          righterm += t_eigenvecs[j * states_padded + k] * t_clvc[k];
           tipstate >>= 1;
         }
         sum[j] = lefterm * righterm;
diff --git a/src/core_derivatives_avx.c b/src/core_derivatives_avx.c
index 54f03ff..992e41b 100644
--- a/src/core_derivatives_avx.c
+++ b/src/core_derivatives_avx.c
@@ -273,9 +273,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states,
       for (k = 0; k < states; ++k)
       {
         tt_inv_eigenvecs[i * states_padded * states_padded + j * states_padded
-            + k] = inv_eigenvecs[i][k * states + j] * t_freqs[k];
+            + k] = inv_eigenvecs[i][k * states_padded + j] * t_freqs[k];
         tt_eigenvecs[i * states_padded * states_padded + j * states_padded
-            + k] = eigenvecs[i][j * states + k];
+            + k] = eigenvecs[i][j * states_padded + k];
       }
   }
 
@@ -636,7 +636,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states,
       for (k = 0; k < states_padded; ++k)
       {
         eigenvecs_padded[i*states_padded*states_padded + j*states_padded + k] =
-            (j < states && k < states) ? eigenvecs[i][j*states + k] : 0.;
+            (j < states && k < states) ? eigenvecs[i][j*states_padded + k] : 0.;
       }
   }
 
@@ -659,7 +659,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states,
           /* special case for non-ambiguous state */
           __m256d v_freqs = _mm256_set1_pd(freqs[i][ss]);
           __m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] +
-                                                       ss*states + j);
+                                                       ss*states_padded + j);
           v_lefterm =  _mm256_mul_pd(v_eigen, v_freqs);
         }
         else
@@ -671,7 +671,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states,
             {
               __m256d v_freqs = _mm256_set1_pd(freqs[i][k]);
               __m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] +
-                                                           k*states + j);
+                                                           k*states_padded + j);
 
               v_lefterm = _mm256_add_pd(v_lefterm,
                                         _mm256_mul_pd(v_eigen, v_freqs));
diff --git a/src/core_derivatives_avx2.c b/src/core_derivatives_avx2.c
index 20216d5..bd345ec 100644
--- a/src/core_derivatives_avx2.c
+++ b/src/core_derivatives_avx2.c
@@ -98,9 +98,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states,
       for (k = 0; k < states; ++k)
       {
         tt_inv_eigenvecs[i * states_padded * states_padded + j * states_padded
-            + k] = inv_eigenvecs[i][k * states + j] * t_freqs[k];
+            + k] = inv_eigenvecs[i][k * states_padded + j] * t_freqs[k];
         tt_eigenvecs[i * states_padded * states_padded + j * states_padded
-            + k] = eigenvecs[i][j * states + k];
+            + k] = eigenvecs[i][j * states_padded + k];
       }
   }
 
@@ -281,7 +281,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states,
       for (k = 0; k < states_padded; ++k)
       {
         eigenvecs_padded[i*states_padded*states_padded + j*states_padded + k] =
-            (j < states && k < states) ? eigenvecs[i][j*states + k] : 0.;
+            (j < states && k < states) ? eigenvecs[i][j*states_padded + k] : 0.;
       }
   }
 
@@ -304,7 +304,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states,
           /* special case for non-ambiguous state */
           __m256d v_freqs = _mm256_set1_pd(freqs[i][ss]);
           __m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] +
-                                                       ss*states + j);
+                                                       ss*states_padded + j);
           v_lefterm =  _mm256_mul_pd(v_eigen, v_freqs);
         }
         else
@@ -316,7 +316,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states,
             {
               __m256d v_freqs = _mm256_set1_pd(freqs[i][k]);
               __m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] +
-                                                           k*states + j);
+                                                           k*states_padded + j);
 
               v_lefterm = _mm256_fmadd_pd(v_eigen, v_freqs, v_lefterm);
             }
diff --git a/src/core_derivatives_sse.c b/src/core_derivatives_sse.c
index f9849dd..cf77d84 100644
--- a/src/core_derivatives_sse.c
+++ b/src/core_derivatives_sse.c
@@ -208,8 +208,8 @@ PLL_EXPORT int pll_core_update_sumtable_ii_sse(unsigned int states,
 
         for (k = 0; k < states; ++k)
         {
-          lterm += clvp[k] * freqs[k] * invev[k*states+j];
-          rterm += ev[j*states+k] * clvc[k];
+          lterm += clvp[k] * freqs[k] * invev[k*states_padded+j];
+          rterm += ev[j*states_padded+k] * clvc[k];
         }
 
         sum[j] = lterm*rterm;
@@ -288,8 +288,8 @@ PLL_EXPORT int pll_core_update_sumtable_ti_sse(unsigned int states,
 
         for (k = 0; k < states; ++k)
         {
-          lterm += (tipstate & 1) * freqs[k] * invev[k*states+j];
-          rterm += ev[j*states+k] * clvc[k];
+          lterm += (tipstate & 1) * freqs[k] * invev[k*states_padded+j];
+          rterm += ev[j*states_padded+k] * clvc[k];
           tipstate >>= 1;
         }
         sum[j] = lterm*rterm;
diff --git a/src/core_pmatrix.c b/src/core_pmatrix.c
index 4fb1e84..77aedb5 100644
--- a/src/core_pmatrix.c
+++ b/src/core_pmatrix.c
@@ -214,7 +214,7 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix,
 
         for (j = 0; j < states; ++j)
           for (k = 0; k < states; ++k)
-            temp[j*states+k] = inv_evecs[j*states+k] * expd[k];
+            temp[j*states+k] = inv_evecs[j*states_padded+k] * expd[k];
 
         for (j = 0; j < states; ++j)
         {
@@ -224,7 +224,7 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix,
             for (m = 0; m < states; ++m)
             {
               pmat[j*states_padded+k] +=
-                  temp[j*states+m] * evecs[m*states+k];
+                  temp[j*states+m] * evecs[m*states_padded+k];
             }
           }
         }
diff --git a/src/hardware.c b/src/hardware.c
index 49d414a..8bbd151 100644
--- a/src/hardware.c
+++ b/src/hardware.c
@@ -21,21 +21,52 @@
 
 #include "pll.h"
 
-#ifndef __PPC__
-#define cpuid(f1, f2, a, b, c, d)                                \
-  __asm__ __volatile__ ("cpuid"                                  \
-                        : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \
-                        : "a" (f1), "c" (f2));
-#endif
+#if (!defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 4 || \
+      (__GNUC__ == 4 && __GNUC_MINOR__ < 8))) || \
+    (defined(__clang__) && (__clang_major__ < 3 || \
+      (__clang_major__ == 3 && __clang_minor__ < 9)))
+  
+  #if defined(__i386__) && defined(__PIC__)
+    #if (defined(__GNUC__) && __GNUC__ < 3)
+#define cpuid(level, count, a, b, c, d)                 \
+  __asm__ ("xchgl\t%%ebx, %k1\n\t"                      \
+           "cpuid\n\t"                                  \
+           "xchgl\t%%ebx, %k1\n\t"                      \
+           : "=a" (a), "=&r" (b), "=c" (c), "=d" (d)    \
+           : "0" (level), "2" (count))
+    #else
+#define cpuid(level, count, a, b, c, d)                 \
+  __asm__ ("xchg{l}\t{%%}ebx, %k1\n\t"                  \
+           "cpuid\n\t"                                  \
+           "xchg{l}\t{%%}ebx, %k1\n\t"                  \
+           : "=a" (a), "=&r" (b), "=c" (c), "=d" (d)    \
+           : "0" (level), "2" (count))
+    #endif
+  #elif defined(__x86_64__) && (defined(__code_model_medium__) || \
+        defined(__code_model_large__)) && defined(__PIC__)
+#define cpuid(level, count, a, b, c, d)                 \
+  __asm__ ("xchg{q}\t{%%}rbx, %q1\n\t"                  \
+           "cpuid\n\t"                                  \
+           "xchg{q}\t{%%}rbx, %q1\n\t"                  \
+           : "=a" (a), "=&r" (b), "=c" (c), "=d" (d)    \
+           : "0" (level), "2" (count))
+  #else
+#define cpuid(level, count, a, b, c, d)                 \
+  __asm__ ("cpuid\n\t"                                  \
+           : "=a" (a), "=b" (b), "=c" (c), "=d" (d)     \
+           : "0" (level), "2" (count))
+  #endif
 
 static void cpu_features_detect()
 {
   unsigned int a,b,c,d;
 
-  memset(pll_hardware,0,sizeof(pll_hardware_t));
+  memset(&pll_hardware,0,sizeof(pll_hardware_t));
+
+  pll_hardware.init            = 1;
 
-#ifdef __PPC__
-  pll_hardware->altivec_present = 1;
+#if defined(__PPC__)
+  pll_hardware.altivec_present = 1;
 #else
 
   cpuid(0,0,a,b,c,d);
@@ -44,56 +75,75 @@ static void cpu_features_detect()
   if (maxlevel >= 1)
   {
     cpuid(1,0,a,b,c,d);
-    pll_hardware->mmx_present    = (d >> 23) & 1;
-    pll_hardware->sse_present    = (d >> 25) & 1;
-    pll_hardware->sse2_present   = (d >> 26) & 1;
-    pll_hardware->sse3_present   = (c >>  0) & 1;
-    pll_hardware->ssse3_present  = (c >>  9) & 1;
-    pll_hardware->sse41_present  = (c >> 19) & 1;
-    pll_hardware->sse42_present  = (c >> 20) & 1;
-    pll_hardware->popcnt_present = (c >> 23) & 1;
-    pll_hardware->avx_present    = (c >> 28) & 1;
+    pll_hardware.mmx_present    = (d >> 23) & 1;
+    pll_hardware.sse_present    = (d >> 25) & 1;
+    pll_hardware.sse2_present   = (d >> 26) & 1;
+    pll_hardware.sse3_present   = (c >>  0) & 1;
+    pll_hardware.ssse3_present  = (c >>  9) & 1;
+    pll_hardware.sse41_present  = (c >> 19) & 1;
+    pll_hardware.sse42_present  = (c >> 20) & 1;
+    pll_hardware.popcnt_present = (c >> 23) & 1;
+    pll_hardware.avx_present    = (c >> 28) & 1;
 
     if (maxlevel >= 7)
     {
       cpuid(7,0,a,b,c,d);
-      pll_hardware->avx2_present = (b >> 5) & 1;
+      pll_hardware.avx2_present = (b >> 5) & 1;
     }
   }
 #endif
 }
 
+#else
+
+
+static void cpu_features_detect()
+{
+  memset(&pll_hardware,0,sizeof(pll_hardware_t));
+
+  pll_hardware.init            = 1;
+#if defined(__PPC__)
+  pll_hardware.altivec_present = __builtin_cpu_supports("altivec");
+#elif defined(__x86_64__) || defined(__i386__)
+  pll_hardware.mmx_present     = __builtin_cpu_supports("mmx");
+  pll_hardware.sse_present     = __builtin_cpu_supports("sse");
+  pll_hardware.sse2_present    = __builtin_cpu_supports("sse2");
+  pll_hardware.sse3_present    = __builtin_cpu_supports("sse3");
+  pll_hardware.ssse3_present   = __builtin_cpu_supports("ssse3");
+  pll_hardware.sse41_present   = __builtin_cpu_supports("sse4.1");
+  pll_hardware.sse42_present   = __builtin_cpu_supports("sse4.2");
+  pll_hardware.popcnt_present  = __builtin_cpu_supports("popcnt");
+  pll_hardware.avx_present     = __builtin_cpu_supports("avx");
+  pll_hardware.avx2_present    = __builtin_cpu_supports("avx2");
+#endif
+}
+
+#endif
+
 static void cpu_features_show()
 {
-  if (!pll_hardware)
-  {
-    /* TODO: Add proper error control after we figure out
-       cross-platform compatibility */
-    return;
-  }
-    
   fprintf(stderr, "CPU features:");
-  if (pll_hardware->altivec_present)
+  if (pll_hardware.altivec_present)
     fprintf(stderr, " altivec");
-  if (pll_hardware->mmx_present)
+  if (pll_hardware.mmx_present)
     fprintf(stderr, " mmx");
-  if (pll_hardware->sse_present)
+  if (pll_hardware.sse_present)
     fprintf(stderr, " sse");
-  if (pll_hardware->sse2_present)
+  if (pll_hardware.sse2_present)
     fprintf(stderr, " sse2");
-  if (pll_hardware->sse3_present)
+  if (pll_hardware.sse3_present)
     fprintf(stderr, " sse3");
-  if (pll_hardware->ssse3_present)
+  if (pll_hardware.ssse3_present)
     fprintf(stderr, " ssse3");
-  if (pll_hardware->sse41_present)
+  if (pll_hardware.sse41_present)
     fprintf(stderr, " sse4.1");
-  if (pll_hardware->sse42_present)
+  if (pll_hardware.sse42_present)
     fprintf(stderr, " sse4.2");
-  if (pll_hardware->popcnt_present)
+  if (pll_hardware.popcnt_present)
     fprintf(stderr, " popcnt");
-  if (pll_hardware->avx_present)
+  if (pll_hardware.avx_present)
     fprintf(stderr, " avx");
-  if (pll_hardware->avx2_present)
+  if (pll_hardware.avx2_present)
     fprintf(stderr, " avx2");
   fprintf(stderr, "\n");
 }
@@ -101,15 +151,6 @@ static void cpu_features_show()
 PLL_EXPORT int pll_hardware_probe()
 {
   /* probe cpu features */
-  if (!pll_hardware)
-  {
-    if (!(pll_hardware = (pll_hardware_t *)calloc(1,sizeof(pll_hardware_t))))
-    {
-      pll_errno = PLL_ERROR_MEM_ALLOC;
-      snprintf(pll_errmsg, 200, "Unable to allocate enough memory.");
-      return PLL_FAILURE;
-    }
-  }
   cpu_features_detect();
 
   return PLL_SUCCESS;
@@ -117,5 +158,24 @@ PLL_EXPORT int pll_hardware_probe()
 
 PLL_EXPORT void pll_hardware_dump()
 {
+  if (!pll_hardware.init)
+    pll_hardware_probe();
+
   cpu_features_show(); 
 }
+
+PLL_EXPORT void pll_hardware_ignore()
+{
+  pll_hardware.init            = 1;
+  pll_hardware.altivec_present = 1;
+  pll_hardware.mmx_present     = 1;
+  pll_hardware.sse_present     = 1;
+  pll_hardware.sse2_present    = 1;
+  pll_hardware.sse3_present    = 1;
+  pll_hardware.ssse3_present   = 1;
+  pll_hardware.sse41_present   = 1;
+  pll_hardware.sse42_present   = 1;
+  pll_hardware.popcnt_present  = 1;
+  pll_hardware.avx_present     = 1;
+  pll_hardware.avx2_present    = 1;
+}
diff --git a/src/init.c b/src/init.c
deleted file mode 100644
index 90d5bd3..0000000
--- a/src/init.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
-    Copyright (C) 2017 Tomas Flouri
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Affero General Public License as
-    published by the Free Software Foundation, either version 3 of the
-    License, or (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Affero General Public License for more details.
-
-    You should have received a copy of the GNU Affero General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-    Contact: Tomas Flouri <Tomas.Flouri at h-its.org>,
-    Exelixis Lab, Heidelberg Instutute for Theoretical Studies
-    Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
-*/
-
-#include "pll.h"
-
-PLL_EXPORT void pll_init()
-{
-  pll_hardware_probe();
-}
-
-PLL_EXPORT void pll_fini()
-{
-  if (pll_hardware)
-    free(pll_hardware);
-  pll_hardware = NULL;
-}
diff --git a/src/models.c b/src/models.c
index 6081283..0148f92 100644
--- a/src/models.c
+++ b/src/models.c
@@ -262,6 +262,7 @@ PLL_EXPORT int pll_update_eigen(pll_partition_t * partition,
   double * subst_params = partition->subst_params[params_index];
 
   unsigned int states = partition->states;
+  unsigned int states_padded = partition->states_padded;
 
   a = create_ratematrix(subst_params,
                         freqs,
@@ -291,25 +292,32 @@ PLL_EXPORT int pll_update_eigen(pll_partition_t * partition,
 
   /* store eigen vectors */
   for (i = 0; i < states; ++i)
-    memcpy(eigenvecs + i*states, a[i], states*sizeof(double));
+    memcpy(eigenvecs + i*states_padded, a[i], states*sizeof(double));
 
   /* store eigen values */
   memcpy(eigenvals, d, states*sizeof(double));
 
   /* store inverse eigen vectors */
   for (k = 0, i = 0; i < states; ++i)
-    for (j = i; j < states*states; j += states)
+  {
+    for (j = i; j < states_padded*states; j += states_padded)
       inv_eigenvecs[k++] = eigenvecs[j];
 
+    /* account for padding */
+    k += states_padded - states;
+  }
+
+  assert(k == states_padded*states);
+
   /* multiply the inverse eigen vectors from the left with sqrt(pi)^-1 */
   for (i = 0; i < states; ++i)
     for (j = 0; j < states; ++j)
-      inv_eigenvecs[i*states+ j] /= sqrt(freqs[i]);
+      inv_eigenvecs[i*states_padded+ j] /= sqrt(freqs[i]);
 
   /* multiply the eigen vectors from the right with sqrt(pi) */
   for (i = 0; i < states; ++i)
     for (j = 0; j < states; ++j)
-      eigenvecs[i*states+j] *= sqrt(freqs[j]);
+      eigenvecs[i*states_padded+j] *= sqrt(freqs[j]);
 
   partition->eigen_decomp_valid[params_index] = 1;
 
diff --git a/src/pll.c b/src/pll.c
index 50254aa..299d0cd 100644
--- a/src/pll.c
+++ b/src/pll.c
@@ -24,7 +24,7 @@
 __thread int pll_errno;
 __thread char pll_errmsg[200] = {0};
 
-pll_hardware_t * pll_hardware = NULL;
+pll_hardware_t pll_hardware = {0,0,0,0,0,0,0,0,0,0,0,0};
 
 static void dealloc_partition_data(pll_partition_t * partition);
 
@@ -604,6 +604,7 @@ PLL_EXPORT pll_partition_t * pll_partition_create(unsigned int tips,
                "Unable to allocate enough memory for eigenvectors.");
       return PLL_FAILURE;
     }
+    memset(partition->eigenvecs[i], 0, states * states_padded * sizeof(double));
     /* TODO: don't forget to add code for SSE/AVX */
   }
 
@@ -632,6 +633,7 @@ PLL_EXPORT pll_partition_t * pll_partition_create(unsigned int tips,
                "Unable to allocate enough memory for inverse eigenvectors.");
       return PLL_FAILURE;
     }
+    memset(partition->inv_eigenvecs[i], 0, states * states_padded * sizeof(double));
     /* TODO: don't forget to add code for SSE/AVX */
   }
 
@@ -660,6 +662,7 @@ PLL_EXPORT pll_partition_t * pll_partition_create(unsigned int tips,
                "Unable to allocate enough memory for eigenvalues.");
       return PLL_FAILURE;
     }
+    memset(partition->eigenvals[i], 0, states_padded * sizeof(double));
     /* TODO: don't forget to add code for SSE/AVX */
   }
 
diff --git a/src/pll.h b/src/pll.h
index fe38a76..83daa9c 100644
--- a/src/pll.h
+++ b/src/pll.h
@@ -19,6 +19,8 @@
     Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
 */
 
+#ifndef PLL_H
+#define PLL_H
 #include <assert.h>
 #include <math.h>
 #include <stdio.h>
@@ -26,12 +28,28 @@
 #include <stdint.h>
 #include <string.h>
 #include <ctype.h>
-#include <x86intrin.h>
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
 
+#if (!defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 4 || \
+     (__GNUC__ == 4 && __GNUC_MINOR__ < 7)))
+  #if ((__GNUC__ == 4) && (__GNUC_MINOR__ == 6))
+    #if (defined(HAVE_AVX2))
+      #error "GCC 4.6.x. Please run ./configure --disable-avx2"
+    #endif
+  #else
+    #if (defined(HAVE_AVX2) || defined(HAVE_AVX))
+      #error "GCC < 4.6. Please run ./configure --disable-avx --disable-avx2"
+    #endif
+  #endif
+#endif
+
+#ifdef HAVE_X86INTRIN_H
+#include <x86intrin.h>
+#endif
+
 /* platform specific */
 
 #if (!defined(__APPLE__) && !defined(__WIN32__) && !defined(__WIN64__))
@@ -49,7 +67,8 @@
 #define PLL_MIN(a,b) ((a) < (b) ? (a) : (b))
 #define PLL_MAX(a,b) ((a) > (b) ? (a) : (b))
 #define PLL_SWAP(x,y) do { __typeof__ (x) _t = x; x = y; y = _t; } while(0)
-#define PLL_STAT(x) (pll_hardware && pll_hardware->x)
+#define PLL_STAT(x) ((pll_hardware.init || pll_hardware_probe()) \
+                     && pll_hardware.x)
 
 /* constants */
 
@@ -163,6 +182,7 @@
 
 typedef struct pll_hardware_s
 {
+  int init;
   /* cpu features */
   int altivec_present;
   int mmx_present;
@@ -451,7 +471,7 @@ struct pll_random_data
 
 PLL_EXPORT extern __thread int pll_errno;
 PLL_EXPORT extern __thread char pll_errmsg[200];
-PLL_EXPORT extern pll_hardware_t * pll_hardware;
+PLL_EXPORT extern pll_hardware_t pll_hardware;
 
 PLL_EXPORT extern const unsigned int pll_map_bin[256];
 PLL_EXPORT extern const unsigned int pll_map_nt[256];
@@ -1875,13 +1895,9 @@ PLL_EXPORT int pll_hardware_probe(void);
 
 PLL_EXPORT void pll_hardware_dump();
 
-/* functions in init.c */
-
-PLL_EXPORT void pll_init(void) __attribute__((constructor));
-
-PLL_EXPORT void pll_fini(void) __attribute__((destructor));
-
+PLL_EXPORT void pll_hardware_ignore();
 
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
+#endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/libpll.git