[med-svn] [Git][med-team/kalign][upstream] New upstream version 3.3.2

Andreas Tille (@tille) gitlab at salsa.debian.org
Mon May 2 14:40:52 BST 2022



Andreas Tille pushed to branch upstream at Debian Med / kalign


Commits:
c843a487 by Andreas Tille at 2022-05-02T13:48:08+02:00
New upstream version 3.3.2
- - - - -


14 changed files:

- ChangeLog
- configure.ac
- scripts/benchmark.org
- src/alignment_parameters.c
- src/aln_controller.c
- src/aln_seqprofile.c
- src/aln_seqseq.c
- src/aln_task.c
- src/bisectingKmeans.c
- src/euclidean_dist.c
- src/misc.c
- src/run_kalign.c
- src/rwalign.c
- src/sequence_distance.c


Changes:

=====================================
ChangeLog
=====================================
@@ -1,3 +1,13 @@
+2022-03-21  Timo Lassmann  <timo.lassmann at telethonkids.org.au>
+
+	* version 3.3.2 - Bug Fix
+	There was a bug in building a guide tree from highly similar sequences. The fix
+	was involved distributing identical sequences equally among branches. This only happened
+	when there were thousands of identical sequences.
+
+	In addition Kalign now compiles on Apple's M1 chip and possibly on other ARM architectures
+	as well (although I did not test the latter).
+
 2021-04-16  Timo Lassmann  <timo.lassmann at telethonkids.org.au>
 
 	* version 3.3.1 - Bug Fix


=====================================
configure.ac
=====================================
@@ -1,4 +1,4 @@
-AC_INIT(kalign, 3.3.1)
+AC_INIT(kalign, 3.3.2)
 
 #AC_CONFIG_AUX_DIR([.])
 


=====================================
scripts/benchmark.org
=====================================
@@ -552,9 +552,6 @@
    #+END_SRC
 
 
-
-
-
    To plot the scores: 
    #+NAME: Rplotheader
    #+BEGIN_SRC R :session  one :results none :export none :noweb yes 


=====================================
src/alignment_parameters.c
=====================================
@@ -26,6 +26,7 @@
 
 int set_subm_gaps(struct aln_param* ap);
 int set_subm_gaps_DNA(struct aln_param* ap);
+int set_subm_gaps_blast(struct aln_param* ap);
 int set_param_number(struct aln_param* ap,int L, int sel);
 
 int new_aln_matrices(struct aln_param* ap);
@@ -54,6 +55,7 @@ int init_ap(struct aln_param** aln_param, struct parameters* param,int L)
         }
         if(L == ALPHA_defDNA){
                 RUN(set_subm_gaps_DNA(ap));
+                //set_subm_gaps_blast(ap);
         }else if(L == ALPHA_defPROTEIN){
 
                 RUN(set_subm_gaps(ap));
@@ -104,6 +106,22 @@ void free_ap(struct aln_param* ap)
         }
 }
 
+int set_subm_gaps_blast(struct aln_param* ap)
+{
+        int i,j;
+        for(i = 0; i < 5; i++){
+                for(j =0; j < 5;j++){
+                        ap->subm[i][j] = -3;
+                        if(i == j){
+                                ap->subm[i][j] = 4;
+                        }
+                }
+        }
+        ap->gpo = 5;
+        ap->gpe = 5;
+        ap->tgpe = 0;
+        return OK;
+}
 /* These are old parameters from kalign 2 */
 int set_subm_gaps_DNA(struct aln_param* ap)
 {
@@ -113,7 +131,7 @@ int set_subm_gaps_DNA(struct aln_param* ap)
                         ap->subm[i][j] = 283;
                 }
         }
-        //	A   91 -114  -31 -123    0  -43
+//	A   91 -114  -31 -123    0  -43
         ap->subm[0][0] += 91;
         ap->subm[0][1] += -114;
         ap->subm[0][2] += -31;


=====================================
src/aln_controller.c
=====================================
@@ -208,8 +208,6 @@ int aln_runner_serial(struct aln_mem* m)
         return OK;
 }
 
-
-
 int aln_continue(struct aln_mem* m,float input_states[],int old_cor[],int meet,int transition)
 {
         //fprintf(stderr,"Transition:%d	at:%d\n",transition,c);
@@ -433,8 +431,7 @@ int aln_continue(struct aln_mem* m,float input_states[],int old_cor[],int meet,i
         return OK;
 }
 
-
-int aln_continue_serial (struct aln_mem* m,float input_states[],int old_cor[],int meet,int transition)
+int aln_continue_serial(struct aln_mem* m,float input_states[],int old_cor[],int meet,int transition)
 {
         //fprintf(stderr,"Transition:%d	at:%d\n",transition,c);
         //LOG_MSG("MAX: %f",max);


=====================================
src/aln_seqprofile.c
=====================================
@@ -7,12 +7,9 @@
 #include "aln_struct.h"
 #define ALN_SEQPROF_IMPORT
 #include "aln_seqprofile.h"
-
-
 #define MAX(a, b) (a > b ? a : b)
 #define MAX3(a,b,c) MAX(MAX(a,b),c)
 
-
 int aln_seqprofile_foward(struct aln_mem* m)
 {
         struct states* s = m->f;
@@ -52,10 +49,8 @@ int aln_seqprofile_foward(struct aln_mem* m)
                         s[j].ga = MAX(s[j-1].ga,s[j-1].a) - text;
                         s[j].gb = -FLT_MAX;
                 }
-
         }
 
-
         s[m->endb].a = -FLT_MAX;
         s[m->endb].ga = -FLT_MAX;
         s[m->endb].gb = -FLT_MAX;
@@ -73,7 +68,6 @@ int aln_seqprofile_foward(struct aln_mem* m)
                 xa = s[m->startb].a;
                 xga = s[m->startb].ga;
 
-
                 if(m->startb){
                         s[m->startb].gb = MAX(pgb+prof1[28],pa+prof1[27]);
                 }else{
@@ -86,7 +80,6 @@ int aln_seqprofile_foward(struct aln_mem* m)
 
                         pa += prof1[32 + seq2[j]];
 
-
                         s[j].a = pa;
 
                         pga = s[j].ga;
@@ -94,7 +87,6 @@ int aln_seqprofile_foward(struct aln_mem* m)
                         //s[j].ga = MAX(s[j-1].ga-ext,s[j-1].a-open);
                         s[j].ga = MAX(xga-ext,xa-open);
 
-
                         pgb = s[j].gb;
 
                         s[j].gb = MAX(pgb+prof1[28],ca+prof1[27]);
@@ -110,7 +102,6 @@ int aln_seqprofile_foward(struct aln_mem* m)
 
                 pa += prof1[32 + seq2[j]];
 
-
                 s[j].a = pa;
 
                 s[j].ga = -FLT_MAX;//MAX(s[j-1].ga-ext,s[j-1].a-open);
@@ -120,7 +111,6 @@ int aln_seqprofile_foward(struct aln_mem* m)
                 }else{
                         s[j].gb = MAX(s[j].gb,ca)+ prof1[29];
                 }
-
         }
         //prof1 -= m->enda << 6;
         return OK;
@@ -147,7 +137,6 @@ int aln_seqprofile_backward(struct aln_mem* m)
         const float ext  = m->ap->gpe  * (float)sip;
         const float text = m->ap->tgpe * (float)sip;
 
-
         prof1 += (m->enda_2 +1) << 6;
 
         s[m->endb].a = s[0].a;
@@ -184,7 +173,6 @@ int aln_seqprofile_backward(struct aln_mem* m)
                 xa = s[m->endb].a;
                 xga = s[m->endb].ga;
 
-
                 if(m->endb != m->len_b){
                         s[m->endb].gb = MAX(pgb+prof1[28],pa+prof1[27]);
                 }else{
@@ -211,8 +199,6 @@ int aln_seqprofile_backward(struct aln_mem* m)
                         pa = ca;
                         xa = s[j].a;
                         xga = s[j].ga;
-
-
                 }
                 ca = s[j].a;
 
@@ -221,14 +207,12 @@ int aln_seqprofile_backward(struct aln_mem* m)
 
                 s[j].a = pa;
 
-
                 s[j].ga = -FLT_MAX;//MAX(s[j+1].ga-ext,s[j+1].a-open);
                 if(m->startb){
                         s[j].gb = MAX(s[j].gb+prof1[28], ca+prof1[27]);
                 }else{
                         s[j].gb = MAX(s[j].gb,ca)+prof1[29];
                 }
-
         }
         return OK;
 }


=====================================
src/aln_seqseq.c
=====================================
@@ -72,6 +72,7 @@ int aln_seqseq_foward(struct aln_mem* m)
                 }else{
                         s[startb].gb = MAX(pgb,pa) - tgpe;
                 }
+
                 for (j = startb+1; j < endb;j++){
                         ca = s[j].a;
                         pa = MAX3(pa,pga-gpo,pgb-gpo);
@@ -90,8 +91,8 @@ int aln_seqseq_foward(struct aln_mem* m)
 
                         xa = s[j].a;
                         xga = s[j].ga;
-
                 }
+
                 ca = s[j].a;
                 pa = MAX3(pa,pga-gpo,pgb-gpo);
                 pa += subp[seq2[j]];
@@ -104,7 +105,6 @@ int aln_seqseq_foward(struct aln_mem* m)
                 }else{
                         s[j].gb = MAX(s[j].gb,ca)-tgpe;
                 }
-
         }
         return OK;
 }
@@ -137,14 +137,11 @@ int aln_seqseq_backward(struct aln_mem* m)
         const float tgpe = m->ap->tgpe;
         float** subm = m->ap->subm;
 
-
         s[endb].a = s[0].a ;
         s[endb].ga = s[0].ga;
         s[endb].gb = s[0].gb;
 
-
         //init of first row;
-
         //j = endb-startb;
         if(endb != m->len_b){
                 for(j = endb-1;j > startb;j--){
@@ -160,7 +157,6 @@ int aln_seqseq_backward(struct aln_mem* m)
                 }
         }
 
-
         s[startb].a = -FLT_MAX;
         s[startb].ga = -FLT_MAX;
         s[startb].gb = -FLT_MAX;


=====================================
src/aln_task.c
=====================================
@@ -7,7 +7,6 @@ static int sort_tasks_by_priority(const void *a, const void *b);
 static int sort_tasks_by_c(const void *a, const void *b);
 
 
-
 int sort_tasks(struct aln_tasks* t , int order)
 {
         ASSERT(t != NULL, "No tasks");


=====================================
src/bisectingKmeans.c
=====================================
@@ -19,15 +19,17 @@
   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 */
+#include "tldevel.h"
 #ifdef HAVE_OPENMP
 #include <omp.h>
 #endif
 
 #ifdef HAVE_AVX2
 #include <xmmintrin.h>
+#include <mm_malloc.h>
 #endif
 
-#include <mm_malloc.h>
+
 
 #include "tlrng.h"
 #include "msa.h"
@@ -68,8 +70,8 @@ static int label_internal(struct node*n, int label);
 static void create_tasks(struct node*n, struct aln_tasks* t);
 
 
-static struct node* bisecting_kmeans_serial(struct msa* msa, struct node* n, float** dm,int* samples,int numseq, int num_anchors,int num_samples);
-static struct node* bisecting_kmeans_parallel(struct msa* msa, struct node* n, float** dm,int* samples,int numseq, int num_anchors,int num_samples);
+static int bisecting_kmeans_serial(struct msa* msa, struct node** ret_n, float** dm,int* samples, int num_samples);
+static int bisecting_kmeans_parallel(struct msa* msa, struct node** ret_n, float** dm,int* samples, int num_samples);
 
 static int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick,struct kmeans_result** ret);
 
@@ -121,9 +123,9 @@ int build_tree_kmeans(struct msa* msa, struct aln_param* ap,struct aln_tasks** t
         LOG_MSG("Building guide tree.");
 
         if(ap->nthreads == 1){
-                root = bisecting_kmeans_serial(msa,root, dm, samples, numseq, num_anchors, numseq);
+                RUN(bisecting_kmeans_serial(msa,&root, dm, samples, numseq));
         }else{
-                root = bisecting_kmeans_parallel(msa,root, dm, samples, numseq, num_anchors, numseq);
+                RUN(bisecting_kmeans_parallel(msa,&root, dm, samples, numseq));
         }
 
         STOP_TIMER(timer);
@@ -142,7 +144,11 @@ int build_tree_kmeans(struct msa* msa, struct aln_param* ap,struct aln_tasks** t
                 }*/
         MFREE(root);
         for(i =0 ; i < msa->numseq;i++){
+#ifdef HAVE_AVX2
                 _mm_free(dm[i]);
+#else
+                MFREE(dm[i]);
+#endif
         }
         MFREE(dm);
         DESTROY_TIMER(timer);
@@ -151,15 +157,14 @@ ERROR:
         return FAIL;
 }
 
-
-
-
-
-struct node* bisecting_kmeans_parallel(struct msa* msa, struct node* n, float** dm,int* samples,int numseq, int num_anchors,int num_samples)
+int bisecting_kmeans_parallel(struct msa* msa, struct node** ret_n, float** dm,int* samples, int num_samples)
 {
         struct kmeans_result* res_tmp = NULL;
         struct kmeans_result* best = NULL;
         struct kmeans_result** res_ptr = NULL;
+        struct node* n = NULL;
+        int num_anchors = 0;
+
         int i,j;
         int tries = 40;
         /* int t_iter; */
@@ -167,17 +172,22 @@ struct node* bisecting_kmeans_parallel(struct msa* msa, struct node* n, float**
         int* sl = NULL;
         int* sr = NULL;
         int num_l,num_r;
+
+        num_anchors = MACRO_MIN(32, msa->numseq);
+
         if(num_samples < 100){
                 float** dm = NULL;
                 RUNP(dm = d_estimation(msa, samples, num_samples,1));// anchors, num_anchors,1));
                 n = upgma(dm,samples, num_samples);
-
+                *ret_n = n;
                 gfree(dm);
                 MFREE(samples);
-                return n;
+                return OK;
+                //return n;
         }else if(num_samples < 1000){
-                n = bisecting_kmeans_serial(msa, n, dm, samples, numseq, num_anchors, num_samples);
-                return n;
+                RUN(bisecting_kmeans_serial(msa, &n, dm, samples, num_samples));
+                *ret_n = n;
+                return OK;
         }
 
 
@@ -200,7 +210,6 @@ struct node* bisecting_kmeans_parallel(struct msa* msa, struct node* n, float**
                         split(dm,samples,num_anchors, num_samples, (i+ j)*step, &res_ptr[j]);
                 }
 
-
                 for(j = 0; j < 4;j++){
                         if(!best){
                                 change++;
@@ -215,9 +224,6 @@ struct node* bisecting_kmeans_parallel(struct msa* msa, struct node* n, float**
 
                                         change++;
                                 }
-
-
-
                         }
                 }
                 if(!change){
@@ -248,17 +254,17 @@ struct node* bisecting_kmeans_parallel(struct msa* msa, struct node* n, float**
                 {
                         //LOG_MSG("Done");
 
-#pragma omp task shared(msa,n,dm,numseq,num_anchors)
+#pragma omp task shared(msa,n,dm,num_anchors)
 #endif
                         {
-                                n->left = bisecting_kmeans_parallel(msa,n->left, dm, sl, numseq, num_anchors,num_l);
+                                bisecting_kmeans_parallel(msa,&n->left, dm, sl, num_l);
                         }
 
 #ifdef HAVE_OPENMP
-#pragma omp task shared(msa,n,dm,numseq,num_anchors)
+#pragma omp task shared(msa,n,dm,num_anchors)
 #endif
                         {
-                                n->right = bisecting_kmeans_parallel(msa,n->right, dm, sr, numseq, num_anchors, num_r);
+                                bisecting_kmeans_parallel(msa,&n->right, dm, sr, num_r);
                         }
 
 #ifdef HAVE_OPENMP
@@ -266,16 +272,20 @@ struct node* bisecting_kmeans_parallel(struct msa* msa, struct node* n, float**
                 }
         }
 #endif
-        return n;
+
+        *ret_n =n;
+        return OK;
 ERROR:
-        return NULL;
+        return FAIL;
 }
 
-struct node* bisecting_kmeans_serial(struct msa* msa, struct node* n, float** dm,int* samples,int numseq, int num_anchors,int num_samples)
+int bisecting_kmeans_serial(struct msa* msa, struct node** ret_n, float** dm,int* samples, int num_samples)
 {
         struct kmeans_result* res_tmp = NULL;
         struct kmeans_result* best = NULL;
         struct kmeans_result** res_ptr = NULL;
+        int num_anchors = 0;
+        struct node* n = NULL;
         int i,j;
         int tries = 40;
         /* int t_iter; */
@@ -283,14 +293,17 @@ struct node* bisecting_kmeans_serial(struct msa* msa, struct node* n, float** dm
         int* sl = NULL;
         int* sr = NULL;
         int num_l,num_r;
+
+        num_anchors = MACRO_MIN(32, msa->numseq);
+
         if(num_samples < 100){
                 float** dm = NULL;
                 RUNP(dm = d_estimation(msa, samples, num_samples,1));// anchors, num_anchors,1));
                 n = upgma(dm,samples, num_samples);
-
+                *ret_n = n;
                 gfree(dm);
                 MFREE(samples);
-                return n;
+                return OK;
         }
 
         best = NULL;
@@ -324,9 +337,6 @@ struct node* bisecting_kmeans_serial(struct msa* msa, struct node* n, float** dm
 
                                         change++;
                                 }
-
-
-
                         }
                 }
                 if(!change){
@@ -350,11 +360,12 @@ struct node* bisecting_kmeans_serial(struct msa* msa, struct node* n, float** dm
         MFREE(samples);
         n = alloc_node();
 
-        n->left = bisecting_kmeans_serial(msa,n->left, dm, sl, numseq, num_anchors,num_l);
-        n->right = bisecting_kmeans_serial(msa,n->right, dm, sr, numseq, num_anchors, num_r);
-        return n;
+        bisecting_kmeans_serial(msa,&n->left , dm, sl, num_l);
+        bisecting_kmeans_serial(msa,&n->right, dm, sr, num_r);
+        *ret_n = n;
+        return OK;
 ERROR:
-        return NULL;
+        return FAIL;
 }
 
 int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick,struct kmeans_result** ret)
@@ -384,14 +395,24 @@ int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick
         num_var = num_var << 3;
 
 
+
+
+#ifdef HAVE_AVX2
         wr = _mm_malloc(sizeof(float) * num_var,32);
         wl = _mm_malloc(sizeof(float) * num_var,32);
         cr = _mm_malloc(sizeof(float) * num_var,32);
         cl = _mm_malloc(sizeof(float) * num_var,32);
         w = _mm_malloc(sizeof(float) * num_var,32);
+#else
+        MMALLOC(wr,sizeof(float) * num_var);
+        MMALLOC(wl,sizeof(float) * num_var);
+        MMALLOC(cr,sizeof(float) * num_var);
+        MMALLOC(cl,sizeof(float) * num_var);
+        MMALLOC(w,sizeof(float) * num_var);
+#endif
+
         if(*ret){
                 res = *ret;
-
         }else{
                 RUNP(res = alloc_kmeans_result(num_samples));
         }
@@ -433,7 +454,11 @@ int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick
                 //      fprintf(stdout,"%f %f  %f\n", cl[j],cr[j],w[j]);
         }
 
+#ifdef HAVE_AVX2
         _mm_free(w);
+#else
+        MFREE(w);
+#endif
 
         /* check if cr == cl - we have identical sequences  */
         s = 0;
@@ -448,10 +473,19 @@ int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick
                 score = 0.0F;
                 num_l = 0;
                 num_r = 0;
-                sl[num_l] = samples[0];
-                num_l++;
-
-                for(i =1 ; i <num_samples;i++){
+                /* The code below caused sequence sets of size 1 to be passed to clustering...  */
+                /* sl[num_l] = samples[0]; */
+                /* num_l++; */
+
+                /* for(i =1 ; i <num_samples;i++){ */
+                /*         sr[num_r] = samples[i]; */
+                /*         num_r++; */
+                /* } */
+                for(i = 0; i < num_samples/2;i++){
+                        sl[num_l] = samples[i];
+                        num_l++;
+                }
+                for(i = num_samples/2; i < num_samples;i++){
                         sr[num_r] = samples[i];
                         num_r++;
                 }
@@ -490,6 +524,16 @@ int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick
                                         sl[num_l] = s;
                                         num_l++;
                                 }else{
+                                        /* Assign sequence to smaller group  */
+                                        /* if(num_l < num_r){ */
+                                        /*         w = wl; */
+                                        /*         sl[num_l] = s; */
+                                        /*         num_l++; */
+                                        /* }else{ */
+                                        /*         w = wr; */
+                                        /*         sr[num_r] = s; */
+                                        /*         num_r++; */
+                                        /* } */
                                         if(i & 1){
                                                 w = wr;
                                                 sr[num_r] = s;
@@ -500,11 +544,9 @@ int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick
                                                 num_l++;
                                         }
                                 }
-
                                 for(j = 0; j < num_anchors;j++){
                                         w[j] += dm[s][j];
                                 }
-
                         }
 
                         for(j = 0; j < num_anchors;j++){
@@ -538,10 +580,18 @@ int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick
                         }
                 }
         }
+
+#ifdef HAVE_AVX2
         _mm_free(wr);
         _mm_free(wl);
         _mm_free(cr);
         _mm_free(cl);
+#else
+        MFREE(wr);
+        MFREE(wl);
+        MFREE(cr);
+        MFREE(cl);
+#endif
 
         res->nl =  num_l;
         res->nr =  num_r;


=====================================
src/euclidean_dist.c
=====================================
@@ -25,9 +25,10 @@
 #ifdef HAVE_AVX2
 #include <xmmintrin.h>
 #include <immintrin.h>
+#include <mm_malloc.h>
 #endif
 
-#include <mm_malloc.h>
+
 
 #include "float.h"
 
@@ -55,7 +56,11 @@ int main(void)
 
         MMALLOC(mat, sizeof(float*)* 100);
         for(i = 0; i < 100;i++){
+#ifdef HAVE_AVX2
                 mat[i] = _mm_malloc(sizeof(float)*num_element, 32);
+#else
+                MMALLOC(mat[i],sizeof(float)*num_element);
+#endif
         }
 
         RUNP( rng =init_rng(0));
@@ -89,8 +94,6 @@ int main(void)
         for(i = 0; i < 100;i++){
                 for(j = 0; j <= i;j++){
                         edist_serial(mat[i], mat[j], num_element, &d1);
-
-
                 }
         }
         }
@@ -116,7 +119,11 @@ int main(void)
 
 #endif
         for(i = 0; i < 100;i++){
+#ifdef HAVE_AVX2
                 _mm_free(mat[i]);
+#else
+                MFREE(mat[i]);
+#endif
         }
         MFREE(mat);
 


=====================================
src/misc.c
=====================================
@@ -30,7 +30,7 @@
 #include "misc.h"
 #include  <stdalign.h>
 #include <string.h>
-#include <immintrin.h>
+/* #include <immintrin.h> */
 #ifdef ITEST_MISC
 
 


=====================================
src/run_kalign.c
=====================================
@@ -20,6 +20,8 @@
 
 */
 
+#include "tldevel.h"
+#include <stdio.h>
 #ifdef HAVE_OPENMP
 #include <omp.h>
 #endif
@@ -98,6 +100,7 @@ int print_kalign_help(char * argv[])
         fprintf(stdout,"Combining multiple input files:\n\n   kalign seqsA.fa seqsB.fa seqsC.fa -f fasta > combined.afa\n\n");
 
 
+
         if(basename){
                 MFREE(basename);
         }
@@ -416,7 +419,7 @@ int main(int argc, char *argv[])
                 }
         }
 
-        if (param->num_infiles == 0){
+        if(param->num_infiles == 0){
                 if (!isatty(fileno(stdin))){
                         LOG_MSG("Attempting stdin");
                         param->num_infiles =1;
@@ -500,6 +503,7 @@ int run_kalign(struct parameters* param)
                                 snprintf(msa->sequences[i]->name, 128, "SEQ%d", i+1);
                         }
                 }
+
                 if(param->unalign){
                         /* if(param->out_format == FORMAT_FA){ */
                         RUN(dealign_msa(msa));


=====================================
src/rwalign.c
=====================================
@@ -21,6 +21,7 @@
 */
 
 
+#include "tldevel.h"
 #include "tlmisc.h"
 
 #include "global.h"
@@ -65,7 +66,11 @@ struct out_line{
 
 
 
-static int aln_unknown_warning_message(struct msa* msa);
+
+
+static int aln_unknown_warning_message_gaps_but_len_diff(struct msa* msa);
+static int aln_unknown_warning_message_same_len_no_gaps(struct msa* msa);
+
 static int read_fasta(struct in_buffer* b, struct msa** msa);
 static int read_msf(struct in_buffer* b, struct msa** msa);
 static int read_clu(struct in_buffer* b, struct msa** msa);
@@ -216,10 +221,11 @@ int read_input(char* infile,struct msa** msa)
 
         RUN(detect_alignment_format(b, &type));
 
-        /* LOG_MSG("FORMAT: %d", type); */
+        //LOG_MSG("FORMAT: %d", type);
         if(type == FORMAT_FA){
                 //RUNP(msa = read_msf(infile,msa));
                 //RUNP(msa = read_clu(infile,msa));
+                LOG_MSG("reading fasta");
                 RUN(read_fasta(b,&m));
         }else if(type == FORMAT_MSF){
                 RUN(read_msf(b,&m));
@@ -249,7 +255,6 @@ int read_input(char* infile,struct msa** msa)
         *msa = m;
         return OK;
 ERROR:
-
         free_msa(m);
         return FAIL;
 }
@@ -468,27 +473,29 @@ int detect_aligned(struct msa* msa)
                 l = 0;
                 for (j = 0; j <= msa->sequences[i]->len;j++){
                         l += msa->sequences[i]->gaps[j];
+
+
                 }
                 gaps += l;
+
                 l += msa->sequences[i]->len;
                 min_len = MACRO_MIN(min_len, l);
                 max_len = MACRO_MAX(max_len, l);
+                /* LOG_MSG("%d %d", max_len, min_len); */
         }
         if(gaps){
                 if(min_len == max_len){ /* sequences have gaps and total length is identical - clearly aligned  */
                         msa->aligned = ALN_STATUS_ALIGNED;
                 }else{          /* odd there are gaps but total length differs - unknown status  */
-                        aln_unknown_warning_message(msa);
+                        aln_unknown_warning_message_gaps_but_len_diff(msa);
 
                         msa->aligned = ALN_STATUS_UNKNOWN;
                 }
         }else{
                 if(min_len == max_len){ /* no gaps and sequences have same length. Can' tell if they are aligned  */
-                        aln_unknown_warning_message(msa);
+                        aln_unknown_warning_message_same_len_no_gaps(msa);
                         msa->aligned = ALN_STATUS_UNKNOWN;
                 }else{          /* No gaps and sequences have different lengths - unaligned */
-
-
                         msa->aligned = ALN_STATUS_UNALIGNED;
                 }
         }
@@ -496,7 +503,7 @@ int detect_aligned(struct msa* msa)
         return OK;
 }
 
-static int aln_unknown_warning_message(struct msa* msa)
+static int aln_unknown_warning_message_gaps_but_len_diff(struct msa* msa)
 {
         int i;
         WARNING_MSG("--------------------------------------------");
@@ -508,7 +515,7 @@ static int aln_unknown_warning_message(struct msa* msa)
                 }
         }
 
-        WARNING_MSG("BUT the sequences do not seem to be aligned!");
+        WARNING_MSG("BUT the presumably aligned sequences do not have the length.");
         WARNING_MSG("                                            ");
         WARNING_MSG("Kalign will remove the gap characters and   ");
         WARNING_MSG("align the sequences.                        ");
@@ -517,6 +524,21 @@ static int aln_unknown_warning_message(struct msa* msa)
 }
 
 
+static int aln_unknown_warning_message_same_len_no_gaps(struct msa* msa)
+{
+        int i;
+        WARNING_MSG("--------------------------------------------");
+        WARNING_MSG("All input sequences have the same length.   ");
+        WARNING_MSG("BUT there are no gap characters.            ");
+        WARNING_MSG("                                            ");
+        WARNING_MSG("Unable to determine whether the sequences   ");
+        WARNING_MSG("are already aligned.                        ");
+        WARNING_MSG("Kalign will align the sequences.            ");
+        WARNING_MSG("--------------------------------------------");
+        return OK;
+}
+
+
 /* Checks if sequence names are duplicated */
 /* Checks if sequences are duplicated */
 int run_extra_checks_on_msa(struct msa* msa)
@@ -1234,6 +1256,11 @@ int read_clu(struct in_buffer* b , struct msa** m)
                                 p = line;
                                 j = 0;
                                 for(i = 0;i < line_len;i++){
+                                        if(i == MSA_NAME_LEN-1){
+                                                seq_ptr->name[i] = 0;
+                                                j = i;
+                                                break;
+                                        }
                                         if(isspace((int)p[i])){
                                                 j = i;
                                                 break;
@@ -1298,14 +1325,17 @@ int read_msf(struct in_buffer* b,struct msa** m)
                         if(msa->alloc_numseq == msa->numseq){
                                 RUN(resize_msa(msa));
                         }
-
-                        p+= 5;  /* length of name: */
+                        p += 5;  /* length of name: */
                         while( isspace((int)*p)){
                                 p++;
                         }
+                        /* LOG_MSG("Found name: %s len %d", p, strlen(p)); */
                         seq_ptr = msa->sequences[active_seq];
-
                         for(i = 0;i < line_len;i++){
+                                if(i == MSA_NAME_LEN-1){
+                                        seq_ptr->name[i] = 0;
+                                        break;
+                                }
                                 if(isspace((int)p[i])){
                                         seq_ptr->name[i] = 0;
                                         break;
@@ -1318,7 +1348,7 @@ int read_msf(struct in_buffer* b,struct msa** m)
                         //LOG_MSG("Got a name %s",p);
                 }
         }
-        active_seq =0;
+        active_seq = 0;
         for(nl = li; nl < b->n_lines;nl++){
                 line = b->l[nl]->line;
                 line_len = b->l[nl]->len;
@@ -1390,13 +1420,20 @@ int read_fasta( struct in_buffer* b,struct msa** m)
                         }
 
                         /* line[line_len-1] = 0; */
-                        for(i =0 ; i < line_len;i++){
+                        /*for(i = 0; i < line_len;i++){
                                 if(isspace(line[i])){
                                         line[i] = 0;
                                 }
-                        }
+                                }*/
+
                         seq_ptr = msa->sequences[msa->numseq];
-                        snprintf(seq_ptr->name ,MSA_NAME_LEN ,"%s",line+1);
+                        MFREE(seq_ptr->name);
+                        MMALLOC(seq_ptr->name, sizeof(char) * line_len);
+                        memcpy(seq_ptr->name, line+1, line_len);
+
+                        /* snprintf(seq_ptr->name ,MSA_NAME_LEN ,"%s",line+1); */
+                        //LOG_MSG("Name %d: len %d  name : %s ",msa->numseq, line_len, seq_ptr->name);
+                        //fprintf(stdout,"%s\n%s\n", line, seq_ptr->name);
                         msa->numseq++;
 
                 }else{
@@ -1415,6 +1452,7 @@ int read_fasta( struct in_buffer* b,struct msa** m)
                                         seq_ptr->gaps[seq_ptr->len]++;
                                 }
                         }
+
                 }
         }
         RUN(null_terminate_sequences(msa));


=====================================
src/sequence_distance.c
=====================================
@@ -21,11 +21,13 @@
 
 */
 
+#include "tldevel.h"
 #ifdef HAVE_AVX2
 #include <xmmintrin.h>
+#include <mm_malloc.h>
 #endif
 
-#include <mm_malloc.h>
+
 #include "sequence_distance.h"
 
 #include "alphabet.h"
@@ -88,7 +90,6 @@ float** d_estimation(struct msa* msa, int* samples, int num_samples,int pair)
                                 //dist = dist / (float) MACRO_MIN(len_a, len_b);
                                 dm[i][j] = dist;// + (float)( i * num_samples + j) / (float) ( num_samples * num_samples);
                                 dm[j][i] = dm[i][j];
-                                //fprintf(stdout,"%f ", dm[i][j]);
                         }
 
                         //fprintf(stdout,"\n");
@@ -106,8 +107,12 @@ float** d_estimation(struct msa* msa, int* samples, int num_samples,int pair)
 
                 for(i = 0; i < numseq;i++){
                         dm[i] = NULL;
-
+#ifdef HAVE_AVX2
                         dm[i] = _mm_malloc(sizeof(float) * a,32);
+#else
+                        MMALLOC(dm[i], sizeof(float) *a);
+#endif
+
                         for(j = 0; j < a;j++){
                                 dm[i][j] = 0.0F;
                         }
@@ -139,6 +144,8 @@ float** d_estimation(struct msa* msa, int* samples, int num_samples,int pair)
                                 //dm[i][j] = dist;
                         }
                 }
+
+
                 /* for(i = 0; i < numseq;i++){ */
                 /*         seq_a = msa->sequences[i]->s;// aln->s[i]; */
                 /*         len_a = msa->sequences[i]->len;//  aln->sl[i]; */
@@ -169,49 +176,13 @@ float calc_distance(uint8_t* seq_a, uint8_t* seq_b, int len_a,int len_b, int L)
         }
         return (float)dist;
 #else
-        struct bignode* hash[1024];
-        int i;
-        float dist;
-        unsigned int hv;
-        for (i = 0;i < 1024;i++){
-                hash[i] = 0;
-        }
-        /* Protein sequence  */
-        if( L > ALPHA_defDNA){
-
-                for (i = len_a-2;i--;){
-                        hv = (seq_a[i] << 5) + seq_a[i+1];
-                        hash[hv] = big_insert_hash(hash[hv],i);
-                        hv = (seq_a[i] << 5) + seq_a[i+2];
-                        hash[hv] = big_insert_hash(hash[hv],i);
-                }
-
-                dist = protein_wu_distance_calculation(hash,seq_b,len_b,len_a+len_b,58.9);
+        uint8_t dist;
+        if(len_a > len_b){
+                dist = bpm(seq_a, seq_b, len_a, len_b);
         }else{
-
-                for (i = len_a-5;i--;){
-                        hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+2]&3)<<4)  + ((seq_a[i+3]&3)<<2) + (seq_a[i+4]&3);//ABCDE
-                        hash[hv] = big_insert_hash(hash[hv],i);
-                        hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+2]&3)<<4)  + ((seq_a[i+3]&3)<<2) + (seq_a[i+5]&3);//ABCDF
-                        hash[hv] = big_insert_hash(hash[hv],i);
-                        hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+2]&3)<<4)  + ((seq_a[i+4]&3)<<2) + (seq_a[i+5]&3);//ABCEF
-                        hash[hv] = big_insert_hash(hash[hv],i);
-                        hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+3]&3)<<4)  + ((seq_a[i+4]&3)<<2) + (seq_a[i+5]&3);//ABDEF
-                        hash[hv] = big_insert_hash(hash[hv],i);
-                        hv = ((seq_a[i]&3)<<8) + ((seq_a[i+2]&3)<<6) + ((seq_a[i+3]&3)<<4) + ((seq_a[i+4]&3)<<2) + (seq_a[i+5]&3);//ACDEF
-                        hash[hv] = big_insert_hash(hash[hv],i);
-                }
-                dist = dna_distance_calculation(hash,seq_b,len_b,len_a+len_b, 61.08);
-        }
-
-
-        for (i = 1024;i--;){
-                if (hash[i]){
-                        big_remove_nodes(hash[i]);
-                        hash[i] = 0;
-                }
+                dist = bpm(seq_b, seq_a, len_b, len_a);
         }
-        return dist;
+        return (float)dist;
 #endif
 
 }



View it on GitLab: https://salsa.debian.org/med-team/kalign/-/commit/c843a487f0145b2397c27d0f92f853a2746d4b47

-- 
View it on GitLab: https://salsa.debian.org/med-team/kalign/-/commit/c843a487f0145b2397c27d0f92f853a2746d4b47
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220502/929bcdb7/attachment-0001.htm>


More information about the debian-med-commit mailing list