[med-svn] [Git][med-team/kalign][upstream] New upstream version 3.3.1

Andreas Tille (@tille) gitlab at salsa.debian.org
Fri Oct 1 12:21:41 BST 2021

Andreas Tille pushed to branch upstream at Debian Med / kalign

7ebdac5a by Andreas Tille at 2021-10-01T13:15:34+02:00
New upstream version 3.3.1
- - - - -

11 changed files:

- ChangeLog
- configure.ac
- dev/run_io_test.sh
- src/alignment_parameters.c
- src/alignment_parameters.h
- src/aln_run.c
- src/run_kalign.c
- src/rwalign.c
- src/weave_alignment.c
- src/weave_alignment.h


@@ -1,3 +1,25 @@
+2021-04-16  Timo Lassmann  <timo.lassmann at telethonkids.org.au>
+	* version 3.3.1 - Bug Fix
+	The previous version of kalign checked only the first 50 sequences of the input to
+	determine whether the sequences are aligned. If the first 50 sequences are not aligned
+	but later sequences contain gaps (or other characters!), kalign can crash. In this
+	version (3.3.1) kalign checks all sequences, thereby avoiding this issue.
+	To alert users to the situation described above and to warn users about the presence of
+	odd characters, kalign now produces a warning message like this:
+	[Date Time] :     LOG : Start io tests.
+	[Date Time] :     LOG : reading: dev/data/a2m.good.1
+	[Date Time] :     LOG : Detected protein sequences.
+	[Date Time] : WARNING : -------------------------------------------- (rwalign.c line 505)
+	[Date Time] : WARNING : The input sequences contain gap characters:  (rwalign.c line 506)
+	[Date Time] : WARNING : "-" :   36 found                             (rwalign.c line 510)
+	[Date Time] : WARNING : BUT the sequences do not seem to be aligned! (rwalign.c line 514)
+	[Date Time] : WARNING :                                              (rwalign.c line 515)
+	[Date Time] : WARNING : Kalign will remove the gap characters and    (rwalign.c line 516)
+	[Date Time] : WARNING : align the sequences.                         (rwalign.c line 517)
+	[Date Time] : WARNING : -------------------------------------------- (rwalign.c line 518)
 2020-11-06  Timo Lassmann  <timo.lassmann at telethonkids.org.au>
 	* version 3.3 - Threading and more

@@ -1,48 +1,106 @@
-  Kalign version 2.03, Copyright (C) 2006 Timo Lassmann
-  http://msa.cgb.ki.se/
-  timolassmann at gmail.com
-	This program is free software; you can redistribute it and/or modify
-	it under the terms of the GNU General Public License as published by
-	the Free Software Foundation; either version 2 of the License, or
-	any later version.
-	This program is distributed in the hope that it will be useful,
-	but WITHOUT ANY WARRANTY; without even the implied warranty of
-	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-	GNU General Public License for more details.
-	You should have received a copy of the GNU General Public License
-	along with this program; if not, write to the Free Software
-	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-	A copy of this license is in the COPYING file.
+    Kalign - a multiple sequence alignment program
+    Copyright 2006, 2019, 2020, 2021 Timo Lassmann
+    This file is part of kalign.
+    Kalign is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
-% ./configure 
-% make
+Kalign is a fast multiple sequence alignment program for biological sequences.
+1) Installation
+1.1) Release Tarball
+Download tarball from [releases](https://github.com/TimoLassmann/kalign/releases). Then:
+tar -zxvf kalign-<version>.tar.gz
+cd kalign-<version>
+make check
+make install
+1.2) Homebrew
+brew install brewsci/bio/kalign
+1.3) Developer version
+git clone https://github.com/TimoLassmann/kalign.git
+cd kalign
+make check
+make install
+1.4) on macOS, install [brew](https://brew.sh/) then:
+brew install libtool
+brew install automake
+git clone https://github.com/TimoLassmann/kalign.git
+cd kalign
+make check
+make install
+2) Usage
+Usage: kalign  -i <seq file> -o <out aln>
+   --format           : Output format. [Fasta]
+   --reformat         : Reformat existing alignment. [NA]
+   --version          : Print version and exit
+Kalign expects the input to be a set of unaligned sequences in fasta format or aligned sequences in aligned fasta, MSF or clustal format. Kalign automatically detects whether the input sequences are protein, RNA or DNA.
+Since version 3.2.0 kalign supports passing sequences in via stdin and supports aligning sequences from multiple files.
+3) Examples
+Passing sequences via stdin:
+   cat input.fa | kalign -f fasta > out.afa
+Combining multiple input files:
+   kalign seqsA.fa seqsB.fa seqsC.fa -f fasta > combined.afa
+Align sequences and output the alignment in MSF format:
+   kalign -i BB11001.tfa -f msf  -o out.msf
-and as root:
+Align sequences and output the alignment in clustal format:
-% make install
+   kalign -i BB11001.tfa -f clu -o out.clu
+Re-align sequences in an existing alignment:
+   kalign -i BB11001.msf  -o out.afa
-        kalign [Options]  infile.fasta outfile.fasta
-        or:
-        kalign [Options] -i infile.fasta -o outfile.fasta
-        or:
-        kalign [Options] < infile.fasta > outfile.fasta
+Reformat existing alignment:
-        Options:
-	type: kalign -h 
\ No newline at end of file
+   kalign -i BB11001.msf -r afa -o out.afa

@@ -1,4 +1,4 @@
-AC_INIT(kalign, 3.3)
+AC_INIT(kalign, 3.3.1)

@@ -18,5 +18,4 @@ do
         printf "with ERROR $status and Message:\n\n$error\n\n";
         exit 1;

@@ -30,7 +30,7 @@ int set_param_number(struct aln_param* ap,int L, int sel);
 int new_aln_matrices(struct aln_param* ap);
-int init_ap(struct aln_param** aln_param, struct parameters* param, int numseq,int L)
+int init_ap(struct aln_param** aln_param, struct parameters* param,int L)
         struct aln_param* ap = NULL;
         int i,j;

@@ -55,6 +55,6 @@ struct aln_param{
-extern int init_ap(struct aln_param** aln_param, struct parameters* param, int numseq,int L);
+extern int init_ap(struct aln_param** aln_param, struct parameters* param,int L);
 extern void free_ap(struct aln_param* ap);

@@ -868,6 +868,7 @@ int do_align_serial(struct msa* msa,struct aln_tasks* t,struct aln_mem* m, int t
         t->profile[c] = tmp;
         msa->plen[c] = m->path[0];

@@ -1,7 +1,7 @@
     Kalign - a multiple sequence alignment program
-    Copyright 2006, 2019, 2020 Timo Lassmann
+    Copyright 2006, 2019, 2020, 2021 Timo Lassmann
     This file is part of kalign.
@@ -132,7 +132,7 @@ int print_kalign_header(void)
         fprintf(stdout,"Kalign (%s)\n", PACKAGE_VERSION);
-        fprintf(stdout,"Copyright (C) 2006,2019,2020 Timo Lassmann\n");
+        fprintf(stdout,"Copyright (C) 2006,2019,2020,2021 Timo Lassmann\n");
         fprintf(stdout,"This program comes with ABSOLUTELY NO WARRANTY; for details type:\n");
         fprintf(stdout,"`kalign -showw'.\n");
@@ -520,7 +520,7 @@ int run_kalign(struct parameters* param)
         /* allocate aln parameters  */
-        RUN(init_ap(&ap,param,msa->numseq,msa->L ));
+        RUN(init_ap(&ap,param,msa->L ));
                 double* s;
@@ -555,7 +555,7 @@ int run_kalign(struct parameters* param)
                 RUN(convert_msa_to_internal(msa, ALPHA_ambigiousPROTEIN));
         /* allocate aln parameters  */
-        RUN(init_ap(&ap,param,msa->numseq,msa->L ));
+        RUN(init_ap(&ap,param,msa->L ));
         /* Start alignment stuff */

@@ -65,29 +65,29 @@ struct out_line{
+static int aln_unknown_warning_message(struct msa* msa);
+static int read_fasta(struct in_buffer* b, struct msa** msa);
+static int read_msf(struct in_buffer* b, struct msa** msa);
+static int read_clu(struct in_buffer* b, struct msa** msa);
-int read_fasta(struct in_buffer* b, struct msa** msa);
-int read_msf(struct in_buffer* b, struct msa** msa);
-int read_clu(struct in_buffer* b, struct msa** msa);
-int write_msa_fasta(struct msa* msa,char* outfile);
-int write_msa_clustal(struct msa* msa,char* outfile);
-int write_msa_msf(struct msa* msa,char* outfile);
+static int write_msa_fasta(struct msa* msa,char* outfile);
+static int write_msa_clustal(struct msa* msa,char* outfile);
+static int write_msa_msf(struct msa* msa,char* outfile);
 /* memory functions  */
-struct msa* alloc_msa(void);
-int resize_msa(struct msa* msa);
+static struct msa* alloc_msa(void);
+static int resize_msa(struct msa* msa);
-struct msa_seq* alloc_msa_seq(void);
-int resize_msa_seq(struct msa_seq* seq);
-void free_msa_seq(struct msa_seq* seq);
+static struct msa_seq* alloc_msa_seq(void);
+static int resize_msa_seq(struct msa_seq* seq);
+static void free_msa_seq(struct msa_seq* seq);
-struct line_buffer* alloc_line_buffer(int max_line_len);
-int resize_line_buffer(struct line_buffer* lb);
-void free_line_buffer(struct line_buffer* lb);
+static struct line_buffer* alloc_line_buffer(int max_line_len);
+static int resize_line_buffer(struct line_buffer* lb);
+static void free_line_buffer(struct line_buffer* lb);
 static int read_file_stdin(struct in_buffer** buffer,char* infile);
 static int alloc_in_buffer(struct in_buffer** buffer, int n);
@@ -106,8 +106,6 @@ static int GCGMultchecksum(struct msa* msa);
 /* Taken from squid library by Sean Eddy  */
 static int GCGchecksum(char *seq, int len);
 static int sort_by_name(const void *a, const void *b);
 static int sort_by_chksum(const void *a, const void *b);
@@ -248,7 +246,6 @@ int read_input(char* infile,struct msa** msa)
-        //LOG_MSG("Done reading input sequences in %f seconds.", GET_TIMING(timer));
         *msa = m;
         return OK;
@@ -465,7 +462,8 @@ int detect_aligned(struct msa* msa)
         min_len = INT32_MAX;
         max_len = 0;
         gaps = 0;
-        n = MACRO_MIN(50, msa->numseq);
+        /* n = MACRO_MIN(50, msa->numseq); */
+        n = msa->numseq;
         for(i = 0; i < n;i++){
                 l = 0;
                 for (j = 0; j <= msa->sequences[i]->len;j++){
@@ -480,12 +478,17 @@ int detect_aligned(struct msa* msa)
                 if(min_len == max_len){ /* sequences have gaps and total length is identical - clearly aligned  */
                         msa->aligned = ALN_STATUS_ALIGNED;
                 }else{          /* odd there are gaps but total length differs - unknown status  */
+                        aln_unknown_warning_message(msa);
                         msa->aligned = ALN_STATUS_UNKNOWN;
                 if(min_len == max_len){ /* no gaps and sequences have same length. Can' tell if they are aligned  */
+                        aln_unknown_warning_message(msa);
                         msa->aligned = ALN_STATUS_UNKNOWN;
                 }else{          /* No gaps and sequences have different lengths - unaligned */
                         msa->aligned = ALN_STATUS_UNALIGNED;
@@ -493,15 +496,36 @@ int detect_aligned(struct msa* msa)
         return OK;
+static int aln_unknown_warning_message(struct msa* msa)
+        int i;
+        WARNING_MSG("--------------------------------------------");
+        WARNING_MSG("The input sequences contain gap characters: ");
+        for(i = 0; i < 128;i++){
+                if(msa->letter_freq[i] && ispunct(i)){
+                         WARNING_MSG("\"%c\" : %4d found                            ", (char)i,msa->letter_freq[i] );
+                }
+        }
+        WARNING_MSG("BUT the sequences do not seem to be aligned!");
+        WARNING_MSG("                                            ");
+        WARNING_MSG("Kalign will remove the gap characters and   ");
+        WARNING_MSG("align the sequences.                        ");
+        WARNING_MSG("--------------------------------------------");
+        return OK;
 /* Checks if sequence names are duplicated */
 /* Checks if sequences are duplicated */
 int run_extra_checks_on_msa(struct msa* msa)
         char* tmp_name = NULL;
-        char* tmp_ptr;
+        /* char* tmp_ptr; */
         struct sort_struct_name_chksum** a = NULL;
         int i;
-        int j;
+        /* int j; */
         int c;
         int l;
@@ -1174,50 +1198,30 @@ int read_clu(struct in_buffer* b , struct msa** m)
         struct msa* msa = NULL;
         struct msa_seq* seq_ptr = NULL;
-        //FILE* f_ptr = NULL;
         char* line = NULL;
-        //size_t b_len = 0;
-        //ssize_t nread;
         int i,j;
         char* p;
         int active_seq = 0;
         int line_len;
         int nl,ni;
-        /* sanity checks  */
-        //if(!my_file_exists(infile)){
-        //ERROR_MSG("File: %s does not exist.",infile);
-        //}
         if(msa == NULL){
                 msa = alloc_msa();
-        //RUNP(f_ptr = fopen(infile, "r"));
-        //LOG_MSG("GAGA");
-        /* scan through first line header  */
-        //while(fgets(line, BUFFER_LEN, f_ptr)){
-        //while ((nread = getline(&line, &b_len, f_ptr)) != -1){
-                //fprintf(stdout,"LINE: %s", line);
-                //line_len = strnlen(line, BUFFER_LEN);
         ni =0;
         for(nl = 0; nl < b->n_lines;nl++){
                 line = b->l[nl]->line;
                 line_len = b->l[nl]->len;
-                //line_len = nread;
-                //line[line_len-1] = 0;
-                /* line_len--; */
         active_seq =0;
         for(nl = ni; nl < b->n_lines;nl++){
                 line = b->l[nl]->line;
                 line_len = b->l[nl]->len;
-                //while ((nread = getline(&line, &b_len, f_ptr)) != -1){
-                //while(fgets(line, BUFFER_LEN, f_ptr)){
-                //line_len = strnlen(line, BUFFER_LEN);
-                //line_len = nread;
-                //line[line_len-1] = 0;
-                /* line_len--;     /\* last character is newline  *\/ */
                         active_seq = 0;
@@ -1226,9 +1230,6 @@ int read_clu(struct in_buffer* b , struct msa** m)
                                 seq_ptr = msa->sequences[active_seq];
-                                //p = strstr(line,seq_ptr->name);
-                                //if(p){
-                                //LOG_MSG("Found bitsof seq %s", seq_ptr->name);
                                 p = line;
                                 j = 0;
@@ -1254,11 +1255,8 @@ int read_clu(struct in_buffer* b , struct msa** m)
                                 msa->numseq = MACRO_MAX(msa->numseq, active_seq);
-                //fprintf(stdout,"%d \"%s\"\n",line_len,line);
@@ -1372,35 +1370,19 @@ int read_fasta( struct in_buffer* b,struct msa** m)
         struct msa* msa = NULL;
         struct msa_seq* seq_ptr = NULL;
-        //FILE* f_ptr = NULL;
-        char* line = NULL;
-        //size_t b_len = 0;
-        //ssize_t nread;
-        //char line[BUFFER_LEN];
+        char* line = NULL;
         int line_len;
         int i;
         int nl;
-        /* sanity checks  */
-        //if(!my_file_exists(infile)){
-        //ERROR_MSG("File: %s does not exist.",infile);
-        //}
         if(msa == NULL){
                 msa = alloc_msa();
         for(nl = 0; nl < b->n_lines;nl++){
                 line = b->l[nl]->line;
                 line_len = b->l[nl]->len;
-                //RUNP(f_ptr = fopen(infile, "r"));
-                //while ((nread = getline(&line, &b_len, f_ptr)) != -1){
-                //while(fgets(line, BUFFER_LEN, f_ptr)){
-                //line_len = nread;
-                //fprintf(stdout,"%d %s\n",line_len,line);
                 if(line[0] == '>'){
                         /* alloc seq if buffer is full */
                         if(msa->alloc_numseq == msa->numseq){
@@ -1424,12 +1406,11 @@ int read_fasta( struct in_buffer* b,struct msa** m)
                                                 ERROR_MSG("Encountered a sequence before encountering it's name");
+                                        seq_ptr->seq[seq_ptr->len] = line[i];
+                                        seq_ptr->len++;
                                         if(seq_ptr->alloc_len == seq_ptr->len){
-                                        seq_ptr->seq[seq_ptr->len] = line[i];
-                                        seq_ptr->len++;
                                 }else if(ispunct((int)line[i])){
@@ -1440,17 +1421,10 @@ int read_fasta( struct in_buffer* b,struct msa** m)
         *m = msa;
-        //fclose(f_ptr);
-        //MFREE(line);
         return OK;
-        //if(line){
-        //MFREE(line);
-        //}
-        //if(f_ptr){
-        //fclose(f_ptr);
-        //}
         return FAIL;

@@ -28,32 +28,6 @@
 //int update_gaps(int old_len,int*gis,int new_len,int *newgaps);
 int update_gaps(int old_len,int*gis,int *newgaps);
-int weave(struct msa* msa,struct aln_tasks*t)
-        int i;
-        int a,b,c;
-        //RUN(clean_aln(aln)
-        for(i = 0; i < t->n_tasks;i++){
-                a = t->list[i]->a;
-                b = t->list[i]->b;
-                c = t->list[i]->c;
-                /* fprintf(stdout,"%3d %3d -> %3d (p: %d)\n", t->list[i]->a, t->list[i]->b, t->list[i]->c, t->list[i]->p); */
-                /* RUN(make_seq(msa,a,b,t->map[c])); */
-        }
-        /*for (i = 0; i < (msa->numseq-1)*3;i +=3){
-                a = tree[i];
-                b = tree[i+1];
-                RUN(make_seq(msa,a,b,map[tree[i+2]]));
-                }*/
-        return OK;
-        return FAIL;
 int clean_aln(struct msa* msa)
         int i,j;

@@ -29,7 +29,7 @@
 //extern int weave(struct msa* msa, int** map, int* tree);
-extern int weave(struct msa* msa,struct aln_tasks*t);
+/* extern int weave(struct aln_tasks* t); */
 extern int make_seq(struct msa* msa,int a,int b,int* path);
 extern int clean_aln(struct msa* msa);

View it on GitLab: https://salsa.debian.org/med-team/kalign/-/commit/7ebdac5ad2b6823ab8972de5e525d333aa9b4d51

View it on GitLab: https://salsa.debian.org/med-team/kalign/-/commit/7ebdac5ad2b6823ab8972de5e525d333aa9b4d51
You're receiving this email because of your account on salsa.debian.org.

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20211001/50ff751d/attachment-0001.htm>

More information about the debian-med-commit mailing list