[med-svn] [Git][med-team/kalign][master] 5 commits: routine-update: New upstream version
Andreas Tille (@tille)
gitlab at salsa.debian.org
Mon May 2 14:40:45 BST 2022
Andreas Tille pushed to branch master at Debian Med / kalign
Commits:
7e4f3dcd by Andreas Tille at 2022-05-02T13:48:07+02:00
routine-update: New upstream version
- - - - -
c843a487 by Andreas Tille at 2022-05-02T13:48:08+02:00
New upstream version 3.3.2
- - - - -
8081f4ce by Andreas Tille at 2022-05-02T13:48:10+02:00
Update upstream source from tag 'upstream/3.3.2'
Update to upstream version '3.3.2'
with Debian dir cd58cd03c2121aafc23f755de7cc61e8eeec4761
- - - - -
ebd5cc16 by Andreas Tille at 2022-05-02T15:05:12+02:00
Refresh SIMDE patch
- - - - -
363527dd by Andreas Tille at 2022-05-02T15:06:53+02:00
Upload to unstable
- - - - -
16 changed files:
- ChangeLog
- configure.ac
- debian/changelog
- debian/patches/simde
- scripts/benchmark.org
- src/alignment_parameters.c
- src/aln_controller.c
- src/aln_seqprofile.c
- src/aln_seqseq.c
- src/aln_task.c
- src/bisectingKmeans.c
- src/euclidean_dist.c
- src/misc.c
- src/run_kalign.c
- src/rwalign.c
- src/sequence_distance.c
Changes:
=====================================
ChangeLog
=====================================
@@ -1,3 +1,13 @@
+2022-03-21 Timo Lassmann <timo.lassmann at telethonkids.org.au>
+
+ * version 3.3.2 - Bug Fix
+ There was a bug in building a guide tree from highly similar sequences. The fix
+ was involved distributing identical sequences equally among branches. This only happened
+ when there were thousands of identical sequences.
+
+ In addition Kalign now compiles on Apple's M1 chip and possibly on other ARM architectures
+ as well (although I did not test the latter).
+
2021-04-16 Timo Lassmann <timo.lassmann at telethonkids.org.au>
* version 3.3.1 - Bug Fix
=====================================
configure.ac
=====================================
@@ -1,4 +1,4 @@
-AC_INIT(kalign, 3.3.1)
+AC_INIT(kalign, 3.3.2)
#AC_CONFIG_AUX_DIR([.])
=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+kalign (1:3.3.2-1) unstable; urgency=medium
+
+ * New upstream version
+ * Refresh SIMDE patch
+
+ -- Andreas Tille <tille at debian.org> Mon, 02 May 2022 14:39:41 +0200
+
kalign (1:3.3.1-1) unstable; urgency=medium
* Fix watchfile to detect new versions on github
=====================================
debian/patches/simde
=====================================
@@ -1,8 +1,8 @@
Author: Michael R. Crusoe <michael.crusoe at gmail.com>
Description: leverage the SIMD Everywhere library
Forwarded: https://github.com/TimoLassmann/kalign/pull/20
---- kalign.orig/src/alignment.c
-+++ kalign/src/alignment.c
+--- a/src/alignment.c
++++ b/src/alignment.c
@@ -20,7 +20,8 @@
*/
@@ -13,14 +13,15 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
#include "alignment.h"
---- kalign.orig/src/bisectingKmeans.c
-+++ kalign/src/bisectingKmeans.c
-@@ -23,12 +23,16 @@
+--- a/src/bisectingKmeans.c
++++ b/src/bisectingKmeans.c
+@@ -24,13 +24,16 @@
#include <omp.h>
#endif
-#ifdef HAVE_AVX2
-#include <xmmintrin.h>
+-#include <mm_malloc.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse.h>
+#if !defined(_SSE_NATIVE)
@@ -31,12 +32,12 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
+ #include <mm_malloc.h>
#endif
--#include <mm_malloc.h>
+-
-
#include "tlrng.h"
#include "msa.h"
-@@ -472,13 +476,8 @@
+@@ -506,13 +509,8 @@ int split(float** dm,int* samples, int n
score = 0.0f;
for(i = 0; i < num_samples;i++){
s = samples[i];
@@ -50,8 +51,8 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
score += MACRO_MIN(dl,dr);
if(dr < dl){
---- kalign.orig/src/bpm.c
-+++ kalign/src/bpm.c
+--- a/src/bpm.c
++++ b/src/bpm.c
@@ -25,10 +25,8 @@
#include "tlrng.h"
@@ -65,7 +66,7 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
__m256i BROADCAST_MASK[16];
-@@ -37,7 +35,6 @@
+@@ -37,7 +35,6 @@ __m256i bitShiftRight256ymm (__m256i *da
/* taken from Alexander Yee: http://www.numberworld.org/y-cruncher/internals/addition.html#ks_add */
__m256i add256(uint32_t carry, __m256i A, __m256i B);
@@ -73,7 +74,7 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
/* Below are test functions */
#ifdef BPM_UTEST
-@@ -51,11 +48,9 @@
+@@ -51,11 +48,9 @@ uint8_t dyn_256(const uint8_t* t,const u
uint8_t dyn_256_print(const uint8_t* t,const uint8_t* p,int n,int m);
int mutate_seq(uint8_t* s, int len,int k,int L, struct rng_state* rng);
@@ -85,7 +86,7 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
/* The actual test. */
int bpm_test(void);
-@@ -64,9 +59,7 @@
+@@ -64,9 +59,7 @@ int main(void)
{
/* Important set_broadcast_mask has to be called before using bpm_256!!! */
@@ -95,7 +96,7 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
RUN(bpm_test());
return EXIT_SUCCESS;
ERROR:
-@@ -149,11 +142,7 @@
+@@ -149,11 +142,7 @@ int bpm_test(void)
for (j =0 ; j < test_iter; j++){
RUN(mutate_seq(b,len,i,alphabet->L,rng));
dyn_score = dyn_256(a,b,len,len);
@@ -107,7 +108,7 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
if( abs( dyn_score - bpm_score) != 0){
fprintf(stdout,"Scores differ: %d (dyn) %d (bpm) (%d out of %d)\n", dyn_score,bpm_score, calc_errors , total_calc);
calc_errors++;
-@@ -200,13 +189,9 @@
+@@ -200,13 +189,9 @@ int bpm_test(void)
for(i = 0; i < 100;i+=10){
RUN(mutate_seq(b,len,i,alphabet->L,rng));
@@ -121,7 +122,7 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
//ASSERT(dyn_score == bpm_score, "Scores differ: %d %d.",dyn_score, bpm_score);
-@@ -373,7 +358,6 @@
+@@ -373,7 +358,6 @@ ERROR:
}
@@ -129,7 +130,7 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
void print_256(__m256i X)
{
alignas(32) uint64_t debug[4];
-@@ -394,7 +378,6 @@
+@@ -394,7 +378,6 @@ void print_256_all(__m256i X)
}
#endif
@@ -137,7 +138,7 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
-@@ -462,7 +445,6 @@
+@@ -462,7 +445,6 @@ uint8_t bpm(const uint8_t* t,const uint8
}
@@ -145,33 +146,33 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
uint8_t bpm_256(const uint8_t* t,const uint8_t* p,int n,int m)
{
__m256i VP,VN,D0,HN,HP,X,NOTONE;
-@@ -658,4 +640,3 @@
+@@ -658,4 +640,3 @@ __m256i bitShiftRight256ymm (__m256i *da
carryOut = _mm256_xor_si256 (innerCarry, rotate); //FIXME: not sure if this is correct!!!
return carryOut;
}
-#endif
---- kalign.orig/src/euclidean_dist.c
-+++ kalign/src/euclidean_dist.c
-@@ -22,22 +22,24 @@
+--- a/src/euclidean_dist.c
++++ b/src/euclidean_dist.c
+@@ -22,23 +22,24 @@
#include "euclidean_dist.h"
#include "tlrng.h"
-#ifdef HAVE_AVX2
-#include <xmmintrin.h>
-#include <immintrin.h>
--#endif
+-#include <mm_malloc.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/avx.h>
-
--#include <mm_malloc.h>
+#if !defined(_SSE_NATIVE)
+ #include <stdlib.h>
+ #define _mm_malloc(size, align) aligned_alloc(align, size)
+ #define _mm_free free
+#else
+ #include <mm_malloc.h>
-+#endif
+ #endif
+
+-
#include "float.h"
#include "esl_stopwatch.h"
@@ -184,7 +185,7 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
#ifdef ITEST_EDIST
int main(void)
-@@ -69,7 +71,6 @@
+@@ -74,7 +75,6 @@ int main(void)
}
}
LOG_MSG("Check for correctness.");
@@ -192,7 +193,7 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
for(i = 0; i < 100;i++){
for(j = 0; j <= i;j++){
edist_serial(mat[i], mat[j], num_element, &d1);
-@@ -80,7 +81,6 @@
+@@ -85,7 +85,6 @@ int main(void)
}
}
}
@@ -200,7 +201,7 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
DECLARE_TIMER(t);
LOG_MSG("Timing serial");
-@@ -98,7 +98,6 @@
+@@ -101,7 +100,6 @@ int main(void)
GET_TIMING(t);
//LOG_MSG("%f\tsec.",GET_TIMING(t));
@@ -208,15 +209,15 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
LOG_MSG("Timing AVX");
START_TIMER(t);
for(c = 0; c < max_iter; c++){
-@@ -114,7 +113,6 @@
+@@ -117,7 +115,6 @@ int main(void)
GET_TIMING(t);
//LOG_MSG("%f\tsec.",GET_TIMING(t));
-#endif
for(i = 0; i < 100;i++){
+ #ifdef HAVE_AVX2
_mm_free(mat[i]);
- }
-@@ -162,7 +160,6 @@
+@@ -169,7 +166,6 @@ int edist_serial_d(const double* a,const
return OK;
}
@@ -224,14 +225,14 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
int edist_256(const float* a,const float* b, const int len, float* ret)
{
-@@ -211,4 +208,3 @@
+@@ -218,4 +214,3 @@ float hsum_ps_sse3(__m128 v)
return _mm_cvtss_f32(sums);
}
-#endif
---- kalign.orig/src/misc.c
-+++ kalign/src/misc.c
-@@ -23,14 +23,12 @@
+--- a/src/misc.c
++++ b/src/misc.c
+@@ -23,9 +23,8 @@
#include "tldevel.h"
#include "tlrng.h"
@@ -243,35 +244,27 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
#include "misc.h"
#include <stdalign.h>
- #include <string.h>
--#include <immintrin.h>
- #ifdef ITEST_MISC
-
-
---- kalign.orig/src/sequence_distance.c
-+++ kalign/src/sequence_distance.c
-@@ -21,11 +21,16 @@
-
+--- a/src/sequence_distance.c
++++ b/src/sequence_distance.c
+@@ -22,9 +22,14 @@
*/
+ #include "tldevel.h"
-#ifdef HAVE_AVX2
-#include <xmmintrin.h>
--#endif
+-#include <mm_malloc.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse.h>
-
--#include <mm_malloc.h>
+#if !defined(_SSE_NATIVE)
+ #include <stdlib.h>
+ #define _mm_malloc(size, align) aligned_alloc(align, size)
+ #define _mm_free free
+#else
+ #include <mm_malloc.h>
-+#endif
- #include "sequence_distance.h"
+ #endif
- #include "alphabet.h"
-@@ -64,9 +69,7 @@
+
+@@ -66,9 +71,7 @@ float** d_estimation(struct msa* msa, in
int len_b;
int i,j;
@@ -281,63 +274,3 @@ Forwarded: https://github.com/TimoLassmann/kalign/pull/20
if(pair){
-@@ -160,7 +163,6 @@
-
- float calc_distance(uint8_t* seq_a, uint8_t* seq_b, int len_a,int len_b, int L)
- {
--#ifdef HAVE_AVX2
- uint8_t dist;
- if(len_a > len_b){
- dist = bpm_256(seq_a, seq_b, len_a, len_b);
-@@ -168,51 +170,6 @@
- dist = bpm_256(seq_b, seq_a, len_b, len_a);
- }
- return (float)dist;
--#else
-- struct bignode* hash[1024];
-- int i;
-- float dist;
-- unsigned int hv;
-- for (i = 0;i < 1024;i++){
-- hash[i] = 0;
-- }
-- /* Protein sequence */
-- if( L > ALPHA_defDNA){
--
-- for (i = len_a-2;i--;){
-- hv = (seq_a[i] << 5) + seq_a[i+1];
-- hash[hv] = big_insert_hash(hash[hv],i);
-- hv = (seq_a[i] << 5) + seq_a[i+2];
-- hash[hv] = big_insert_hash(hash[hv],i);
-- }
--
-- dist = protein_wu_distance_calculation(hash,seq_b,len_b,len_a+len_b,58.9);
-- }else{
--
-- for (i = len_a-5;i--;){
-- hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+2]&3)<<4) + ((seq_a[i+3]&3)<<2) + (seq_a[i+4]&3);//ABCDE
-- hash[hv] = big_insert_hash(hash[hv],i);
-- hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+2]&3)<<4) + ((seq_a[i+3]&3)<<2) + (seq_a[i+5]&3);//ABCDF
-- hash[hv] = big_insert_hash(hash[hv],i);
-- hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+2]&3)<<4) + ((seq_a[i+4]&3)<<2) + (seq_a[i+5]&3);//ABCEF
-- hash[hv] = big_insert_hash(hash[hv],i);
-- hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+3]&3)<<4) + ((seq_a[i+4]&3)<<2) + (seq_a[i+5]&3);//ABDEF
-- hash[hv] = big_insert_hash(hash[hv],i);
-- hv = ((seq_a[i]&3)<<8) + ((seq_a[i+2]&3)<<6) + ((seq_a[i+3]&3)<<4) + ((seq_a[i+4]&3)<<2) + (seq_a[i+5]&3);//ACDEF
-- hash[hv] = big_insert_hash(hash[hv],i);
-- }
-- dist = dna_distance_calculation(hash,seq_b,len_b,len_a+len_b, 61.08);
-- }
--
--
-- for (i = 1024;i--;){
-- if (hash[i]){
-- big_remove_nodes(hash[i]);
-- hash[i] = 0;
-- }
-- }
-- return dist;
--#endif
-
- }
-
=====================================
scripts/benchmark.org
=====================================
@@ -552,9 +552,6 @@
#+END_SRC
-
-
-
To plot the scores:
#+NAME: Rplotheader
#+BEGIN_SRC R :session one :results none :export none :noweb yes
=====================================
src/alignment_parameters.c
=====================================
@@ -26,6 +26,7 @@
int set_subm_gaps(struct aln_param* ap);
int set_subm_gaps_DNA(struct aln_param* ap);
+int set_subm_gaps_blast(struct aln_param* ap);
int set_param_number(struct aln_param* ap,int L, int sel);
int new_aln_matrices(struct aln_param* ap);
@@ -54,6 +55,7 @@ int init_ap(struct aln_param** aln_param, struct parameters* param,int L)
}
if(L == ALPHA_defDNA){
RUN(set_subm_gaps_DNA(ap));
+ //set_subm_gaps_blast(ap);
}else if(L == ALPHA_defPROTEIN){
RUN(set_subm_gaps(ap));
@@ -104,6 +106,22 @@ void free_ap(struct aln_param* ap)
}
}
+int set_subm_gaps_blast(struct aln_param* ap)
+{
+ int i,j;
+ for(i = 0; i < 5; i++){
+ for(j =0; j < 5;j++){
+ ap->subm[i][j] = -3;
+ if(i == j){
+ ap->subm[i][j] = 4;
+ }
+ }
+ }
+ ap->gpo = 5;
+ ap->gpe = 5;
+ ap->tgpe = 0;
+ return OK;
+}
/* These are old parameters from kalign 2 */
int set_subm_gaps_DNA(struct aln_param* ap)
{
@@ -113,7 +131,7 @@ int set_subm_gaps_DNA(struct aln_param* ap)
ap->subm[i][j] = 283;
}
}
- // A 91 -114 -31 -123 0 -43
+// A 91 -114 -31 -123 0 -43
ap->subm[0][0] += 91;
ap->subm[0][1] += -114;
ap->subm[0][2] += -31;
=====================================
src/aln_controller.c
=====================================
@@ -208,8 +208,6 @@ int aln_runner_serial(struct aln_mem* m)
return OK;
}
-
-
int aln_continue(struct aln_mem* m,float input_states[],int old_cor[],int meet,int transition)
{
//fprintf(stderr,"Transition:%d at:%d\n",transition,c);
@@ -433,8 +431,7 @@ int aln_continue(struct aln_mem* m,float input_states[],int old_cor[],int meet,i
return OK;
}
-
-int aln_continue_serial (struct aln_mem* m,float input_states[],int old_cor[],int meet,int transition)
+int aln_continue_serial(struct aln_mem* m,float input_states[],int old_cor[],int meet,int transition)
{
//fprintf(stderr,"Transition:%d at:%d\n",transition,c);
//LOG_MSG("MAX: %f",max);
=====================================
src/aln_seqprofile.c
=====================================
@@ -7,12 +7,9 @@
#include "aln_struct.h"
#define ALN_SEQPROF_IMPORT
#include "aln_seqprofile.h"
-
-
#define MAX(a, b) (a > b ? a : b)
#define MAX3(a,b,c) MAX(MAX(a,b),c)
-
int aln_seqprofile_foward(struct aln_mem* m)
{
struct states* s = m->f;
@@ -52,10 +49,8 @@ int aln_seqprofile_foward(struct aln_mem* m)
s[j].ga = MAX(s[j-1].ga,s[j-1].a) - text;
s[j].gb = -FLT_MAX;
}
-
}
-
s[m->endb].a = -FLT_MAX;
s[m->endb].ga = -FLT_MAX;
s[m->endb].gb = -FLT_MAX;
@@ -73,7 +68,6 @@ int aln_seqprofile_foward(struct aln_mem* m)
xa = s[m->startb].a;
xga = s[m->startb].ga;
-
if(m->startb){
s[m->startb].gb = MAX(pgb+prof1[28],pa+prof1[27]);
}else{
@@ -86,7 +80,6 @@ int aln_seqprofile_foward(struct aln_mem* m)
pa += prof1[32 + seq2[j]];
-
s[j].a = pa;
pga = s[j].ga;
@@ -94,7 +87,6 @@ int aln_seqprofile_foward(struct aln_mem* m)
//s[j].ga = MAX(s[j-1].ga-ext,s[j-1].a-open);
s[j].ga = MAX(xga-ext,xa-open);
-
pgb = s[j].gb;
s[j].gb = MAX(pgb+prof1[28],ca+prof1[27]);
@@ -110,7 +102,6 @@ int aln_seqprofile_foward(struct aln_mem* m)
pa += prof1[32 + seq2[j]];
-
s[j].a = pa;
s[j].ga = -FLT_MAX;//MAX(s[j-1].ga-ext,s[j-1].a-open);
@@ -120,7 +111,6 @@ int aln_seqprofile_foward(struct aln_mem* m)
}else{
s[j].gb = MAX(s[j].gb,ca)+ prof1[29];
}
-
}
//prof1 -= m->enda << 6;
return OK;
@@ -147,7 +137,6 @@ int aln_seqprofile_backward(struct aln_mem* m)
const float ext = m->ap->gpe * (float)sip;
const float text = m->ap->tgpe * (float)sip;
-
prof1 += (m->enda_2 +1) << 6;
s[m->endb].a = s[0].a;
@@ -184,7 +173,6 @@ int aln_seqprofile_backward(struct aln_mem* m)
xa = s[m->endb].a;
xga = s[m->endb].ga;
-
if(m->endb != m->len_b){
s[m->endb].gb = MAX(pgb+prof1[28],pa+prof1[27]);
}else{
@@ -211,8 +199,6 @@ int aln_seqprofile_backward(struct aln_mem* m)
pa = ca;
xa = s[j].a;
xga = s[j].ga;
-
-
}
ca = s[j].a;
@@ -221,14 +207,12 @@ int aln_seqprofile_backward(struct aln_mem* m)
s[j].a = pa;
-
s[j].ga = -FLT_MAX;//MAX(s[j+1].ga-ext,s[j+1].a-open);
if(m->startb){
s[j].gb = MAX(s[j].gb+prof1[28], ca+prof1[27]);
}else{
s[j].gb = MAX(s[j].gb,ca)+prof1[29];
}
-
}
return OK;
}
=====================================
src/aln_seqseq.c
=====================================
@@ -72,6 +72,7 @@ int aln_seqseq_foward(struct aln_mem* m)
}else{
s[startb].gb = MAX(pgb,pa) - tgpe;
}
+
for (j = startb+1; j < endb;j++){
ca = s[j].a;
pa = MAX3(pa,pga-gpo,pgb-gpo);
@@ -90,8 +91,8 @@ int aln_seqseq_foward(struct aln_mem* m)
xa = s[j].a;
xga = s[j].ga;
-
}
+
ca = s[j].a;
pa = MAX3(pa,pga-gpo,pgb-gpo);
pa += subp[seq2[j]];
@@ -104,7 +105,6 @@ int aln_seqseq_foward(struct aln_mem* m)
}else{
s[j].gb = MAX(s[j].gb,ca)-tgpe;
}
-
}
return OK;
}
@@ -137,14 +137,11 @@ int aln_seqseq_backward(struct aln_mem* m)
const float tgpe = m->ap->tgpe;
float** subm = m->ap->subm;
-
s[endb].a = s[0].a ;
s[endb].ga = s[0].ga;
s[endb].gb = s[0].gb;
-
//init of first row;
-
//j = endb-startb;
if(endb != m->len_b){
for(j = endb-1;j > startb;j--){
@@ -160,7 +157,6 @@ int aln_seqseq_backward(struct aln_mem* m)
}
}
-
s[startb].a = -FLT_MAX;
s[startb].ga = -FLT_MAX;
s[startb].gb = -FLT_MAX;
=====================================
src/aln_task.c
=====================================
@@ -7,7 +7,6 @@ static int sort_tasks_by_priority(const void *a, const void *b);
static int sort_tasks_by_c(const void *a, const void *b);
-
int sort_tasks(struct aln_tasks* t , int order)
{
ASSERT(t != NULL, "No tasks");
=====================================
src/bisectingKmeans.c
=====================================
@@ -19,15 +19,17 @@
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
+#include "tldevel.h"
#ifdef HAVE_OPENMP
#include <omp.h>
#endif
#ifdef HAVE_AVX2
#include <xmmintrin.h>
+#include <mm_malloc.h>
#endif
-#include <mm_malloc.h>
+
#include "tlrng.h"
#include "msa.h"
@@ -68,8 +70,8 @@ static int label_internal(struct node*n, int label);
static void create_tasks(struct node*n, struct aln_tasks* t);
-static struct node* bisecting_kmeans_serial(struct msa* msa, struct node* n, float** dm,int* samples,int numseq, int num_anchors,int num_samples);
-static struct node* bisecting_kmeans_parallel(struct msa* msa, struct node* n, float** dm,int* samples,int numseq, int num_anchors,int num_samples);
+static int bisecting_kmeans_serial(struct msa* msa, struct node** ret_n, float** dm,int* samples, int num_samples);
+static int bisecting_kmeans_parallel(struct msa* msa, struct node** ret_n, float** dm,int* samples, int num_samples);
static int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick,struct kmeans_result** ret);
@@ -121,9 +123,9 @@ int build_tree_kmeans(struct msa* msa, struct aln_param* ap,struct aln_tasks** t
LOG_MSG("Building guide tree.");
if(ap->nthreads == 1){
- root = bisecting_kmeans_serial(msa,root, dm, samples, numseq, num_anchors, numseq);
+ RUN(bisecting_kmeans_serial(msa,&root, dm, samples, numseq));
}else{
- root = bisecting_kmeans_parallel(msa,root, dm, samples, numseq, num_anchors, numseq);
+ RUN(bisecting_kmeans_parallel(msa,&root, dm, samples, numseq));
}
STOP_TIMER(timer);
@@ -142,7 +144,11 @@ int build_tree_kmeans(struct msa* msa, struct aln_param* ap,struct aln_tasks** t
}*/
MFREE(root);
for(i =0 ; i < msa->numseq;i++){
+#ifdef HAVE_AVX2
_mm_free(dm[i]);
+#else
+ MFREE(dm[i]);
+#endif
}
MFREE(dm);
DESTROY_TIMER(timer);
@@ -151,15 +157,14 @@ ERROR:
return FAIL;
}
-
-
-
-
-struct node* bisecting_kmeans_parallel(struct msa* msa, struct node* n, float** dm,int* samples,int numseq, int num_anchors,int num_samples)
+int bisecting_kmeans_parallel(struct msa* msa, struct node** ret_n, float** dm,int* samples, int num_samples)
{
struct kmeans_result* res_tmp = NULL;
struct kmeans_result* best = NULL;
struct kmeans_result** res_ptr = NULL;
+ struct node* n = NULL;
+ int num_anchors = 0;
+
int i,j;
int tries = 40;
/* int t_iter; */
@@ -167,17 +172,22 @@ struct node* bisecting_kmeans_parallel(struct msa* msa, struct node* n, float**
int* sl = NULL;
int* sr = NULL;
int num_l,num_r;
+
+ num_anchors = MACRO_MIN(32, msa->numseq);
+
if(num_samples < 100){
float** dm = NULL;
RUNP(dm = d_estimation(msa, samples, num_samples,1));// anchors, num_anchors,1));
n = upgma(dm,samples, num_samples);
-
+ *ret_n = n;
gfree(dm);
MFREE(samples);
- return n;
+ return OK;
+ //return n;
}else if(num_samples < 1000){
- n = bisecting_kmeans_serial(msa, n, dm, samples, numseq, num_anchors, num_samples);
- return n;
+ RUN(bisecting_kmeans_serial(msa, &n, dm, samples, num_samples));
+ *ret_n = n;
+ return OK;
}
@@ -200,7 +210,6 @@ struct node* bisecting_kmeans_parallel(struct msa* msa, struct node* n, float**
split(dm,samples,num_anchors, num_samples, (i+ j)*step, &res_ptr[j]);
}
-
for(j = 0; j < 4;j++){
if(!best){
change++;
@@ -215,9 +224,6 @@ struct node* bisecting_kmeans_parallel(struct msa* msa, struct node* n, float**
change++;
}
-
-
-
}
}
if(!change){
@@ -248,17 +254,17 @@ struct node* bisecting_kmeans_parallel(struct msa* msa, struct node* n, float**
{
//LOG_MSG("Done");
-#pragma omp task shared(msa,n,dm,numseq,num_anchors)
+#pragma omp task shared(msa,n,dm,num_anchors)
#endif
{
- n->left = bisecting_kmeans_parallel(msa,n->left, dm, sl, numseq, num_anchors,num_l);
+ bisecting_kmeans_parallel(msa,&n->left, dm, sl, num_l);
}
#ifdef HAVE_OPENMP
-#pragma omp task shared(msa,n,dm,numseq,num_anchors)
+#pragma omp task shared(msa,n,dm,num_anchors)
#endif
{
- n->right = bisecting_kmeans_parallel(msa,n->right, dm, sr, numseq, num_anchors, num_r);
+ bisecting_kmeans_parallel(msa,&n->right, dm, sr, num_r);
}
#ifdef HAVE_OPENMP
@@ -266,16 +272,20 @@ struct node* bisecting_kmeans_parallel(struct msa* msa, struct node* n, float**
}
}
#endif
- return n;
+
+ *ret_n =n;
+ return OK;
ERROR:
- return NULL;
+ return FAIL;
}
-struct node* bisecting_kmeans_serial(struct msa* msa, struct node* n, float** dm,int* samples,int numseq, int num_anchors,int num_samples)
+int bisecting_kmeans_serial(struct msa* msa, struct node** ret_n, float** dm,int* samples, int num_samples)
{
struct kmeans_result* res_tmp = NULL;
struct kmeans_result* best = NULL;
struct kmeans_result** res_ptr = NULL;
+ int num_anchors = 0;
+ struct node* n = NULL;
int i,j;
int tries = 40;
/* int t_iter; */
@@ -283,14 +293,17 @@ struct node* bisecting_kmeans_serial(struct msa* msa, struct node* n, float** dm
int* sl = NULL;
int* sr = NULL;
int num_l,num_r;
+
+ num_anchors = MACRO_MIN(32, msa->numseq);
+
if(num_samples < 100){
float** dm = NULL;
RUNP(dm = d_estimation(msa, samples, num_samples,1));// anchors, num_anchors,1));
n = upgma(dm,samples, num_samples);
-
+ *ret_n = n;
gfree(dm);
MFREE(samples);
- return n;
+ return OK;
}
best = NULL;
@@ -324,9 +337,6 @@ struct node* bisecting_kmeans_serial(struct msa* msa, struct node* n, float** dm
change++;
}
-
-
-
}
}
if(!change){
@@ -350,11 +360,12 @@ struct node* bisecting_kmeans_serial(struct msa* msa, struct node* n, float** dm
MFREE(samples);
n = alloc_node();
- n->left = bisecting_kmeans_serial(msa,n->left, dm, sl, numseq, num_anchors,num_l);
- n->right = bisecting_kmeans_serial(msa,n->right, dm, sr, numseq, num_anchors, num_r);
- return n;
+ bisecting_kmeans_serial(msa,&n->left , dm, sl, num_l);
+ bisecting_kmeans_serial(msa,&n->right, dm, sr, num_r);
+ *ret_n = n;
+ return OK;
ERROR:
- return NULL;
+ return FAIL;
}
int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick,struct kmeans_result** ret)
@@ -384,14 +395,24 @@ int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick
num_var = num_var << 3;
+
+
+#ifdef HAVE_AVX2
wr = _mm_malloc(sizeof(float) * num_var,32);
wl = _mm_malloc(sizeof(float) * num_var,32);
cr = _mm_malloc(sizeof(float) * num_var,32);
cl = _mm_malloc(sizeof(float) * num_var,32);
w = _mm_malloc(sizeof(float) * num_var,32);
+#else
+ MMALLOC(wr,sizeof(float) * num_var);
+ MMALLOC(wl,sizeof(float) * num_var);
+ MMALLOC(cr,sizeof(float) * num_var);
+ MMALLOC(cl,sizeof(float) * num_var);
+ MMALLOC(w,sizeof(float) * num_var);
+#endif
+
if(*ret){
res = *ret;
-
}else{
RUNP(res = alloc_kmeans_result(num_samples));
}
@@ -433,7 +454,11 @@ int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick
// fprintf(stdout,"%f %f %f\n", cl[j],cr[j],w[j]);
}
+#ifdef HAVE_AVX2
_mm_free(w);
+#else
+ MFREE(w);
+#endif
/* check if cr == cl - we have identical sequences */
s = 0;
@@ -448,10 +473,19 @@ int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick
score = 0.0F;
num_l = 0;
num_r = 0;
- sl[num_l] = samples[0];
- num_l++;
-
- for(i =1 ; i <num_samples;i++){
+ /* The code below caused sequence sets of size 1 to be passed to clustering... */
+ /* sl[num_l] = samples[0]; */
+ /* num_l++; */
+
+ /* for(i =1 ; i <num_samples;i++){ */
+ /* sr[num_r] = samples[i]; */
+ /* num_r++; */
+ /* } */
+ for(i = 0; i < num_samples/2;i++){
+ sl[num_l] = samples[i];
+ num_l++;
+ }
+ for(i = num_samples/2; i < num_samples;i++){
sr[num_r] = samples[i];
num_r++;
}
@@ -490,6 +524,16 @@ int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick
sl[num_l] = s;
num_l++;
}else{
+ /* Assign sequence to smaller group */
+ /* if(num_l < num_r){ */
+ /* w = wl; */
+ /* sl[num_l] = s; */
+ /* num_l++; */
+ /* }else{ */
+ /* w = wr; */
+ /* sr[num_r] = s; */
+ /* num_r++; */
+ /* } */
if(i & 1){
w = wr;
sr[num_r] = s;
@@ -500,11 +544,9 @@ int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick
num_l++;
}
}
-
for(j = 0; j < num_anchors;j++){
w[j] += dm[s][j];
}
-
}
for(j = 0; j < num_anchors;j++){
@@ -538,10 +580,18 @@ int split(float** dm,int* samples, int num_anchors,int num_samples,int seed_pick
}
}
}
+
+#ifdef HAVE_AVX2
_mm_free(wr);
_mm_free(wl);
_mm_free(cr);
_mm_free(cl);
+#else
+ MFREE(wr);
+ MFREE(wl);
+ MFREE(cr);
+ MFREE(cl);
+#endif
res->nl = num_l;
res->nr = num_r;
=====================================
src/euclidean_dist.c
=====================================
@@ -25,9 +25,10 @@
#ifdef HAVE_AVX2
#include <xmmintrin.h>
#include <immintrin.h>
+#include <mm_malloc.h>
#endif
-#include <mm_malloc.h>
+
#include "float.h"
@@ -55,7 +56,11 @@ int main(void)
MMALLOC(mat, sizeof(float*)* 100);
for(i = 0; i < 100;i++){
+#ifdef HAVE_AVX2
mat[i] = _mm_malloc(sizeof(float)*num_element, 32);
+#else
+ MMALLOC(mat[i],sizeof(float)*num_element);
+#endif
}
RUNP( rng =init_rng(0));
@@ -89,8 +94,6 @@ int main(void)
for(i = 0; i < 100;i++){
for(j = 0; j <= i;j++){
edist_serial(mat[i], mat[j], num_element, &d1);
-
-
}
}
}
@@ -116,7 +119,11 @@ int main(void)
#endif
for(i = 0; i < 100;i++){
+#ifdef HAVE_AVX2
_mm_free(mat[i]);
+#else
+ MFREE(mat[i]);
+#endif
}
MFREE(mat);
=====================================
src/misc.c
=====================================
@@ -30,7 +30,7 @@
#include "misc.h"
#include <stdalign.h>
#include <string.h>
-#include <immintrin.h>
+/* #include <immintrin.h> */
#ifdef ITEST_MISC
=====================================
src/run_kalign.c
=====================================
@@ -20,6 +20,8 @@
*/
+#include "tldevel.h"
+#include <stdio.h>
#ifdef HAVE_OPENMP
#include <omp.h>
#endif
@@ -98,6 +100,7 @@ int print_kalign_help(char * argv[])
fprintf(stdout,"Combining multiple input files:\n\n kalign seqsA.fa seqsB.fa seqsC.fa -f fasta > combined.afa\n\n");
+
if(basename){
MFREE(basename);
}
@@ -416,7 +419,7 @@ int main(int argc, char *argv[])
}
}
- if (param->num_infiles == 0){
+ if(param->num_infiles == 0){
if (!isatty(fileno(stdin))){
LOG_MSG("Attempting stdin");
param->num_infiles =1;
@@ -500,6 +503,7 @@ int run_kalign(struct parameters* param)
snprintf(msa->sequences[i]->name, 128, "SEQ%d", i+1);
}
}
+
if(param->unalign){
/* if(param->out_format == FORMAT_FA){ */
RUN(dealign_msa(msa));
=====================================
src/rwalign.c
=====================================
@@ -21,6 +21,7 @@
*/
+#include "tldevel.h"
#include "tlmisc.h"
#include "global.h"
@@ -65,7 +66,11 @@ struct out_line{
-static int aln_unknown_warning_message(struct msa* msa);
+
+
+static int aln_unknown_warning_message_gaps_but_len_diff(struct msa* msa);
+static int aln_unknown_warning_message_same_len_no_gaps(struct msa* msa);
+
static int read_fasta(struct in_buffer* b, struct msa** msa);
static int read_msf(struct in_buffer* b, struct msa** msa);
static int read_clu(struct in_buffer* b, struct msa** msa);
@@ -216,10 +221,11 @@ int read_input(char* infile,struct msa** msa)
RUN(detect_alignment_format(b, &type));
- /* LOG_MSG("FORMAT: %d", type); */
+ //LOG_MSG("FORMAT: %d", type);
if(type == FORMAT_FA){
//RUNP(msa = read_msf(infile,msa));
//RUNP(msa = read_clu(infile,msa));
+ LOG_MSG("reading fasta");
RUN(read_fasta(b,&m));
}else if(type == FORMAT_MSF){
RUN(read_msf(b,&m));
@@ -249,7 +255,6 @@ int read_input(char* infile,struct msa** msa)
*msa = m;
return OK;
ERROR:
-
free_msa(m);
return FAIL;
}
@@ -468,27 +473,29 @@ int detect_aligned(struct msa* msa)
l = 0;
for (j = 0; j <= msa->sequences[i]->len;j++){
l += msa->sequences[i]->gaps[j];
+
+
}
gaps += l;
+
l += msa->sequences[i]->len;
min_len = MACRO_MIN(min_len, l);
max_len = MACRO_MAX(max_len, l);
+ /* LOG_MSG("%d %d", max_len, min_len); */
}
if(gaps){
if(min_len == max_len){ /* sequences have gaps and total length is identical - clearly aligned */
msa->aligned = ALN_STATUS_ALIGNED;
}else{ /* odd there are gaps but total length differs - unknown status */
- aln_unknown_warning_message(msa);
+ aln_unknown_warning_message_gaps_but_len_diff(msa);
msa->aligned = ALN_STATUS_UNKNOWN;
}
}else{
if(min_len == max_len){ /* no gaps and sequences have same length. Can' tell if they are aligned */
- aln_unknown_warning_message(msa);
+ aln_unknown_warning_message_same_len_no_gaps(msa);
msa->aligned = ALN_STATUS_UNKNOWN;
}else{ /* No gaps and sequences have different lengths - unaligned */
-
-
msa->aligned = ALN_STATUS_UNALIGNED;
}
}
@@ -496,7 +503,7 @@ int detect_aligned(struct msa* msa)
return OK;
}
-static int aln_unknown_warning_message(struct msa* msa)
+static int aln_unknown_warning_message_gaps_but_len_diff(struct msa* msa)
{
int i;
WARNING_MSG("--------------------------------------------");
@@ -508,7 +515,7 @@ static int aln_unknown_warning_message(struct msa* msa)
}
}
- WARNING_MSG("BUT the sequences do not seem to be aligned!");
+ WARNING_MSG("BUT the presumably aligned sequences do not have the length.");
WARNING_MSG(" ");
WARNING_MSG("Kalign will remove the gap characters and ");
WARNING_MSG("align the sequences. ");
@@ -517,6 +524,21 @@ static int aln_unknown_warning_message(struct msa* msa)
}
+static int aln_unknown_warning_message_same_len_no_gaps(struct msa* msa)
+{
+ int i;
+ WARNING_MSG("--------------------------------------------");
+ WARNING_MSG("All input sequences have the same length. ");
+ WARNING_MSG("BUT there are no gap characters. ");
+ WARNING_MSG(" ");
+ WARNING_MSG("Unable to determine whether the sequences ");
+ WARNING_MSG("are already aligned. ");
+ WARNING_MSG("Kalign will align the sequences. ");
+ WARNING_MSG("--------------------------------------------");
+ return OK;
+}
+
+
/* Checks if sequence names are duplicated */
/* Checks if sequences are duplicated */
int run_extra_checks_on_msa(struct msa* msa)
@@ -1234,6 +1256,11 @@ int read_clu(struct in_buffer* b , struct msa** m)
p = line;
j = 0;
for(i = 0;i < line_len;i++){
+ if(i == MSA_NAME_LEN-1){
+ seq_ptr->name[i] = 0;
+ j = i;
+ break;
+ }
if(isspace((int)p[i])){
j = i;
break;
@@ -1298,14 +1325,17 @@ int read_msf(struct in_buffer* b,struct msa** m)
if(msa->alloc_numseq == msa->numseq){
RUN(resize_msa(msa));
}
-
- p+= 5; /* length of name: */
+ p += 5; /* length of name: */
while( isspace((int)*p)){
p++;
}
+ /* LOG_MSG("Found name: %s len %d", p, strlen(p)); */
seq_ptr = msa->sequences[active_seq];
-
for(i = 0;i < line_len;i++){
+ if(i == MSA_NAME_LEN-1){
+ seq_ptr->name[i] = 0;
+ break;
+ }
if(isspace((int)p[i])){
seq_ptr->name[i] = 0;
break;
@@ -1318,7 +1348,7 @@ int read_msf(struct in_buffer* b,struct msa** m)
//LOG_MSG("Got a name %s",p);
}
}
- active_seq =0;
+ active_seq = 0;
for(nl = li; nl < b->n_lines;nl++){
line = b->l[nl]->line;
line_len = b->l[nl]->len;
@@ -1390,13 +1420,20 @@ int read_fasta( struct in_buffer* b,struct msa** m)
}
/* line[line_len-1] = 0; */
- for(i =0 ; i < line_len;i++){
+ /*for(i = 0; i < line_len;i++){
if(isspace(line[i])){
line[i] = 0;
}
- }
+ }*/
+
seq_ptr = msa->sequences[msa->numseq];
- snprintf(seq_ptr->name ,MSA_NAME_LEN ,"%s",line+1);
+ MFREE(seq_ptr->name);
+ MMALLOC(seq_ptr->name, sizeof(char) * line_len);
+ memcpy(seq_ptr->name, line+1, line_len);
+
+ /* snprintf(seq_ptr->name ,MSA_NAME_LEN ,"%s",line+1); */
+ //LOG_MSG("Name %d: len %d name : %s ",msa->numseq, line_len, seq_ptr->name);
+ //fprintf(stdout,"%s\n%s\n", line, seq_ptr->name);
msa->numseq++;
}else{
@@ -1415,6 +1452,7 @@ int read_fasta( struct in_buffer* b,struct msa** m)
seq_ptr->gaps[seq_ptr->len]++;
}
}
+
}
}
RUN(null_terminate_sequences(msa));
=====================================
src/sequence_distance.c
=====================================
@@ -21,11 +21,13 @@
*/
+#include "tldevel.h"
#ifdef HAVE_AVX2
#include <xmmintrin.h>
+#include <mm_malloc.h>
#endif
-#include <mm_malloc.h>
+
#include "sequence_distance.h"
#include "alphabet.h"
@@ -88,7 +90,6 @@ float** d_estimation(struct msa* msa, int* samples, int num_samples,int pair)
//dist = dist / (float) MACRO_MIN(len_a, len_b);
dm[i][j] = dist;// + (float)( i * num_samples + j) / (float) ( num_samples * num_samples);
dm[j][i] = dm[i][j];
- //fprintf(stdout,"%f ", dm[i][j]);
}
//fprintf(stdout,"\n");
@@ -106,8 +107,12 @@ float** d_estimation(struct msa* msa, int* samples, int num_samples,int pair)
for(i = 0; i < numseq;i++){
dm[i] = NULL;
-
+#ifdef HAVE_AVX2
dm[i] = _mm_malloc(sizeof(float) * a,32);
+#else
+ MMALLOC(dm[i], sizeof(float) *a);
+#endif
+
for(j = 0; j < a;j++){
dm[i][j] = 0.0F;
}
@@ -139,6 +144,8 @@ float** d_estimation(struct msa* msa, int* samples, int num_samples,int pair)
//dm[i][j] = dist;
}
}
+
+
/* for(i = 0; i < numseq;i++){ */
/* seq_a = msa->sequences[i]->s;// aln->s[i]; */
/* len_a = msa->sequences[i]->len;// aln->sl[i]; */
@@ -169,49 +176,13 @@ float calc_distance(uint8_t* seq_a, uint8_t* seq_b, int len_a,int len_b, int L)
}
return (float)dist;
#else
- struct bignode* hash[1024];
- int i;
- float dist;
- unsigned int hv;
- for (i = 0;i < 1024;i++){
- hash[i] = 0;
- }
- /* Protein sequence */
- if( L > ALPHA_defDNA){
-
- for (i = len_a-2;i--;){
- hv = (seq_a[i] << 5) + seq_a[i+1];
- hash[hv] = big_insert_hash(hash[hv],i);
- hv = (seq_a[i] << 5) + seq_a[i+2];
- hash[hv] = big_insert_hash(hash[hv],i);
- }
-
- dist = protein_wu_distance_calculation(hash,seq_b,len_b,len_a+len_b,58.9);
+ uint8_t dist;
+ if(len_a > len_b){
+ dist = bpm(seq_a, seq_b, len_a, len_b);
}else{
-
- for (i = len_a-5;i--;){
- hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+2]&3)<<4) + ((seq_a[i+3]&3)<<2) + (seq_a[i+4]&3);//ABCDE
- hash[hv] = big_insert_hash(hash[hv],i);
- hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+2]&3)<<4) + ((seq_a[i+3]&3)<<2) + (seq_a[i+5]&3);//ABCDF
- hash[hv] = big_insert_hash(hash[hv],i);
- hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+2]&3)<<4) + ((seq_a[i+4]&3)<<2) + (seq_a[i+5]&3);//ABCEF
- hash[hv] = big_insert_hash(hash[hv],i);
- hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+3]&3)<<4) + ((seq_a[i+4]&3)<<2) + (seq_a[i+5]&3);//ABDEF
- hash[hv] = big_insert_hash(hash[hv],i);
- hv = ((seq_a[i]&3)<<8) + ((seq_a[i+2]&3)<<6) + ((seq_a[i+3]&3)<<4) + ((seq_a[i+4]&3)<<2) + (seq_a[i+5]&3);//ACDEF
- hash[hv] = big_insert_hash(hash[hv],i);
- }
- dist = dna_distance_calculation(hash,seq_b,len_b,len_a+len_b, 61.08);
- }
-
-
- for (i = 1024;i--;){
- if (hash[i]){
- big_remove_nodes(hash[i]);
- hash[i] = 0;
- }
+ dist = bpm(seq_b, seq_a, len_b, len_a);
}
- return dist;
+ return (float)dist;
#endif
}
View it on GitLab: https://salsa.debian.org/med-team/kalign/-/compare/1aca4ab7ceb78ea10e561372f8b8a7bb5852b79a...363527dd308263476c5d14ff9da32f9b056378dd
--
View it on GitLab: https://salsa.debian.org/med-team/kalign/-/compare/1aca4ab7ceb78ea10e561372f8b8a7bb5852b79a...363527dd308263476c5d14ff9da32f9b056378dd
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220502/aeb8a71f/attachment-0001.htm>
More information about the debian-med-commit
mailing list