[med-svn] [Git][med-team/kma][upstream] New upstream version 1.2.11

Steffen Möller gitlab at salsa.debian.org
Sat Aug 31 10:25:39 BST 2019



Steffen Möller pushed to branch upstream at Debian Med / kma


Commits:
c3ae9084 by Steffen Moeller at 2019-08-31T09:18:20Z
New upstream version 1.2.11
- - - - -


15 changed files:

- Makefile
- hashmapkmers.c
- kma.c
- + kmeranker.c
- + kmeranker.h
- qseqs.c
- qseqs.h
- sam.c
- savekmers.c
- savekmers.h
- + seqmenttree
- + seqmenttree.c
- + seqmenttree.h
- sparse.c
- version.h


Changes:

=====================================
Makefile
=====================================
@@ -1,5 +1,5 @@
 CFLAGS = -Wall -O3 -std=c99
-LIBS = align.o alnfrags.o ankers.o assembly.o chain.o compdna.o compkmers.o compress.o decon.o ef.o filebuff.o frags.o hashmap.o hashmapindex.o hashmapkma.o hashmapkmers.o hashtable.o index.o kma.o kmapipe.o kmers.o kmmap.o loadupdate.o makeindex.o mt1.o nw.o pherror.o printconsensus.o qseqs.o qualcheck.o runinput.o runkma.o sam.o savekmers.o seq2fasta.o seqparse.o shm.o sparse.o spltdb.o stdnuc.o stdstat.o tmp.o update.o updateindex.o updatescores.o valueshash.o vcf.o
+LIBS = align.o alnfrags.o ankers.o assembly.o chain.o compdna.o compkmers.o compress.o decon.o ef.o filebuff.o frags.o hashmap.o hashmapindex.o hashmapkma.o hashmapkmers.o hashtable.o index.o kma.o kmapipe.o kmeranker.o kmers.o kmmap.o loadupdate.o makeindex.o mt1.o nw.o pherror.o printconsensus.o qseqs.o qualcheck.o runinput.o runkma.o sam.o savekmers.o seq2fasta.o seqmenttree.o seqparse.o shm.o sparse.o spltdb.o stdnuc.o stdstat.o tmp.o update.o updateindex.o updatescores.o valueshash.o vcf.o
 PROGS = kma kma_index kma_shm kma_update
 
 .c .o:
@@ -45,6 +45,7 @@ hashtable.o: hashtable.h hashmapkma.h hashmapkmers.h pherror.h
 index.o: index.h compress.h decon.h hashmap.h hashmapkma.h loadupdate.h makeindex.h pherror.h stdstat.h version.h
 kma.o: kma.h ankers.h assembly.h chain.h hashmapkma.h kmers.h mt1.h penalties.h pherror.h qseqs.h runinput.h runkma.h savekmers.h sparse.h spltdb.h tmp.h version.h
 kmapipe.o: kmapipe.h pherror.h
+kmeranker.o: kmeranker.h penalties.h
 kmers.o: kmers.h ankers.h compdna.h hashmapkma.h kmapipe.h pherror.h qseqs.h savekmers.h spltdb.h
 kmmap.o: kmmap.h hashmapkma.h
 loadupdate.o: loadupdate.h pherror.h hashmap.h hashmapkma.h updateindex.h
@@ -58,8 +59,9 @@ qualcheck.o: qualcheck.h compdna.h hashmap.h pherror.h stdnuc.h stdstat.h
 runinput.o: runinput.h compdna.h filebuff.h pherror.h qseqs.h seqparse.h
 runkma.o: runkma.h align.h alnfrags.h assembly.h chain.h compdna.h ef.h filebuff.h frags.h hashmapindex.h kmapipe.h nw.h pherror.h printconsensus.h qseqs.h stdnuc.h stdstat.h tmp.h vcf.h
 sam.o: sam.h nw.h pherror.h qseqs.h runkma.h
-savekmers.o: savekmers.h ankers.h compdna.h hashmapkma.h penalties.h pherror.h qseqs.h stdnuc.h stdstat.h threader.h
+savekmers.o: savekmers.h ankers.h compdna.h hashmapkma.h kmeranker.h penalties.h pherror.h qseqs.h stdnuc.h stdstat.h threader.h
 seq2fasta.o: seq2fasta.h pherror.h qseqs.h runkma.h stdnuc.h
+seqmenttree.o: seqmenttree.h pherror.h
 seqparse.o: seqparse.h filebuff.h qseqs.h
 shm.o: shm.h pherror.h hashmapkma.h version.h
 sparse.o: sparse.h compkmers.h hashtable.h kmapipe.h pherror.h runinput.h savekmers.h stdnuc.h stdstat.h


=====================================
hashmapkmers.c
=====================================
@@ -39,7 +39,7 @@ void reallocHashMap_kmers(HashMap_kmers *dest) {
 	
 	/* save buckets */
 	table = 0;
-	index = ++dest->size;
+	index = dest->size;
 	while(index--) {
 		for(node = dest->table[index]; node; node = node_next) {
 			node_next = node->next;
@@ -54,12 +54,11 @@ void reallocHashMap_kmers(HashMap_kmers *dest) {
 	if(!dest->table) {
 		ERROR();
 	}
-	--dest->size;
 	
 	/* refill table */
 	for(node = table; node; node = node_next) {
 		node_next = node->next;
-		index = node->key & dest->size;
+		index = node->key % dest->size;
 		node->next = dest->table[index];
 		dest->table[index] = node;
 	}
@@ -96,7 +95,7 @@ int hashMap_CountKmer(HashMap_kmers *dest, long unsigned key) {
 	long unsigned index;
 	HashTable_kmers *node;
 	
-	index = key & dest->size;
+	index = key % dest->size;
 	for(node = dest->table[index]; node != 0; node = node->next) {
 		if(node->key == key) {
 			return 0;


=====================================
kma.c
=====================================
@@ -816,7 +816,7 @@ int kma_main(int argc, char *argv[]) {
 		}
 		preseed(0, 0, exhaustive);
 		
-		if(sam) {
+		if(sam && kmaPipe != &kmaPipeThread) {
 			fprintf(stderr, "\"-sam\" and \"-status\" cannot coincide.\n");
 			kmaPipe = &kmaPipeThread;
 		}


=====================================
kmeranker.c
=====================================
@@ -0,0 +1,213 @@
+/* Philip T.L.C. Clausen Jan 2017 plan at dtu.dk */
+
+/*
+ * Copyright (c) 2017, Philip Clausen, Technical University of Denmark
+ * All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *		http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+#define _XOPEN_SOURCE 600
+#include <stdlib.h>
+#include "kmeranker.h"
+#include "penalties.h"
+
+KmerAnker * getBestChainTemplates(KmerAnker *src, const Penalties *rewards, int *bests, int *Score, int *extendScore, char *include) {
+	
+	/* get set of best templates and silences the chain, except for the initial anker */
+	int template, score, bestScore, Wl, W1, U, M, MM, gaps;
+	unsigned i, j, start, end, SU;
+	unsigned *values;
+	short unsigned *values_s;
+	KmerAnker *node, *prev;
+	
+	/* mark template candidates */
+	if(!src) {
+		return 0;
+	} else if((SU = *include)) {
+		values_s = (short unsigned *) src->values;
+		i = *values_s + 1;
+		*bests += i;
+		values_s += i;
+		bests += i;
+		while(--i) {
+			include[(*--bests = *--values_s)] ^= 1;
+		}
+	} else {
+		values = src->values;
+		i = *values + 1;
+		*bests += i;
+		values += i;
+		bests += i;
+		while(--i) {
+			include[(*--bests = *--values)] ^= 1;
+		}
+	}
+	M = rewards->M;
+	MM = rewards->MM;
+	U = rewards->U;
+	W1 = rewards->W1;
+	Wl = rewards->Wl;
+	bestScore = src->score;
+	
+	/* get chaining scores */
+	for(node = src; node != 0; node = node->next) {
+		if(SU) {
+			values_s = (short unsigned *) node->values;
+			i = *values_s + 1;
+			values_s += i;
+		} else {
+			values = node->values;
+			i = *values + 1;
+			values += i;
+		}
+		
+		start = node->start;
+		end = node->end;
+		
+		while(--i) {
+			template = SU ? *--values_s : *--values;
+			if(!include[template]) {
+				score = Score[template];
+				gaps = start - extendScore[template];
+				if(gaps == 0) {
+					/* continued match */
+					score += M;
+				} else if(gaps < 0) {
+					/* deletion */
+					score += (W1 + abs(gaps + 1) * U);
+				} else if(gaps == 1) {
+					/* one unmatched between */
+					score += MM;
+				} else {
+					/* unmatched bases between */
+					if((gaps = (gaps - 2) * M) < 0) {
+						score += gaps;
+					}
+					score += (MM << 1);
+				}
+				
+				/* check local chain is needed */
+				if(score < Wl) {
+					score = Wl + node->weight;
+					/* mark as terminating */
+					extendScore[template] = 0;
+				} else {
+					score += node->weight;
+					/* mark descendant */
+					extendScore[template] = end;
+				}
+				
+				/* update Scores */
+				Score[template] = score;
+			}
+		}
+		
+		/* mark as used */
+		node->score = 0;
+		prev = node;
+	}
+	src->score = bestScore;
+	
+	/* get best templates */
+	j = 0;
+	for(i = 1; i <= *bests; ++i) {
+		if(Score[(template = bests[i])] == bestScore) {
+			bests[++j] = template;
+		}
+		/* clear Score arrays */
+		Score[template] = 0;
+		extendScore[template] = 0;
+		include[template] = 0;
+	}
+	*bests = j;
+	
+	return prev;
+}
+
+KmerAnker * pruneAnkers(KmerAnker *V_score, int kmersize) {
+	
+	KmerAnker *node, *prev;
+	
+	while(V_score->score < kmersize) {
+		++V_score;
+	}
+	
+	prev = V_score;
+	node = V_score->descend;
+	while(node) {
+		if(kmersize <= node->score) {
+			prev->descend = node;
+			prev = node;
+		}
+		node = node->descend;
+	}
+	prev->descend = 0;
+	
+	return V_score;
+}
+
+unsigned getStartAnker(KmerAnker *src) {
+	
+	while(src->next) {
+		src = src->next;
+	}
+	return src->start;
+}
+
+KmerAnker * getBestAnker(KmerAnker **src, unsigned *ties) {
+	
+	KmerAnker *node, *prev, *best;
+	
+	*ties = 0;
+	prev = *src;
+	while(prev && prev->score == 0) {
+		prev = prev->descend;
+	}
+	*src = prev;
+	best = prev;
+	node = prev->descend;
+	
+	while(node) {
+		if(best->score < node->score) {
+			best = node;
+			*ties = 0;
+		} else if(best->score == node->score) {
+			if((node->end - node->start) < (best->end - best->start)) {
+				best = node;
+				*ties = 0;
+			} else if((node->end - node->start) == (best->end - best->start)) {
+				best = node;
+				++*ties;
+			}
+		} else if(node->score != 0) {
+			prev->descend = node;
+			prev = node;
+		}
+		node = node->descend;
+	}
+	prev->descend = 0;
+	
+	return best;
+}
+
+KmerAnker * getTieAnker(KmerAnker *stop, KmerAnker *src, int score, int len) {
+	
+	/* search downwards */
+	while(--src != stop) {
+		if(src->score == score && (src->end - src->start) == len) {
+			return src;
+		}
+	}
+	
+	return 0;
+}


=====================================
kmeranker.h
=====================================
@@ -0,0 +1,41 @@
+/* Philip T.L.C. Clausen Jan 2017 plan at dtu.dk */
+
+/*
+ * Copyright (c) 2017, Philip Clausen, Technical University of Denmark
+ * All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *		http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+#define _XOPEN_SOURCE 600
+#include "penalties.h"
+
+#ifndef KMERANKER
+#define KMERANKER 1;
+typedef struct kmerAnker KmerAnker;
+struct kmerAnker {
+	int score;
+	int weight;
+	unsigned start;
+	unsigned end;
+	unsigned flag;
+	unsigned *values;
+	struct kmerAnker *descend; /* descending anker */
+	struct kmerAnker *next; /* next anker in chain */
+};
+#endif
+
+KmerAnker * getBestChainTemplates(KmerAnker *src, const Penalties *rewards, int *bests, int *Score, int *extendScore, char *include);
+KmerAnker * pruneAnkers(KmerAnker *V_score, int kmersize);
+unsigned getStartAnker(KmerAnker *src);
+KmerAnker * getBestAnker(KmerAnker **src, unsigned *ties);
+KmerAnker * getTieAnker(KmerAnker *stop, KmerAnker *src, int score, int len);


=====================================
qseqs.c
=====================================
@@ -37,3 +37,22 @@ void destroyQseqs(Qseqs *dest) {
 	free(dest->seq);
 	free(dest);
 }
+
+void insertKmerBound(Qseqs *header, int start, int end) {
+	
+	int *seq;
+	
+	if((header->len + 3 * sizeof(int)) < header->size) {
+		header->size = (header->len + 3 * sizeof(int)) << 1;
+		if(!(header->seq = realloc(header->seq, header->size))) {
+			ERROR();
+		}
+	}
+	
+	seq = (int *) (header->seq + header->len + 1);
+	*seq = start;
+	*++seq = end;
+	*++seq = 0;
+	header->len += (2 * sizeof(int) + 1);
+	
+}


=====================================
qseqs.h
=====================================
@@ -31,3 +31,4 @@ struct qseqs {
 Qseqs * setQseqs(int size);
 /* destroy Qseqs */
 void destroyQseqs(Qseqs *dest);
+void insertKmerBound(Qseqs *header, int start, int end);


=====================================
sam.c
=====================================
@@ -32,8 +32,8 @@ char * makeCigar(Qseqs *Cigar, const Aln *aligned) {
 	char op, pop, *s, *cigar;
 	unsigned char *t, *q;
 	
-	if(Cigar->size < aligned->len << 1) {
-		Cigar->size <<= 1;
+	if(Cigar->size < (aligned->len << 1)) {
+		Cigar->size = (aligned->len << 1);
 		free(Cigar->seq);
 		Cigar->seq = smalloc(Cigar->size);
 	} else if(aligned->len == 0) {


=====================================
savekmers.c
=====================================
@@ -25,11 +25,13 @@
 #include "ankers.h"
 #include "compdna.h"
 #include "hashmapkma.h"
+#include "kmeranker.h"
 #include "penalties.h"
 #include "pherror.h"
 #include "qseqs.h"
 #include "sam.h"
 #include "savekmers.h"
+#include "seqmenttree.h"
 #include "stdnuc.h"
 #include "stdstat.h"
 #include "threader.h"
@@ -4760,17 +4762,19 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
 	
 	/* return 0 for match, 1 otherwise */
 	static int *Sizes = 0;
+	static double mrs = 0.5, coverT = 0.5;
 	static KmerAnker **tVF_scores, **tVR_scores;
+	static SeqmentTree **tSeqments;
 	
-	int Wl, W1, U, M, MM, Ms, MMs, Us, W1s, score, gaps, template, maxS;
-	int *bests;
+	int Wl, W1, U, M, MM, Ms, MMs, Us, W1s, score, gaps, template, *bests;
 	unsigned i, j, rc, shifter, kmersize, DB_size, SU, HIT, start, end, pos;
-	unsigned hitCounter, hitCounter_r;
-	unsigned *values, *last;
+	unsigned hitCounter, hitCounter_r, ties, len, *values, *last;
 	short unsigned *values_s;
 	char *include;
 	CompDNA *qseqP;
 	KmerAnker *V_score, *V_scores, *VF_scores, *VR_scores, *tmp_score;
+	KmerAnker *best_score, *best_score_r;
+	SeqmentTree *chainSeqments;
 	
 	if(qseq == 0) {
 		if(Sizes) {
@@ -4782,10 +4786,18 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
 			}
 			free(tVF_scores);
 		} else {
+			/* here */
+			/* set mrs and coverT */
+			
+			
 			i = *bestTemplates;
 			Sizes = smalloc(i * sizeof(int));
 			tVF_scores = smalloc(2 * i * sizeof(KmerAnker *));
 			tVR_scores = tVF_scores + i;
+			tSeqments = calloc(i, sizeof(SeqmentTree *));
+			if(!tSeqments) {
+				ERROR();
+			}
 			while(i--) {
 				Sizes[i] = 256;
 				tVF_scores[i] = calloc(512, sizeof(KmerAnker));
@@ -4828,6 +4840,7 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
 	}
 	VF_scores = tVF_scores[*Score];
 	VR_scores = tVR_scores[*Score];
+	chainSeqments = tSeqments[*Score];
 	
 	V_scores = VF_scores;
 	qseqP = qseq;
@@ -4857,16 +4870,20 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
 			}
 		}
 		
+		V_score = V_scores;
+		V_score->start = 0;
+		V_score->end = 0;
+		V_score->values = 0;
+		V_score->next = 0;
+		V_score->descend = 0;
 		if(HIT) {
-			V_score = V_scores;
-			V_score->start = (j = 0);
-			V_score->values = (last = 0);
-			V_score->next = 0;
 			Ms = 0;
 			MMs = 0;
 			Us = 0;
 			W1s = 0;
 			gaps = 0;
+			last = 0;
+			j = 0;
 			for(i = 1; i <= *(qseqP->N); ++i) {
 				end = qseqP->N[i] - kmersize + 1;
 				while(j < end) {
@@ -4901,10 +4918,12 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
 								/* update and link between ankers */
 								V_score->weight = Ms * M + MMs * MM + Us * U + W1s * W1;
 								V_score->end = j - gaps + kmersize;
+								V_score->descend = V_score + 1;
 								++V_score;
 							}
 							V_score->start = j;
 							V_score->values = (last = values);
+							V_score->descend = 0;
 							Ms = 0;
 							MMs = 0;
 							Us = 0;
@@ -4927,34 +4946,27 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
 		--*(qseqP->N);
 	}
 	
-	/* here */
-	/* chaining */
-	
-	/*
-	while(chain is good) {
-		1. make chain. OK
-		2. choose best.
-		3. silence "used" ankers.
-		4. redo chaining if next best hits a silenced anker.
+	/* no matches */
+	if(!hitCounter && !hitCounter_r) {
+		*include = 0;
+		return 1;
 	}
-	*/
-	
-	/*
-	for each anker
-		mark last anker w.r.t. highest score on anker
-	*/
 	
 	/* make chains */
-	maxS = 0;
 	V_score = VF_scores;
+	best_score = 0;
+	best_score_r = V_score;
 	HIT = hitCounter + 1;
 	bests = bestTemplates;
+	ties = 0;
 	rc = 2;
 	while(rc--) {
 		if(rc == 0) {
 			V_score = VR_scores;
 			HIT = hitCounter_r + 1;
 			bests = bestTemplates_r;
+			best_score = best_score_r;
+			best_score_r = V_score;
 		}
 		*bests = 0;
 		while(--HIT) {
@@ -5006,48 +5018,556 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
 				if(score < Wl) {
 					score = Wl + V_score->weight;
 					/* mark as terminating */
-					V_score->next = 0;
+					tmp_score = 0;
 					extendScore[template] = 0;
 				} else {
 					score += V_score->weight;
 					/* mark descendant */
 					tmp_score = V_score;
-					while((--tmp_score)->end != pos);
-					V_score->next = tmp_score;
 					extendScore[template] = end;
 				}
 				
 				/* update Scores */
 				if(V_score->score < score) {
 					V_score->score = score;
-					/* here */
-					/* consider several bests */
-					if(maxS < score) {
-						maxS = score;
+					/* find link chain */
+					if(tmp_score) {
+						while((--tmp_score)->end != pos);
 					}
+					V_score->next = tmp_score;	
 				}
 				Score[template] = score;
 			}
+			
+			/* Mark (first) best hit */
+			if(best_score_r->score < V_score->score) {
+				best_score_r = V_score;
+				ties = 0;
+			} else if(best_score_r->score == V_score->score) {
+				/* lower uncertainty in shorter chains */
+				if((V_score->end - V_score->start) < (best_score_r->end - best_score_r->start)) {
+					best_score_r = V_score;
+					ties = 0;
+				} else if((V_score->end - V_score->start) == (best_score_r->end - best_score_r->start)) {
+					/* equals */
+					++ties;
+					/* first hit on rc is likely last hit on forward */
+					if(rc) {
+						best_score_r = V_score;
+					}
+				}
+			}
+			
+			V_score->flag = 0;
 			++V_score;
 		}
 		
-		/* clear Score */
+		/* clear Score arrays */
+		clearScore(bests, Score);
+		clearScore(bests, extendScore);
 		i = *bests + 1;
 		while(--i) {
 			include[*++bests] = 0;
 		}
-		clearScore(bests, Score);
-		clearScore(bests, extendScore);
 	}
 	
+	/* no good hits */
+	if(best_score->score < kmersize && best_score_r->score < kmersize) {
+		*include = 0;
+		return 1;
+	}
+	
+	/* get best chain, 
+	start/end, score, template(s)
+	and zero out ankers */
+	*include = SU;
+	*bestTemplates = 0;
+	if(best_score_r->score < best_score->score ||
+	(best_score_r->score < best_score->score && 
+	(best_score->end - best_score->start) <= (best_score_r->end - best_score->start))) {
+		tmp_score = getBestChainTemplates(best_score, rewards, bestTemplates, Score, extendScore, include);
+		score = best_score->score;
+		start = tmp_score->start;
+		len = best_score->end - start;
+		rc = 0;
+	} else {
+		tmp_score = getBestChainTemplates(best_score_r, rewards, bestTemplates, Score, extendScore, include);
+		score = best_score_r->score;
+		start = tmp_score->start;
+		len = best_score_r->end - start;
+		rc = 1;
+	}
+	
+	if(score < mrs * len) {
+		*include = 0;
+		return 1;
+	}
+	
+	/* prune hits */
+	VF_scores = pruneAnkers(VF_scores, kmersize);
+	VR_scores = pruneAnkers(VR_scores, kmersize);
+	
+	/* get best chains */
+	while(best_score || best_score_r) {
+		/* get equal ankers inside chain,
+		that lies under the threshold */
+		/* here */
+		if(ties && best_score != tmp_score) {
+			V_score = best_score;
+			while(V_score && (V_score = getTieAnker(tmp_score, V_score, best_score->score, len))) {
+				/*
+				1. tie anker -> tie_len == best_len
+				2. Best match is last on seq -> tie_start < best_start
+				1. && 2. -> cover = |tie_start - best_end| / len
+				*/
+				if((V_score->end - start) < coverT * len ) {
+					/* not a co-match / overlap is insufficient ->
+					no more equal matches */
+					V_score = 0;
+				} else { /* Anker is equal, update with templates */
+					/* Mark current templates to avoid double hitting */
+					i = *bestTemplates + 1;
+					bests = bestTemplates;
+					while(--i) {
+						include[*++bests] = 1;
+					}
+					/* update best template candidates */
+					template = *bests;
+					*bests = 0;
+					getBestChainTemplates(V_score, rewards, bests, Score, extendScore, include);
+					*bestTemplates += *bests;
+					*bests = template;
+					start = V_score->end - len;
+				}
+			}
+			
+			
+		}
+		
+		
+		
+	}
+	
+	
+	
+	
+	
+	
+	/*
+	Best template(s) must in values on best anker, 
+	remove non-union of A and B, while backtracking.
+	Where A and B are templates on ankerA and ankerB.
+	For proxi scoring the score has to be imputed while tracking.
+	*/
+	
+	/* here */
+	/* what happens when two equal chains overlap */
+	/*
+	-------------       -------------       -------------
+	          -------------        -------------
+	
+	------------- ------------- -------------
+	  -------------       -------------
+	*/
+	
+	
+	
+	
+	/* no rechaining */
+	
+	/*
+	Q = set of best chains
+	
+	while(score is good) 
+		1. if in Q.
+			if(<50%)
+				add to Q
+			else if(scores equal)
+				add with match
+			else
+				skip chain
+		   else
+			add to Q.
+		2. Silence it.
+		3. find best chain.
+	*/
+	*include = 0;
+	
+	return 1;
+}
+
+int save_kmers_sparse_chain(const HashMapKMA *templates, const Penalties *rewards, int *bestTemplates, int *bestTemplates_r, int *Score, int *Score_r, CompDNA *qseq, CompDNA *qseq_r, const Qseqs *header, int *extendScore, const int exhaustive, volatile int *excludeOut, FILE *out) {
+	
+	/* return 0 for match, 1 otherwise */
+	static int *Sizes = 0;
+	static double mrs = 0.5, coverT = 0.5;
+	static KmerAnker **tVF_scores;
+	static SeqmentTree **tSeqments;
+	int Wl, W1, U, M, MM, Ms, MMs, Us, W1s, score, gaps, template, *bests;
+	unsigned i, j, shifter, kmersize, DB_size, SU, HIT, start, end, pos;
+	unsigned len, cover, hitCounter, ties, *values, *last;
+	short unsigned *values_s;
+	char *include;
+	KmerAnker *V_score, *VF_scores, *tmp_score, *best_score;
+	SeqmentTree *chainSeqments;
+	
+	if(qseq == 0) {
+		if(Sizes) {
+			free(Sizes);
+			Sizes = 0;
+			i = *bestTemplates;
+			while(i--) {
+				free(tVF_scores[i]);
+			}
+			free(tVF_scores);
+		} else {
+			/* here */
+			/* set mrs and coverT */
+			
+			
+			i = *bestTemplates;
+			Sizes = smalloc(i * sizeof(int));
+			tVF_scores = smalloc(i * sizeof(KmerAnker *));
+			tSeqments = calloc(i, sizeof(SeqmentTree *));
+			if(!tSeqments) {
+				ERROR();
+			}
+			while(i--) {
+				Sizes[i] = 256;
+				tVF_scores[i] = calloc(512, sizeof(KmerAnker));
+				if(!tVF_scores[i]) {
+					ERROR();
+				}
+			}
+		}
+		return 0;
+	} else if(qseq->seqlen < (kmersize = templates->kmersize)) {
+		return 1;
+	} else if((DB_size = templates->DB_size) < USHRT_MAX) {
+		SU = 1;
+	} else {
+		SU = 0;
+	}
+	shifter = sizeof(long unsigned) * sizeof(long unsigned) - (templates->kmersize << 1);
+	M = rewards->M;
+	MM = rewards->MM;
+	U = rewards->U;
+	W1 = rewards->W1;
+	Wl = rewards->Wl;
+	include = (char *) (extendScore + (templates->DB_size + 1));
+	hitCounter = 0;
+	values = 0;
+	values_s = 0;
+	
+	if(Sizes[*Score] < qseq->size) {
+		Sizes[*Score] = qseq->size << 1;
+		free(tVF_scores[*Score]);
+		tVF_scores[*Score] = calloc(qseq->size, sizeof(KmerAnker));
+		if(!tVF_scores[*Score]) {
+			ERROR();
+		}
+	}
+	VF_scores = tVF_scores[*Score];
+	chainSeqments = tSeqments[*Score];
 	
+	HIT = exhaustive;
+	j = 0;
+	++*(qseq->N);
+	qseq->N[*(qseq->N)] = qseq->seqlen;
+	for(i = 1; i <= *(qseq->N) && !HIT; ++i) {
+		end = qseq->N[i] - kmersize + 1;
+		while(j < end && hashMap_get(templates, getKmer(qseq->seq, j, shifter)) == 0) {
+			j += kmersize;
+		}
+		if(j < end) {
+			HIT = 1;
+		} else {
+			j = qseq->N[i] + 1;
+		}
+	}
 	
+	if(HIT) {
+		V_score = VF_scores;
+		V_score->start = (j = 0);
+		V_score->values = (last = 0);
+		V_score->next = 0;
+		Ms = 0;
+		MMs = 0;
+		Us = 0;
+		W1s = 0;
+		gaps = 0;
+		for(i = 1; i <= *(qseq->N); ++i) {
+			end = qseq->N[i] - kmersize + 1;
+			while(j < end) {
+				if((values = hashMap_get(templates, getKmer(qseq->seq, j, shifter)))) {
+					if(values == last) {
+						if(kmersize < gaps) {
+							Ms += kmersize;
+							gaps -= kmersize;
+							if(gaps) {
+								/* go for best scenario */
+								if(gaps == 1) {
+									MMs += 2;
+								} else {
+									gaps -= 2;
+									if((MM << 1) + gaps * M < 0) {
+										Ms += gaps;
+										MMs += 2;
+									}
+								}
+							} else {
+								++MMs;
+							}
+						} else if(gaps) {
+							--gaps;
+							++W1s;
+							Us += gaps;
+						} else {
+							++Ms;
+						}
+					} else {
+						if(last) {
+							/* update and link between ankers */
+							V_score->weight = Ms * M + MMs * MM + Us * U + W1s * W1;
+							V_score->end = j - gaps + kmersize;
+							V_score->descend = V_score + 1;
+							++V_score;
+						}
+						V_score->start = j;
+						V_score->values = (last = values);
+						V_score->descend = 0;
+						Ms = 0;
+						MMs = 0;
+						Us = 0;
+						W1s = 0;
+						++hitCounter;
+					}
+					gaps = 0;
+				} else {
+					++gaps;
+				}
+				j += kmersize;
+			}
+			j = qseq->N[i] + 1;
+		}
+		if(last) {
+			/* update anker */
+			V_score->weight = Ms * M + MMs * MM + Us * U + W1s * W1;
+		}
+	}
+	--*(qseq->N);
 	
+	/* no matches */
+	if(!hitCounter) {
+		return 1;
+	}
 	
+	/* make chains */
+	V_score = VF_scores;
+	best_score = V_score;
+	HIT = hitCounter + 1;
+	bests = bestTemplates;
+	*bests = (ties = 0);
+	while(--HIT) {
+		/*
+		Save pos of best score, and chain it (0 if local).
+		Score is always current / w.r.t. the anker you are on.
+		extendScore contains endpos of last hit anker.
+		*/
+		start = V_score->start;
+		end = V_score->end;
+		
+		/* chain anker */
+		V_score->score = 0;
+		if(SU) {
+			values_s = (short unsigned *) V_score->values;
+			i = *values_s + 1;
+			values_s += i;
+		} else {
+			values = V_score->values;
+			i = *values + 1;
+			values += i;
+		}
+		while(--i) {
+			template = SU ? *--values_s : *--values;
+			if(!include[template]) {
+				include[template] = 1;
+				bests[++*bests] = template;
+			}
+			score = Score[template];
+			gaps = start - (pos = extendScore[template]);
+			if(gaps == 0) {
+				/* continued match */
+				score += M;
+			} else if(gaps < 0) {
+				/* deletion */
+				score += (W1 + abs(gaps + 1) * U);
+			} else if(gaps == 1) {
+				/* one unmatched between */
+				score += MM;
+			} else {
+				/* unmatched bases between */
+				if((gaps = (gaps - 2) * M) < 0) {
+					score += gaps;
+				}
+				score += (MM << 1);
+			}
+			
+			/* check local chain is needed */
+			if(score < Wl) {
+				score = Wl + V_score->weight;
+				/* mark as terminating */
+				tmp_score = 0;
+				extendScore[template] = 0;
+			} else {
+				score += V_score->weight;
+				/* mark descendant */
+				tmp_score = V_score;
+				extendScore[template] = end;
+			}
+			
+			/* update Scores */
+			if(V_score->score < score) {
+				V_score->score = score;
+				/* find link chain */
+				if(tmp_score) {
+					while((--tmp_score)->end != pos);
+				}
+				V_score->next = tmp_score;	
+			}
+			Score[template] = score;
+		}
+		
+		/* Mark (last) best hit */
+		if(best_score->score < V_score->score) {
+			best_score = V_score;
+			ties = 0;
+		} else if(best_score->score == V_score->score) {
+			if((V_score->end - V_score->start) < (best_score->end - best_score->start)) {
+				/* lower uncertainty in shorter chains */
+				best_score = V_score;
+				ties = 0;
+			} else if((V_score->end - V_score->start) == (best_score->end - best_score->start)) {
+				/* equals */
+				best_score = V_score;
+				++ties;
+			}
+		}
+		
+		V_score->flag = 0;
+		++V_score;
+	}
 	
+	/* clear Score arrays */
+	clearScore(bests, Score);
+	clearScore(bests, extendScore);
+	i = *bests + 1;
+	while(--i) {
+		include[*++bests] = 0;
+	}
 	
+	/* no good hits */
+	if((score = best_score->score) < kmersize) {
+		*include = 0;
+		return 1;
+	}
 	
+	/* get best chain, 
+	start/end, score, template(s)
+	and zero out ankers */
+	*include = SU;
+	*bestTemplates = 0;
+	tmp_score = getBestChainTemplates(best_score, rewards, bestTemplates, Score, extendScore, include);
+	start = tmp_score->start;
+	len = best_score->end - start;
+	if(score < mrs * len) {
+		*include = 0;
+		return 1;
+	}
 	
+	/* prune hits */
+	VF_scores = pruneAnkers(VF_scores, kmersize);
+	
+	/* get best chains */
+	while(best_score) {
+		/* get equal ankers inside chain,
+		that lies under the threshold */
+		if(ties && best_score != tmp_score) {
+			V_score = best_score;
+			while(V_score && (V_score = getTieAnker(tmp_score, V_score, best_score->score, len))) {
+				/*
+				1. tie anker -> tie_len == best_len
+				2. Best match is last on seq -> tie_start < best_start
+				1. && 2. -> cover = |tie_start - best_end| / len
+				*/
+				if((V_score->end - start) < coverT * len ) {
+					/* not a co-match / overlap is insufficient ->
+					no more equal matches */
+					V_score = 0;
+				} else { /* Anker is equal, update with templates */
+					/* Mark current templates to avoid double hitting */
+					i = *bestTemplates + 1;
+					bests = bestTemplates;
+					while(--i) {
+						include[*++bests] = 1;
+					}
+					/* update best template candidates */
+					template = *bests;
+					*bests = 0;
+					getBestChainTemplates(V_score, rewards, bests, Score, extendScore, include);
+					*bestTemplates += *bests;
+					*bests = template;
+					start = V_score->end - len;
+				}
+			}
+		}
+		
+		/* use segment trees to mark "used" regions of query.
+		Allow for quick check of intersection with chosen chains. */
+		growSeqmentTree(chainSeqments, start, best_score->end);
+		
+		/* insert anker-boundaries */
+		insertKmerBound((Qseqs *) header, start, best_score->end);
+		
+		/* print match */
+		lock(excludeOut);
+		i = deConPrintPtr(bestTemplates, qseq, best_score->score, header, 0, out);
+		unlock(excludeOut);
+		
+		/* get next match */
+		best_score->score = 0;
+		*bestTemplates = 0;
+		while(best_score->score == 0) {
+			/* find best anker */
+			if(!(best_score = getBestAnker(&VF_scores, &ties))) {
+				*include = 0;
+				return 0;	
+			}
+			
+			/* check chain */
+			start = getStartAnker(best_score);
+			cover = queSeqmentTree(chainSeqments->root, start, best_score->end);
+			
+			/* verify chain */
+			if(cover < coverT * (best_score->end - start)) {
+				/* get chain */
+				tmp_score = getBestChainTemplates(best_score, rewards, bestTemplates, Score, extendScore, include);
+				len = best_score->end - start;
+				if(best_score->score < mrs * len) {
+					/* silence anker */
+					best_score->score = 0;
+				}
+			} else {
+				/* silence anker */
+				best_score->score = 0;
+			}
+		}
+	}
+	*include = 0;
 	
 	return 1;
 }
+
+
+
+


=====================================
savekmers.h
=====================================
@@ -25,7 +25,6 @@
 
 #ifndef SAVEKMERS
 typedef struct kmerScan_thread KmerScan_thread;
-typedef struct kmerAnker KmerAnker;
 
 struct kmerScan_thread {
 	pthread_t id;
@@ -43,15 +42,6 @@ struct kmerScan_thread {
 	struct kmerScan_thread *next;
 };
 
-struct kmerAnker {
-	int score;
-	int weight;
-	unsigned start;
-	unsigned end;
-	unsigned *values;
-	struct kmerAnker *next;
-};
-
 #define SAVEKMERS 1;
 #endif
 


=====================================
seqmenttree
=====================================
Binary files /dev/null and b/seqmenttree differ


=====================================
seqmenttree.c
=====================================
@@ -0,0 +1,172 @@
+/* Philip T.L.C. Clausen Jan 2017 plan at dtu.dk */
+
+/*
+ * Copyright (c) 2017, Philip Clausen, Technical University of Denmark
+ * All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *		http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+#define _XOPEN_SOURCE 600
+#include <stdlib.h>
+#include "pherror.h"
+#include "seqmenttree.h"
+
+SeqmentTree * initializeSeqmentTree(const long unsigned size) {
+	
+	SeqmentTree *dest;
+	
+	dest = smalloc(sizeof(SeqmentTree));
+	dest->n = 0;
+	dest->root = smalloc((dest->size = size) * sizeof(SeqmentTrees));
+	
+	return dest;
+}
+
+SeqmentTree * initSeqmentTree(SeqmentTree *src, const unsigned start, const unsigned end) {
+	
+	SeqmentTrees *node;
+	
+	if(!src) {
+		src = initializeSeqmentTree(64);
+	}
+	
+	src->n = 1;
+	node = src->root;
+	node->start = start;
+	node->end = end;
+	node->covered = end - start;
+	node->branch[0] = 0;
+	node->branch[1] = 0;
+	
+	return src;
+}
+
+void reallocSeqmentTree(SeqmentTree *src) {
+	
+	src->root = realloc(src->root, (src->size <<= 1) * sizeof(SeqmentTrees));
+	if(!src->root) {
+		ERROR();
+	}
+}
+
+unsigned addSeqmentTrees(SeqmentTrees *root, SeqmentTrees *node) {
+	
+	unsigned pos, covered;
+	SeqmentTrees *bud;
+	
+	if(*(root->branch)) { /* search */
+		/* adjust limits of root */
+		if(node->start < root->start && root->end < node->end) {
+			root->start = node->start;
+			root->end = node->end;
+			root->covered = node->covered;
+			node->covered = 0;
+			*(root->branch) = 0;
+			return root->covered;
+		} else if(root->end < node->end) {
+			root->end = node->end;
+		} else if(node->start < root->start) {
+			root->start = node->start;
+		}
+		
+		/* search tree */
+		if(node->end < (pos = root->branch[1]->start)) { /* left */
+			root->covered = root->branch[1]->covered + addSeqmentTrees(root->branch[0], node);
+		} else if(pos <= node->start) { /* right */
+			root->covered = root->branch[0]->covered + addSeqmentTrees(root->branch[1], node);
+		} else { /* split */
+			/* calculate right side */
+			pos = node->start;
+			node->start = root->branch[0]->end + 1;
+			node->covered = node->end - node->start;
+			covered = addSeqmentTrees(root->branch[1], node);
+			
+			/* here */
+			if(node->covered) {
+				fprintf(stderr, "I was wrong about split seqmenttrees.\n");
+				exit(1);
+			}
+			
+			/* calculate left side */
+			node->start = pos;
+			node->end = root->branch[0]->end;
+			node->covered = node->end - node->start;
+			root->covered = covered + addSeqmentTrees(root->branch[0], node);
+			
+			/* here */
+			if(node->covered) {
+				fprintf(stderr, "I was wrong about split seqmenttrees.\n");
+				exit(1);
+			}
+		}
+	} else if((pos = node->end < root->start) || (pos = root->end < node->start)) { /* new leaf */
+		/* create and grow bud */
+		bud = node + 1;
+		root->branch[pos] = node;
+		root->branch[pos] = bud;
+		
+		/* form new leaf */
+		bud->start = root->start;
+		bud->end = root->end;
+		bud->covered = root->covered;
+		*(bud->branch) = 0;
+		
+		/* update the root */
+		root->covered += node->covered;
+	} else { /* extend leaf */
+		if(node->start < root->start) { /* extend left */
+			root->start = node->start;
+		} else if(root->end < node->end) { /* extend right */
+			root->end = node->end;
+		}
+		node->covered = 0;
+		root->covered = root->end - root->start;
+	}
+	
+	
+	return root->covered;
+}
+
+void growSeqmentTree(SeqmentTree *src, const unsigned start, const unsigned end) {
+	
+	SeqmentTrees *node;
+	
+	/* make room for new anker */
+	if(src->size <= src->n + 2) {
+		reallocSeqmentTree(src);
+	}
+	
+	/* make new leaf */
+	node = src->root + src->n;
+	node->start = start;
+	node->end = end;
+	node->covered = end - start;
+	*(node->branch) = 0;
+	
+	src->root->covered = addSeqmentTrees(src->root, node);
+	
+	if(node->covered) {
+		src->n += 2;
+	}
+}
+
+unsigned queSeqmentTree(SeqmentTrees *src, const unsigned start, const unsigned end) {
+	
+	if(end < src->start || src->end < start) {
+		return 0;
+	} else if(src->start <= start && end <= src->end) {
+		return src->covered;
+	} else {
+		return queSeqmentTree(src->branch[0], start, end) + queSeqmentTree(src->branch[1], start, end);
+	}
+}


=====================================
seqmenttree.h
=====================================
@@ -0,0 +1,43 @@
+/* Philip T.L.C. Clausen Jan 2017 plan at dtu.dk */
+
+/*
+ * Copyright (c) 2017, Philip Clausen, Technical University of Denmark
+ * All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *		http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+#define _XOPEN_SOURCE 600
+//cc -Wall -O3 -std=c99 -c -o seqmenttree seqmenttree.c
+#ifndef SEQMENTTREE
+#define SEQMENTTREE 1
+typedef struct seqmentTree SeqmentTree;
+typedef struct seqmentTrees SeqmentTrees;
+struct seqmentTree {
+	unsigned n;
+	unsigned size;
+	struct seqmentTrees *root;
+};
+struct seqmentTrees {
+	unsigned start;
+	unsigned end;
+	unsigned covered;
+	struct seqmentTrees *branch[2];
+};
+#endif
+
+SeqmentTree * initializeSeqmentTree(long unsigned size);
+SeqmentTree * initSeqmentTree(SeqmentTree *src, const unsigned start, const unsigned end);
+void reallocSeqmentTree(SeqmentTree *src);
+unsigned addSeqmentTrees(SeqmentTrees *root, SeqmentTrees *node);
+void growSeqmentTree(SeqmentTree *src, const unsigned start, const unsigned end);
+unsigned queSeqmentTree(SeqmentTrees *src, const unsigned start, const unsigned end);


=====================================
sparse.c
=====================================
@@ -400,14 +400,12 @@ int save_kmers_sparse_batch(char *templatefilename, char *outputfilename, char *
 	Kmers = smalloc(sizeof(CompKmers));
 	allocCompKmers(Kmers, 1024);
 	Ntot = 0;
-	
 	/* count kmers */
 	while((Kmers->n = fread(Kmers->kmers, sizeof(long unsigned), Kmers->size, inputfile))) {
 		Ntot += Kmers->n;
 		save_kmers_sparse(templates, foundKmers, Kmers);
 	}
 	kmaPipe(0, 0, inputfile, &status);
-	
 	if(kmaPipe == &kmaPipeFork) {
 		t1 = clock();
 		fprintf(stderr, "#\n# Total time used to identify k-mers in query: %.2f s.\n#\n", difftime(t1, t0) / 1000000);


=====================================
version.h
=====================================
@@ -17,4 +17,4 @@
  * limitations under the License.
 */
 
-#define KMA_VERSION "1.2.10a"
+#define KMA_VERSION "1.2.11"



View it on GitLab: https://salsa.debian.org/med-team/kma/commit/c3ae9084e4ade7f9e2a89400c4588fd50602eb38

-- 
View it on GitLab: https://salsa.debian.org/med-team/kma/commit/c3ae9084e4ade7f9e2a89400c4588fd50602eb38
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20190831/3dd0ea38/attachment-0001.html>


More information about the debian-med-commit mailing list