[med-svn] [Git][med-team/kma][master] 4 commits: routine-update: New upstream version
Steffen Möller
gitlab at salsa.debian.org
Mon Jul 13 22:57:32 BST 2020
Steffen Möller pushed to branch master at Debian Med / kma
Commits:
859d6303 by Steffen Moeller at 2020-07-13T23:54:39+02:00
routine-update: New upstream version
- - - - -
694b4807 by Steffen Moeller at 2020-07-13T23:54:40+02:00
New upstream version 1.3.2
- - - - -
48c6f5bd by Steffen Moeller at 2020-07-13T23:54:40+02:00
Update upstream source from tag 'upstream/1.3.2'
Update to upstream version '1.3.2'
with Debian dir 5dbfb806fcdbe7354b30f7646c76d8609c9e3ce0
- - - - -
4ad339db by Steffen Moeller at 2020-07-13T23:55:00+02:00
routine-update: Ready to upload to unstable
- - - - -
9 changed files:
- debian/changelog
- dist.c
- dist.h
- kmers.c
- matrix.c
- matrix.h
- runkma.c
- savekmers.c
- version.h
Changes:
=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+kma (1.3.2-1) unstable; urgency=medium
+
+ * Team upload.
+ * New upstream version
+
+ -- Steffen Moeller <moeller at debian.org> Mon, 13 Jul 2020 23:54:45 +0200
+
kma (1.3.0-1) unstable; urgency=medium
* Team upload.
=====================================
dist.c
=====================================
@@ -22,10 +22,15 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <sys/mman.h>
+#include <sys/param.h>
+#include "dist.h"
#include "hashmapkma.h"
#include "matrix.h"
#include "pherror.h"
#include "runkma.h"
+#include "threader.h"
+#include "tmp.h"
#define missArg(opt) fprintf(stderr, "Missing argument at %s.\n", opt); exit(1);
#define invaArg(opt) fprintf(stderr, "Invalid value parsed at %s.\n", opt); exit(1);
@@ -111,7 +116,6 @@ HashMapKMA * loadValues(const char *filename) {
return 0;
}
dest->values_s = (short unsigned *)(dest->values);
- dest->shmFlag |= 2;
/* check for megaMap */
if(dest->exist) {
@@ -147,7 +151,6 @@ HashMapKMA * loadValues(const char *filename) {
return 0;
}
dest->exist_l = (long unsigned *)(dest->value_index);
- dest->shmFlag |= 1;
fclose(file);
return dest;
@@ -162,7 +165,7 @@ void destroyValues(HashMapKMA *src) {
void kmerSimilarity(HashMapKMA *DB, Matrix *Dist, int *N) {
- int i, j, el, vs, **D;
+ int i, j, el, vs, v_i, **D, *Di;
unsigned *exist, *values_i, *values_j;
long unsigned n, pos, *exist_l;
short unsigned *values_si, *values_sj;
@@ -186,10 +189,12 @@ void kmerSimilarity(HashMapKMA *DB, Matrix *Dist, int *N) {
while(--i) {
j = i;
values_sj = --values_si;
+ v_i = *values_si - 1;
+ Di = D[v_i];
while(--j) {
- ++D[*values_si - 1][*--values_sj - 1];
+ ++Di[*--values_sj - 1];
}
- ++N[*values_si - 1];
+ ++N[v_i];
}
} else {
values_i = DB->values + pos;
@@ -198,10 +203,12 @@ void kmerSimilarity(HashMapKMA *DB, Matrix *Dist, int *N) {
while(--i) {
j = i;
values_j = --values_i;
+ v_i = *values_i - 1;
+ Di = D[v_i];
while(--j) {
- ++D[*values_i - 1][*--values_j - 1];
+ ++Di[*--values_j - 1];
}
- ++N[*values_i - 1];
+ ++N[v_i];
}
}
--n;
@@ -209,192 +216,555 @@ void kmerSimilarity(HashMapKMA *DB, Matrix *Dist, int *N) {
}
Dist->n = DB->DB_size - 1;
+ destroyValues(DB);
}
-void kmerDist(FILE *outfile, Matrix *Dist, int *N, FILE *name_file, Qseqs *template_name, unsigned format) {
+void kmerSimilarity_thread(HashMapKMA *DB, Matrix *Dist, int *N, int thread_num, volatile int *lock) {
- int i, j, *D;
- char *name;
+ static volatile int thread_wait = 0;
+ static volatile long unsigned next;
+ static long unsigned n;
+ int i, j, el, vs, v_i, chunk, chunkster, **D, *Di, *Dptr;
+ unsigned *exist, *values_i, *values_j;
+ long unsigned pos, size, *exist_l;
+ short unsigned *values_si, *values_sj;
/* init */
- D = *(Dist->mat) - 1;
-
- if(format & 4) {
- fprintf(outfile, "#%s\n", "k-mer distance");
+ el = DB->v_index < UINT_MAX;
+ vs = DB->DB_size < USHRT_MAX;
+ D = Dist->mat;
+ chunk = 1024;
+ size = ((DB->size - 1) == DB->mask) ? DB->size : DB->n;
+
+ /* static init */
+ lock(lock);
+ if(!thread_wait) {
+ thread_wait = thread_num;
+ next = 0;
+ n = DB->n;
+ Dist->n = DB->DB_size - 1;
}
- fprintf(outfile, "%10d\n", Dist->n);
- for(i = 0; i < Dist->n; ++i) {
- name = nameLoad(template_name, name_file);
- if(format & 1) {
- fprintf(outfile, "%s", name);
- } else {
- fprintf(outfile, "%-10.10s", name);
+ unlock(lock);
+
+ /* get values */
+ while(n && chunk) {
+ /* get next chunk */
+ lock(lock);
+ pos = next;
+ if(size < (next += chunk)) {
+ next = size;
}
+ unlock(lock);
- for(j = 0; j < i; ++j) {
- fprintf(outfile, "\t%d", N[i] + N[j] - 2 * *++D);
+ /* update chunk */
+ exist = DB->exist + pos - 1;
+ exist_l = DB->exist_l + pos - 1;
+ if(size < pos + chunk) {
+ chunkster = size - pos + 1;
+ chunk = 0;
+ } else {
+ chunkster = (chunk + 1);
}
- fprintf(outfile, "\n");
+ while(n && --chunkster) {
+ pos = el ? *++exist : *++exist_l;
+ if(pos != 1) {
+ if(vs) {
+ values_si = DB->values_s + pos;
+ i = *values_si + 1;
+ values_si += i;
+ while(--i) {
+ j = i;
+ values_sj = --values_si;
+ v_i = *values_si - 1;
+ Di = D[v_i];
+ while(--j) {
+ Dptr = Di + (*--values_sj - 1);
+ __sync_add_and_fetch(Dptr, 1);
+ }
+ Dptr = N + v_i;
+ __sync_add_and_fetch(Dptr, 1);
+ }
+ } else {
+ values_i = DB->values + pos;
+ i = *values_i + 1;
+ values_i += i;
+ while(--i) {
+ j = i;
+ values_j = --values_i;
+ v_i = *values_i - 1;
+ Di = D[v_i];
+ while(--j) {
+ Dptr = Di + (*--values_j - 1);
+ __sync_add_and_fetch(Dptr, 1);
+ }
+ Dptr = N + v_i;
+ __sync_add_and_fetch(Dptr, 1);
+ }
+ }
+ __sync_sub_and_fetch(&n, 1);
+ }
+ }
+ }
+
+ if(__sync_sub_and_fetch(&thread_wait, 1)) {
+ wait_atomic(thread_wait);
+ } else {
+ destroyValues(DB);
}
- fseek(name_file, 0, SEEK_SET);
}
-void kmerShared(FILE *outfile, Matrix *Dist, int *N, FILE *name_file, Qseqs *template_name, unsigned format) {
+int kmerDist(int Ni, int Nj, int D) {
+ return Ni + Nj - (D << 1);
+}
+
+int kmerShared(int Ni, int Nj, int D) {
+ return D;
+}
+
+int chi2dist(int Ni, int Nj, int D) {
+ D = (Ni + Nj - (D << 1));
+ return D * D / (Ni + Nj);
+}
+
+void printIntLtdPhy(char *outfile, Matrix *Dist, int *N, FILE *name_file, Qseqs *template_name, unsigned format, int thread_num, volatile int *lock, const char *method, int (*distPtr)(int, int, int)) {
- int i, j, *D;
- char *name;
+ static volatile int thread_wait1 = 0, thread_wait2 = 0;
+ static int next_i, next_j, entrance = 0;
+ static long unsigned row_bias = 0;
+ volatile int *thread_wait;
+ int i, j, j_end, chunk, N_i, *Nj, **D, *Di;
+ char *outfile_chunk, *name;
/* init */
- D = *(Dist->mat) - 1;
-
- if(format & 4) {
- fprintf(outfile, "#%s\n", "shared k-mers");
+ D = Dist->mat;
+ chunk = 1048576;
+ lock(lock);
+ if(!row_bias) {
+ if(format & 4) {
+ row_bias += sprintf(outfile, "# %-35s\n", method);
+ }
+ row_bias += sprintf(outfile + row_bias, "%10d", Dist->n);
+ next_i = -1;
+ next_j = 0;
+ if(++entrance & 1) {
+ thread_wait1 = thread_num;
+ thread_wait = &thread_wait1;
+ } else {
+ thread_wait2 = thread_num;
+ thread_wait = &thread_wait2;
+ }
+ } else if(entrance & 1) {
+ thread_wait = &thread_wait1;
+ } else {
+ thread_wait = &thread_wait2;
}
- fprintf(outfile, "%10d\n", Dist->n);
- for(i = 0; i < Dist->n; ++i) {
- name = nameLoad(template_name, name_file);
- if(format & 1) {
- fprintf(outfile, "%s", name);
+ unlock(lock);
+
+ while(next_i < Dist->n) {
+ lock(lock);
+ /* check for new row */
+ if(next_i <= next_j) {
+ i = ++next_i;
+ if(i) {
+ row_bias += (i - 1) * 11;
+ }
+ j = 0;
+ next_j = chunk;
+ if(next_i < Dist->n) {
+ name = nameLoad(template_name, name_file);
+ if(format & 1) {
+ row_bias += sprintf(outfile + row_bias, "\n%s", name);
+ } else {
+ row_bias += sprintf(outfile + row_bias, "\n%-10.10s", name);
+ }
+ } else {
+ outfile[row_bias] = '\n';
+ ++row_bias;
+ chunk = 0;
+ }
} else {
- fprintf(outfile, "%-10.10s", name);
+ i = next_i;
+ j = next_j;
+ next_j += chunk;
}
+ outfile_chunk = outfile + row_bias + j * 11;
+ unlock(lock);
- for(j = 0; j < i; ++j) {
- fprintf(outfile, "\t%d", *++D);
+ if(chunk) {
+ N_i = N[i];
+ j_end = (i < j + chunk) ? i : (j + chunk);
+ Di = D[i] + --j;
+ Nj = N + j;
+ while(++j < j_end) {
+ outfile_chunk += sprintf(outfile_chunk, "\t%10d", distPtr(N_i, *++Nj, *++Di));
+ }
+ /* fix null character */
+ *outfile_chunk = (j == i) ? '\n' : '\t';
}
- fprintf(outfile, "\n");
}
- fseek(name_file, 0, SEEK_SET);
+
+ /* wait for matrix to finish */
+ lock(lock);
+ if(--*thread_wait) {
+ unlock(lock);
+ wait_atomic(*thread_wait);
+ } else {
+ fseek(name_file, 0, SEEK_SET);
+ row_bias = 0;
+ unlock(lock);
+ }
+}
+
+double kmerQuery(int Ni, int Nj, int D) {
+ return 100.0 * D / Ni;
+}
+
+double kmerTemplate(int Ni, int Nj, int D) {
+ return 100.0 * D / Nj;
}
-void kmerQuery(FILE *outfile, Matrix *Dist, int *N, FILE *name_file, Qseqs *template_name, unsigned format) {
+double kmerAvg(int Ni, int Nj, int D) {
+ return 200.0 * D / (Ni + Nj);
+}
+
+double kmerInvAvg(int Ni, int Nj, int D) {
+ return 100.0 - 200.0 * D / (Ni + Nj);
+}
+
+double kmerJaccardDist(int Ni, int Nj, int D) {
+ return 1.0 - (double)(D) / (Ni + Nj - D);
+}
+
+double kmerJaccardSim(int Ni, int Nj, int D) {
+ return (double)(D) / (Ni + Nj - D);
+}
+
+double kmerCosineDist(int Ni, int Nj, int D) {
+ return 1.0 - (double)(D) / (Ni + Nj);
+}
+
+double kmerCosineSim(int Ni, int Nj, int D) {
+ return (double)(D) / (Ni + Nj);
+}
+
+double kmerOverlapCoef(int Ni, int Nj, int D) {
+ return (double)(D) / (Ni < Nj ? Ni : Nj);
+}
+
+double kmerInvOverlapCoef(int Ni, int Nj, int D) {
+ return 1.0 - (double)(D) / (Ni < Nj ? Ni : Nj);
+}
+
+void printDoublePhy(char *outfile, Matrix *Dist, int *N, FILE *name_file, Qseqs *template_name, unsigned format, const char *formatString, int ltd, int thread_num, volatile int *lock, const char *method, double (*distPtr)(int, int, int)) {
- int i, j, **D;
- double d;
- char *name;
+ static volatile int thread_wait1 = 0, thread_wait2 = 0;
+ static int next_i, next_j, entrance = 0;
+ static long unsigned row_bias = 0;
+ volatile int *thread_wait;
+ int i, j, j_end, chunk, N_i, *Nj, **D, *Di;
+ char endChar, *outfile_chunk, *name;
/* init */
D = Dist->mat;
-
- if(format & 4) {
- fprintf(outfile, "#%s\n", "Query k-mer coverage");
+ chunk = 1048576;
+ lock(lock);
+ if(!row_bias) {
+ if(format & 4) {
+ row_bias += sprintf(outfile, "# %-35s\n", method);
+ }
+ row_bias += sprintf(outfile + row_bias, "%10d", Dist->n);
+ next_i = -1;
+ next_j = Dist->n;
+ if(++entrance & 1) {
+ thread_wait1 = thread_num;
+ thread_wait = &thread_wait1;
+ } else {
+ thread_wait2 = thread_num;
+ thread_wait = &thread_wait2;
+ }
+ } else if(entrance & 1) {
+ thread_wait = &thread_wait1;
+ } else {
+ thread_wait = &thread_wait2;
}
- fprintf(outfile, "%10d\n", Dist->n);
- for(i = 0; i < Dist->n; ++i) {
- name = nameLoad(template_name, name_file);
- if(format & 1) {
- fprintf(outfile, "%s", name);
+ unlock(lock);
+
+ while(next_i < Dist->n) {
+ lock(lock);
+ /* check for new row */
+ if((ltd && next_i <= next_j) || (Dist->n <= next_j)) {
+ i = ++next_i;
+ if(i) {
+ row_bias += (ltd ? (i - 1) : Dist->n) * 11;
+ }
+ j = 0;
+ next_j = chunk;
+ if(next_i < Dist->n) {
+ name = nameLoad(template_name, name_file);
+ if(format & 1) {
+ row_bias += sprintf(outfile + row_bias, "\n%s", name);
+ } else {
+ row_bias += sprintf(outfile + row_bias, "\n%-10.10s", name);
+ }
+ } else {
+ outfile[row_bias] = '\n';
+ ++row_bias;
+ chunk = 0;
+ }
} else {
- fprintf(outfile, "%-10.10s", name);
+ i = next_i;
+ j = next_j;
+ next_j += chunk;
}
+ outfile_chunk = outfile + row_bias + j * 11;
+ unlock(lock);
- for(j = 0; j < Dist->n; ++j) {
- if(i == j) {
- fprintf(outfile, "\t%.2f", 100.0);
+ if(chunk) {
+ if(ltd) {
+ j_end = (i < j + chunk) ? i : (j + chunk);
+ endChar = (j_end == i) ? '\n' : '\t';
} else {
- d = i < j ? D[j][i] : D[i][j];
- fprintf(outfile, "\t%.2f", 100.0 * d / N[i]);
+ j_end = (Dist->n < j + chunk) ? Dist->n : (j + chunk);
+ endChar = (j_end == Dist->n) ? '\n' : '\t';
}
+ N_i = N[i];
+ Di = D[i] + --j;
+ Nj = N + j;
+
+ if(j_end < i) {
+ while(++j < j_end) {
+ outfile_chunk += sprintf(outfile_chunk, formatString, distPtr(N_i, *++Nj, *++Di));
+ }
+ } else if(i < j) {
+ while(++j < j_end) {
+ outfile_chunk += sprintf(outfile_chunk, formatString, distPtr(N_i, *++Nj, D[j][i]));
+ }
+ } else {
+ while(++j < j_end) {
+ if(j < i) {
+ outfile_chunk += sprintf(outfile_chunk, formatString, distPtr(N_i, *++Nj, *++Di));
+ } else if(i < j) {
+ outfile_chunk += sprintf(outfile_chunk, formatString, distPtr(N_i, *++Nj, D[j][i]));
+ } else {
+ outfile_chunk += sprintf(outfile_chunk, formatString, 100.0);
+ ++Nj;
+ }
+ }
+ }
+ /* fix null character */
+ *outfile_chunk = endChar;
}
- fprintf(outfile, "\n");
}
- fseek(name_file, 0, SEEK_SET);
+
+ /* wait for matrix to finish */
+ lock(lock);
+ if(--*thread_wait) {
+ unlock(lock);
+ wait_atomic(*thread_wait);
+ } else {
+ fseek(name_file, 0, SEEK_SET);
+ row_bias = 0;
+ unlock(lock);
+ }
}
-void kmerTemplate(FILE *outfile, Matrix *Dist, int *N, FILE *name_file, Qseqs *template_name, unsigned format) {
-
- int i, j, **D;
- double d;
- char *name;
+long unsigned getPhySize(int flag, int format, long unsigned n, long unsigned *ltdMat, long unsigned *covMat, FILE *name_file) {
- /* init */
- D = Dist->mat;
+ long unsigned size;
+ /* get name name */
+ if(format & 1) {
+ fseek(name_file, 0, SEEK_END);
+ size = ftell(name_file);
+ rewind(name_file);
+ } else {
+ size = n * 11;
+ }
if(format & 4) {
- fprintf(outfile, "#%s\n", "Template k-mer coverage");
+ size += 38;
}
- fprintf(outfile, "%10d\n", Dist->n);
- for(i = 0; i < Dist->n; ++i) {
- name = nameLoad(template_name, name_file);
- if(format & 1) {
- fprintf(outfile, "%s", name);
- } else {
- fprintf(outfile, "%-10.10s", name);
- }
-
- for(j = 0; j < Dist->n; ++j) {
- if(i == j) {
- fprintf(outfile, "\t%.2f", 100.0);
- } else {
- d = i < j ? D[j][i] : D[i][j];
- fprintf(outfile, "\t%.2f", 100.0 * d / N[j]);
- }
- }
- fprintf(outfile, "\n");
+
+ /* phy size */
+ size += 11;
+
+ /* ltd cell size */
+ *ltdMat = size + (((n - 1) * (n - 2)) >> 1) * 11;
+
+ /* cov cell size */
+ *covMat = size + (n - 1) * (n - 1) * 11;
+
+ /* get number of matrices to make */
+ size = 0;
+ if(flag & 4) {
+ size += *covMat;
+ flag ^= 4;
}
- fseek(name_file, 0, SEEK_SET);
+ if(flag & 8) {
+ size += *covMat;
+ flag ^= 8;
+ }
+ n = 0;
+ while(flag) {
+ n += flag & 1;
+ flag >>= 1;
+ }
+
+ return size + n * *ltdMat;
}
-void kmerAvg(FILE *outfile, Matrix *Dist, int *N, FILE *name_file, Qseqs *template_name, unsigned format) {
+char * mfile(FILE *outfile, long unsigned size) {
- int i, j, *D;
- char *name;
+ char *outfileM;
+
+ if(fseek(outfile, size - 1, SEEK_SET) || putc(0, outfile) == EOF) {
+ ERROR();
+ }
+ rewind(outfile);
+ outfileM = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fileno(outfile), 0);
+ if(outfileM == MAP_FAILED) {
+ ERROR();
+ }
+ posix_madvise(outfileM, size, POSIX_MADV_SEQUENTIAL);
+
+ return outfileM;
+}
+
+void * threadDist(void *arg) {
+
+ static volatile int lock[1] = {0};
+ DistThread *thread = arg;
+ int flag, format, thread_num, *N;
+ long unsigned ltdMat, covMat;
+ char *outfileM;
+ FILE *name_file;
+ HashMapKMA *DB;
+ Matrix *Dist;
+ Qseqs *template_name;
/* init */
- D = *(Dist->mat) - 1;
+ flag = thread->flag;
+ format = thread->format;
+ thread_num = thread->thread_num;
+ N = thread->N;
+ ltdMat = thread->ltdMat;
+ covMat = thread->covMat;
+ outfileM = thread->outfileM;
+ name_file = thread->name_file;
+ DB = thread->DB;
+ Dist = thread->Dist;
+ template_name = thread->template_name;
- if(format & 4) {
- fprintf(outfile, "#%s\n", "Avg. k-mer coverage");
+ /* get kmer similarities and lengths */
+ if(thread_num != 1) {
+ kmerSimilarity_thread(DB, Dist, N, thread_num, lock);
+ } else {
+ kmerSimilarity(DB, Dist, N);
}
- fprintf(outfile, "%10d\n", Dist->n);
- for(i = 0; i < Dist->n; ++i) {
- name = nameLoad(template_name, name_file);
- if(format & 1) {
- fprintf(outfile, "%s", name);
- } else {
- fprintf(outfile, "%-10.10s", name);
- }
-
- for(j = 0; j < i; ++j) {
- fprintf(outfile, "\t%.2f", 200.0 * *++D / (N[i] + N[j]));
- }
- fprintf(outfile, "\n");
+
+ /* k-mer dist, lt */
+ if(flag & 1) {
+ printIntLtdPhy(outfileM, Dist, N, name_file, template_name, format, thread_num, lock, "k-mer distance", &kmerDist);
+ outfileM += ltdMat;
+ }
+
+ /* k-mer shared, lt */
+ if(flag & 2) {
+ printIntLtdPhy(outfileM, Dist, N, name_file, template_name, format, thread_num, lock, "shared k-mers", &kmerShared);
+ outfileM += ltdMat;
+ }
+
+ /* query cov, asym */
+ if(flag & 4) {
+ printDoublePhy(outfileM, Dist, N, name_file, template_name, format, "\t%10.6f", 0, thread_num, lock, "Query k-mer coverage [%]", &kmerQuery);
+ outfileM += covMat;
+ }
+
+ /* template cov, asym */
+ if(flag & 8) {
+ printDoublePhy(outfileM, Dist, N, name_file, template_name, format, "\t%10.6f", 0, thread_num, lock, "Template k-mer coverage [%]", &kmerTemplate);
+ outfileM += covMat;
+ }
+
+ /* avg. cov, lt */
+ if(flag & 16) {
+ printDoublePhy(outfileM, Dist, N, name_file, template_name, format, "\t%10.6f", 1, thread_num, lock, "Avg. k-mer coverage [%]", &kmerAvg);
+ outfileM += ltdMat;
+ }
+
+ /* inv. avg. cov, lt */
+ if(flag & 32) {
+ printDoublePhy(outfileM, Dist, N, name_file, template_name, format, "\t%10.6f", 1, thread_num, lock, "Inverse Avg. k-mer coverage", &kmerInvAvg);
+ outfileM += ltdMat;
+ }
+
+ /* Jaccard dist */
+ if(flag & 64) {
+ printDoublePhy(outfileM, Dist, N, name_file, template_name, format, "\t%.8f", 1, thread_num, lock, "Jaccard Distance", &kmerJaccardDist);
+ outfileM += ltdMat;
+ }
+
+ /* Jaccard similarity */
+ if(flag & 128) {
+ printDoublePhy(outfileM, Dist, N, name_file, template_name, format, "\t%.8f", 1, thread_num, lock, "Jaccard Similarity", &kmerJaccardSim);
+ outfileM += ltdMat;
+ }
+
+ /* Cosine dist */
+ if(flag & 256) {
+ printDoublePhy(outfileM, Dist, N, name_file, template_name, format, "\t%.8f", 1, thread_num, lock, "Cosine distance", &kmerCosineDist);
+ outfileM += ltdMat;
}
- fseek(name_file, 0, SEEK_SET);
+
+ /* Cosine similarity */
+ if(flag & 512) {
+ printDoublePhy(outfileM, Dist, N, name_file, template_name, format, "\t%.8f", 1, thread_num, lock, "Cosine similarity", &kmerCosineSim);
+ outfileM += ltdMat;
+ }
+
+ /* Szymkiewicz–Simpson similarity */
+ if(flag & 1024) {
+ printDoublePhy(outfileM, Dist, N, name_file, template_name, format, "\t%.8f", 1, thread_num, lock, "Szymkiewicz–Simpson similarity", &kmerOverlapCoef);
+ outfileM += ltdMat;
+ }
+
+ /* Szymkiewicz–Simpson dissimilarity */
+ if(flag & 2048) {
+ printDoublePhy(outfileM, Dist, N, name_file, template_name, format, "\t%.8f", 1, thread_num, lock, "Szymkiewicz–Simpson dissimilarity", &kmerInvOverlapCoef);
+ outfileM += ltdMat;
+ }
+
+ /* Chi-square distance */
+ if(flag & 4096) {
+ printIntLtdPhy(outfileM, Dist, N, name_file, template_name, format, thread_num, lock, "Chi-square distance", &chi2dist);
+ outfileM += ltdMat;
+ }
+
+ return NULL;
}
-void runDist(char *templatefilename, char *outputfilename, int flag, int format) {
+void runDist(char *templatefilename, char *outputfilename, int flag, int format, int disk, int thread_num) {
- int file_len, *N;
+ int i, file_len, *N;
unsigned DB_size;
+ long unsigned out_size, ltdMat, covMat;
+ char *outfileM;
FILE *outfile, *name_file;
+ DistThread *thread, *threads;
HashMapKMA *DB;
Matrix *Dist;
Qseqs *template_name;
/* init */
- if(*outputfilename == '-' && outputfilename[1] == '-' && outputfilename[2] == 0) {
- outfile = stdout;
- } else {
- outfile = sfopen(outputfilename, "wb");
- }
+ outfile = sfopen(outputfilename, "wb+");
file_len = strlen(templatefilename);
+
+ /* load k-mer links from KMA DB */
strcpy(templatefilename + file_len, ".comp.b");
if(!(DB = loadValues(templatefilename))) {
fprintf(stderr, "Wrong format of DB.\n");
exit(1);
}
templatefilename[file_len] = 0;
- DB_size = DB->DB_size;
- N = calloc(DB_size, sizeof(int));
- if(!N) {
- ERROR();
- }
- Dist = ltdMatrix_init(DB_size);
-
- /* get kmer similarities and lengths */
- kmerSimilarity(DB, Dist, N);
- destroyValues(DB);
/* load names */
strcpy(templatefilename + file_len, ".name");
@@ -402,36 +772,75 @@ void runDist(char *templatefilename, char *outputfilename, int flag, int format)
templatefilename[file_len] = 0;
template_name = setQseqs(1024);
- /* k-mer dist, lt */
- if(flag & 1) {
- kmerDist(outfile, Dist, N, name_file, template_name, format);
- }
+ /* allocate output file */
+ DB_size = DB->DB_size;
+ out_size = getPhySize(flag, format, DB_size, <dMat, &covMat, name_file);
+ outfileM = mfile(outfile, out_size);
- /* k-mer shared, lt */
- if(flag & 2) {
- kmerShared(outfile, Dist, N, name_file, template_name, format);
+ /* allocate matrices */
+ if(!(N = calloc(DB_size, sizeof(int)))) {
+ ERROR();
}
-
- /* query cov, asym */
- if(flag & 4) {
- kmerQuery(outfile, Dist, N, name_file, template_name, format);
+ if(disk) {
+ Dist = ltdMatrix_minit(DB_size);
+ } else {
+ Dist = ltdMatrix_init(DB_size);
}
- /* template cov, asym */
- if(flag & 8) {
- kmerTemplate(outfile, Dist, N, name_file, template_name, format);
+ /* thread out */
+ thread = 0;
+ threads = 0;
+ i = thread_num;
+ while(i--) {
+ /* init */
+ thread = smalloc(sizeof(DistThread));
+ thread->flag = flag;
+ thread->format = format;
+ thread->thread_num = thread_num;
+ thread->N = N;
+ thread->ltdMat = ltdMat;
+ thread->covMat = covMat;
+ thread->outfileM = outfileM;
+ thread->name_file = name_file;
+ thread->DB = DB;
+ thread->Dist = Dist;
+ thread->template_name = template_name;
+ thread->next = threads;
+ threads = thread;
+ /* start */
+ if(i && (errno = pthread_create(&thread->id, NULL, &threadDist, thread))) {
+ fprintf(stderr, "Error: %d (%s)\n", errno, strerror(errno));
+ fprintf(stderr, "Will continue with %d threads.\n", thread_num - i);
+ i = 0;
+ }
}
-
- /* avg. cov, lt */
- if(flag & 16) {
- kmerAvg(outfile, Dist, N, name_file, template_name, format);
+ /* start main thread */
+ thread->id = 0;
+ threadDist(thread);
+
+ /* join threads */
+ while((thread = thread->next)) {
+ /* join thread */
+ if((errno = pthread_join(thread->id, NULL))) {
+ ERROR();
+ }
}
/* clean */
+ while(threads) {
+ thread = threads->next;
+ free(threads);
+ threads = thread;
+ }
+ munmap(outfileM - out_size, out_size);
fclose(outfile);
fclose(name_file);
free(N);
- Matrix_destroy(Dist);
+ if(disk) {
+ Matrix_mdestroy(Dist);
+ } else {
+ Matrix_destroy(Dist);
+ }
destroyQseqs(template_name);
}
@@ -440,11 +849,14 @@ static int helpMessage(FILE *out) {
fprintf(out, "#kma dist calculates distances between templates from a kma index\n");
fprintf(out, "# %16s\t%-32s\t%s\n", "Options are:", "Desc:", "Default:");
fprintf(out, "# %16s\t%-32s\t%s\n", "-t_db", "Template DB", "");
- fprintf(out, "# %16s\t%-32s\t%s\n", "-o", "Output file", "stdout");
+ fprintf(out, "# %16s\t%-32s\t%s\n", "-o", "Output file", "DB");
fprintf(out, "# %16s\t%-32s\t%s\n", "-f", "Output flags", "1");
fprintf(out, "# %16s\t%-32s\t%s\n", "-fh", "Help on option \"-f\"", "");
fprintf(out, "# %16s\t%-32s\t%s\n", "-d", "Distance method", "1");
fprintf(out, "# %16s\t%-32s\t%s\n", "-dh", "Help on option \"-d\"", "");
+ fprintf(out, "# %16s\t%-32s\t%s\n", "-m", "Allocate matrix on the disk", "False");
+ fprintf(out, "# %16s\t%-32s\t%s\n", "-tmp", "Set directory for temporary files", "");
+ fprintf(out, "# %16s\t%-32s\t%s\n", "-t", "Number of threads", "1");
fprintf(out, "# %16s\t%-32s\t%s\n", "-h", "Shows this helpmessage", "");
return (out == stderr);
}
@@ -452,15 +864,19 @@ static int helpMessage(FILE *out) {
/* main */
int dist_main(int argc, char *argv[]) {
- int args, flag, format, file_len;
+ int args, flag, format, mmap, thread_num, file_len;
char *arg, *errorMsg, *templatefilename, *outputfilename;
/* init */
flag = 1;
format = 1;
+ mmap = 0;
+ thread_num = 1;
+ file_len = 0;
templatefilename = 0;
outputfilename = 0;
+ /* parse cmd-line */
args = 1;
while(args < argc) {
arg = argv[args];
@@ -491,29 +907,55 @@ int dist_main(int argc, char *argv[]) {
} else if(strcmp(arg, "fh") == 0) {
fprintf(stdout, "# Format flags output, add them to combine them.\n");
fprintf(stdout, "#\n");
- fprintf(stdout, "# 1:\tRelaxed Phylip\n");
- fprintf(stdout, "# 4:\tInclude template name in phylip file\n");
- fprintf(stdout, "#\n");
+ fprintf(stdout, "#%9d\t%s\n", 1, "Relaxed Phylip");
+ fprintf(stdout, "#%9d\t%s\n", 4, "Include distance method(s) in phylip file");
return 0;
} else if(strcmp(arg, "d") == 0) {
if(++args < argc) {
flag = strtoul(argv[args], &errorMsg, 10);
if(*errorMsg != 0) {
- invaArg("\"-f\"");
+ invaArg("\"-d\"");
}
} else {
missArg("\"-d\"");
}
} else if(strcmp(arg, "dh") == 0) {
- fprintf(stdout, "# Distance calculation methods, add them to combine them:\n");
+ fprintf(stdout, "# Distance / Similarity calculation methods, add them to combine them:\n");
fprintf(stdout, "#\n");
- fprintf(stdout, "# 1:\tk-mer distance\n");
- fprintf(stdout, "# 2:\tShared k-mers\n");
- fprintf(stdout, "# 4:\tk-mer query coverage\n");
- fprintf(stdout, "# 8:\tk-mer template coverage\n");
- fprintf(stdout, "# 16:\tk-mer avg. coverage\n");
+ fprintf(stdout, "#%9d\t%s\n", 1, "k-mer hamming distance");
+ fprintf(stdout, "#%9d\t%s\n", 2, "Shared k-mers");
+ fprintf(stdout, "#%9d\t%s\n", 4, "k-mer query coverage");
+ fprintf(stdout, "#%9d\t%s\n", 8, "k-mer template coverage");
+ fprintf(stdout, "#%9d\t%s\n", 16, "k-mer avg. coverage");
+ fprintf(stdout, "#%9d\t%s\n", 32, "k-mer inv. avg. coverage");
+ fprintf(stdout, "#%9d\t%s\n", 64, "Jaccard distance");
+ fprintf(stdout, "#%9d\t%s\n", 128, "Jaccard similarity");
+ fprintf(stdout, "#%9d\t%s\n", 256, "Cosine distance");
+ fprintf(stdout, "#%9d\t%s\n", 512, "Cosine similarity");
+ fprintf(stdout, "#%9d\t%s\n", 1024, "Szymkiewicz–Simpson similarity");
+ fprintf(stdout, "#%9d\t%s\n", 2048, "Szymkiewicz–Simpson dissimilarity");
+ fprintf(stdout, "#%9d\t%s\n", 4096, "Chi-square distance");
fprintf(stdout, "#\n");
return 0;
+ } else if(strcmp(arg, "m") == 0) {
+ mmap = 1;
+ } else if(strcmp(argv[args], "-tmp") == 0) {
+ if(++args < argc) {
+ if(argv[args][0] != '-') {
+ tmpF(argv[args]);
+ } else {
+ invaArg("\"-tmp\"");
+ }
+ }
+ } else if(strcmp(argv[args], "-t") == 0) {
+ if(++args < argc) {
+ thread_num = strtoul(argv[args], &errorMsg, 10);
+ if(*errorMsg != 0) {
+ invaArg("\"-t\"");
+ }
+ } else {
+ missArg("\"-t\"");
+ }
} else if(strcmp(arg, "h") == 0) {
return helpMessage(stdout);
} else {
@@ -532,10 +974,11 @@ int dist_main(int argc, char *argv[]) {
exit(1);
}
if(!outputfilename) {
- outputfilename = "--";
+ outputfilename = smalloc(file_len + 64);
+ sprintf(outputfilename, "%s.phy", templatefilename);
}
- runDist(templatefilename, outputfilename, flag, format);
+ runDist(templatefilename, outputfilename, flag, format, mmap, thread_num);
return 0;
}
=====================================
dist.h
=====================================
@@ -19,6 +19,7 @@
#define _XOPEN_SOURCE 600
#include <limits.h>
+#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -27,13 +28,47 @@
#include "pherror.h"
#include "runkma.h"
+#ifndef DIST
+typedef struct distThread DistThread;
+struct distThread {
+ pthread_t id;
+ int flag;
+ int format;
+ int thread_num;
+ int *N;
+ long unsigned ltdMat;
+ long unsigned covMat;
+ char *outfileM;
+ FILE *name_file;
+ HashMapKMA *DB;
+ Matrix *Dist;
+ Qseqs *template_name;
+ struct distThread *next;
+};
+#define DIST 1
+#endif
+
HashMapKMA * loadValues(const char *filename);
void destroyValues(HashMapKMA *src);
void kmerSimilarity(HashMapKMA *DB, Matrix *Dist, int *N);
-void kmerDist(FILE *outfile, Matrix *Dist, int *N, FILE *name_file, Qseqs *template_name, unsigned format);
-void kmerShared(FILE *outfile, Matrix *Dist, int *N, FILE *name_file, Qseqs *template_name, unsigned format);
-void kmerQuery(FILE *outfile, Matrix *Dist, int *N, FILE *name_file, Qseqs *template_name, unsigned format);
-void kmerTemplate(FILE *outfile, Matrix *Dist, int *N, FILE *name_file, Qseqs *template_name, unsigned format);
-void kmerAvg(FILE *outfile, Matrix *Dist, int *N, FILE *name_file, Qseqs *template_name, unsigned format);
-void runDist(char *templatefilename, char *outputfilename, int flag, int format);
+void kmerSimilarity_thread(HashMapKMA *DB, Matrix *Dist, int *N, int thread_num, volatile int *lock);
+int kmerDist(int Ni, int Nj, int D);
+int kmerShared(int Ni, int Nj, int D);
+int chi2dist(int Ni, int Nj, int D);
+void printIntLtdPhy(char *outfile, Matrix *Dist, int *N, FILE *name_file, Qseqs *template_name, unsigned format, int thread_num, volatile int *lock, const char *method, int (*distPtr)(int, int, int));
+double kmerQuery(int Ni, int Nj, int D);
+double kmerTemplate(int Ni, int Nj, int D);
+double kmerAvg(int Ni, int Nj, int D);
+double kmerInvAvg(int Ni, int Nj, int D);
+double kmerJaccardDist(int Ni, int Nj, int D);
+double kmerJaccardSim(int Ni, int Nj, int D);
+double kmerCosineDist(int Ni, int Nj, int D);
+double kmerCosineSim(int Ni, int Nj, int D);
+double kmerOverlapCoef(int Ni, int Nj, int D);
+double kmerInvOverlapCoef(int Ni, int Nj, int D);
+void printDoublePhy(char *outfile, Matrix *Dist, int *N, FILE *name_file, Qseqs *template_name, unsigned format, const char *formatString, int ltd, int thread_num, volatile int *lock, const char *method, double (*distPtr)(int, int, int));
+long unsigned getPhySize(int flag, int format, long unsigned n, long unsigned *ltdMat, long unsigned *covMat, FILE *name_file);
+char * mfile(FILE *outfile, long unsigned size);
+void * threadDist(void *arg);
+void runDist(char *templatefilename, char *outputfilename, int flag, int format, int disk, int thread_num);
int dist_main(int argc, char *argv[]);
=====================================
kmers.c
=====================================
@@ -104,18 +104,28 @@ int save_kmers_batch(char *templatefilename, char *exePrev, unsigned shm, int th
deConPrintPtr = printPtr;
}
if(templates->prefix_len == 0 && get_kmers_for_pair_ptr != &get_kmers_for_pair_count) {
+ /* here */
+ /*
if(kmerScan == &save_kmers) {
kmerScan = &save_kmers_pseuodeSparse;
} else {
kmerScan = &save_kmers_sparse_chain;
}
+ */
+ kmerScan = &save_kmers_pseuodeSparse;
+
get_kmers_for_pair_ptr = &get_kmers_for_pair_pseoudoSparse;
} else {
+ /* here */
+ /*
if(kmerScan == &save_kmers) {
kmerScan = &save_kmers_Sparse;
} else {
kmerScan = &save_kmers_sparse_chain;
}
+ */
+ kmerScan = &save_kmers_Sparse;
+
get_kmers_for_pair_ptr = &get_kmers_for_pair_Sparse;
}
}
=====================================
matrix.c
=====================================
@@ -17,9 +17,15 @@
* limitations under the License.
*/
+#define _XOPEN_SOURCE 600
+#include <limits.h>
#include <stdlib.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/param.h>
#include "matrix.h"
#include "pherror.h"
+#include "tmp.h"
Matrix * matrix_init(unsigned size) {
@@ -54,7 +60,7 @@ Matrix * ltdMatrix_init(unsigned size) {
dest->n = 0;
dest->size = size;
dest->mat = smalloc(size * sizeof(int *));
- *(dest->mat) = calloc(size * size / 2, sizeof(int));
+ *(dest->mat) = calloc(size * (size - 1) / 2, sizeof(int));
if(!*(dest->mat)) {
ERROR();
}
@@ -72,11 +78,50 @@ Matrix * ltdMatrix_init(unsigned size) {
return dest;
}
+Matrix * ltdMatrix_minit(long unsigned size) {
+
+ int i, n, **ptr, *src;
+ FILE *tmp;
+ Matrix *dest;
+
+ dest = smalloc(sizeof(Matrix));
+ dest->n = 0;
+ dest->size = size;
+ dest->mat = smalloc(size * sizeof(int *));
+ n = size;
+ size = size * (size - 1) * sizeof(int) / 2;
+
+ /* get matrix */
+ tmp = tmpF(0);
+ if(fseek(tmp, size - 1, SEEK_SET) || putc(0, tmp) == EOF) {
+ ERROR();
+ }
+ fflush(tmp);
+ fseek(tmp, 0, SEEK_SET);
+ *(dest->mat) = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fileno(tmp), 0);
+ if(*(dest->mat) == MAP_FAILED) {
+ ERROR();
+ }
+ posix_madvise(*(dest->mat), size, POSIX_MADV_SEQUENTIAL);
+
+ /* set matrix rows */
+ ptr = dest->mat;
+ src = *ptr;
+ i = 0;
+ *ptr++ = src;
+ while(--n) {
+ *ptr++ = src + i;
+ src += i++;
+ }
+
+ return dest;
+}
+
void ltdMatrix_realloc(Matrix *src, unsigned size) {
int i, **ptr, *mat;
- *(src->mat) = realloc(*(src->mat), size * size * sizeof(int) / 2);
+ *(src->mat) = realloc(*(src->mat), size * (size - 1) * sizeof(int) / 2);
src->mat = realloc(src->mat, size * sizeof(int *));
if(!src->mat || !*(src->mat)) {
ERROR();
@@ -96,6 +141,14 @@ void ltdMatrix_realloc(Matrix *src, unsigned size) {
void Matrix_destroy(Matrix *src) {
+ free(*(src->mat));
+ free(src->mat);
+ free(src);
+}
+
+void Matrix_mdestroy(Matrix *src) {
+
+ munmap(*(src->mat), src->size * src->size * sizeof(int) / 2);
free(src->mat);
free(src);
}
=====================================
matrix.h
=====================================
@@ -29,7 +29,9 @@ struct matrix {
Matrix * matrix_init(unsigned size);
Matrix * ltdMatrix_init(unsigned size);
+Matrix * ltdMatrix_minit(long unsigned size);
void ltdMatrix_realloc(Matrix *src, unsigned size);
void Matrix_destroy(Matrix *src);
+void Matrix_mdestroy(Matrix *src);
void ltdMatrix_popArrange(Matrix *mat, unsigned pos);
int ltdMatrix_add(Matrix *src);
=====================================
runkma.c
=====================================
@@ -1250,6 +1250,7 @@ int runKMA(char *templatefilename, char *outputfilename, char *exePrev, int ConC
}
} else {
if(sam || ID_t == 0.0) {
+ thread->template_index = templates_index[template];
thread->template_name = nameLoad(template_name, name_file);
thread->template = template;
assembly_KMA_Ptr(thread);
=====================================
savekmers.c
=====================================
@@ -37,7 +37,7 @@
#include "threader.h"
void (*ankerPtr)(int*, int*, int*, char*, int*, unsigned**, unsigned**, int*, CompDNA*, int, int, int, int, Qseqs*, volatile int*, FILE*) = &ankerAndClean;
-int (*kmerScan)(const HashMapKMA *, const Penalties *, int*, int*, int*, int*, CompDNA*, CompDNA*, const Qseqs*, int*, const int, volatile int*, FILE*) = &save_kmers_chain; //&save_kmers_HMM;
+int (*kmerScan)(const HashMapKMA *, const Penalties *, int*, int*, int*, int*, CompDNA*, CompDNA*, const Qseqs*, int*, const int, volatile int*, FILE*) = &save_kmers_HMM; /* here */ //&save_kmers_chain;
int (*save_kmers_pair)(const HashMapKMA *, const Penalties *, int*, int*, int*, int*, int*, int*, CompDNA*, CompDNA*, const Qseqs*, const Qseqs*, int*, const int, volatile int*, FILE*) = &save_kmers_unionPair;
int (*get_kmers_for_pair_ptr)(const HashMapKMA *, const Penalties *, int *, int *, int *, int *, CompDNA *, int *, int) = &get_kmers_for_pair;
int (*getMatch)(int*, int*) = &getBestMatch;
=====================================
version.h
=====================================
@@ -17,4 +17,4 @@
* limitations under the License.
*/
-#define KMA_VERSION "1.3.0"
+#define KMA_VERSION "1.3.2"
View it on GitLab: https://salsa.debian.org/med-team/kma/-/compare/4df71cc73eca450d82ba52c25b328e547d33617d...4ad339db652cf28df3ea7bed5195c7b772b473b1
--
View it on GitLab: https://salsa.debian.org/med-team/kma/-/compare/4df71cc73eca450d82ba52c25b328e547d33617d...4ad339db652cf28df3ea7bed5195c7b772b473b1
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200713/6dc4d37a/attachment-0001.html>
More information about the debian-med-commit
mailing list