[med-svn] [Git][med-team/genometester][upstream] New upstream version 4.0+git20221122.71e6625

Andreas Tille (@tille) gitlab at salsa.debian.org
Thu Dec 11 20:33:04 GMT 2025



Andreas Tille pushed to branch upstream at Debian Med / genometester


Commits:
aa4cc7b1 by Andreas Tille at 2025-12-11T14:49:38+01:00
New upstream version 4.0+git20221122.71e6625
- - - - -


14 changed files:

- src/Makefile
- src/database.c
- src/fasta.c
- src/gassembler.c
- src/glistcompare.c
- src/glistmaker.c
- src/glistquery.c
- src/gmer_counter.c
- src/set-operations.c
- src/set-operations.h
- src/utils.c
- src/version.h
- src/word-map.c
- src/word-map.h


Changes:

=====================================
src/Makefile
=====================================
@@ -46,6 +46,7 @@ LISTQUERY_SOURCES = \
 	sequence-source.c sequence-source.h \
 	sequence-stream.c sequence-stream.h \
 	sequence-zstream.c sequence-zstream.h \
+	set-operations.c set-operations.h \
 	common.c common.h \
 	queue.c queue.h \
 	utils.c utils.h \
@@ -172,7 +173,7 @@ INCS = -I.
 BINS  = glistmaker glistquery glistcompare
 
 #CXXFLAGS = $(INCS) $(DEBUGFLAGS) -Wall 
-CXXFLAGS = $(INCS) $(RELEASEFLAGS) -Wall 
+CXXFLAGS = $(INCS) $(RELEASEFLAGS) -Wall -std=c99 -D_DEFAULT_SOURCE -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L -D_XOPEN_SOURCE
 
 .PHONY: all all-before all-after clean clean-custom
 


=====================================
src/database.c
=====================================
@@ -42,6 +42,12 @@ count_lines_from_text (const unsigned char *cdata, size_t csize, unsigned int *w
     unsigned int lengths[4];
     unsigned int ntokenz;
 
+    /* Skip comments */
+    if (cdata[cpos] == '#') {
+      while ((cpos < csize) && (cdata[cpos] != '\n')) cpos += 1;
+      if (cpos < csize) cpos += 1;
+      continue;
+    }
     ntokenz = split_line (cdata + cpos, csize - cpos, tokenz, lengths, 3);
     if (ntokenz < 2) {
       fprintf (stderr, "Line %u has <2 (%u) tokens\n", n_lines, ntokenz);
@@ -115,7 +121,7 @@ gt4_gmer_db_new_from_text (const unsigned char *cdata, unsigned long long csize,
   node_bits = get_bits (nlines + 1);
   kmer_bits = get_bits (max_kmers);
   if ((node_bits + kmer_bits) > 31) {
-    fprintf (stderr, "Too many nodes and kmers (%u (%u bits), %u (%u bits)\n", nlines + 1, max_kmers, node_bits, kmer_bits);
+    fprintf (stderr, "Too many nodes and kmers (%u (%u bits), %u (%u bits)\n", nlines + 1, node_bits, max_kmers, kmer_bits);
     return 0;
   }
   /* Set up DB */
@@ -162,6 +168,12 @@ gt4_gmer_db_new_from_text (const unsigned char *cdata, unsigned long long csize,
     unsigned int lengths[65536];
     unsigned int ntokenz, n_kmers;
     unsigned int i;
+    /* Skip comments */
+    if (cdata[cpos] == '#') {
+      while ((cpos < csize) && (cdata[cpos] != '\n')) cpos += 1;
+      if (cpos < csize) cpos += 1;
+      continue;
+    }
     /* Initialize */
     memset (db->nodes + idx, 0, sizeof (Node));
     /* Parse ID + number of kmers */
@@ -213,7 +225,7 @@ gt4_gmer_db_new_from_text (const unsigned char *cdata, unsigned long long csize,
           kmer2 = code2 & ((1 << db->kmer_bits) - 1);
           fprintf (stderr, "KMer already present (current node %u (%s) kmer %u/%u (%s) code %u) previous %u (%s) kmer %u/%u code %u\n",
             idx, db->names + db->nodes[idx].name, i, (dir != 0), word_to_string (word, db->wordsize), code,
-            idx2, db->names + db->nodes[idx2].name, kmer2, ((code2 & 0x7fffffff) != 0), code2);
+            idx2, db->names + db->nodes[idx2].name, kmer2, ((code2 & 0x80000000) != 0), code2);
           break;
         }
       }


=====================================
src/fasta.c
=====================================
@@ -252,13 +252,15 @@ fasta_reader_read_nwords (GT4FastaReader *reader, unsigned long long maxwords,
 	    nwords += 1;
 	  }
 	  /* We increase nucleotide position for N too */
-	  if (cval > ' ') {
-	    reader->seq_npos += 1;
-	  }
+	  //if (cval > ' ') {
+	  reader->seq_npos += 1;
+	  //}
 	} else if (cval >= ' ') {
 	  reader->wordfw = 0;
 	  reader->wordrv = 0;
 	  reader->currentlength = 0;
+	  /* We increase nucleotide position for N too */
+	  reader->seq_npos += 1;
 	}
       }
       reader->cpos += 1;


=====================================
src/gassembler.c
=====================================
@@ -698,6 +698,9 @@ print_usage (FILE *ofs, unsigned int advanced, int exit_value)
 static unsigned int only_chr = CHR_1;
 static unsigned int only_pos = 0;
 
+GT4Scout db_scout;
+GT4Scout seq_scout;
+
 int
 main (int argc, const char *argv[])
 {
@@ -1076,8 +1079,11 @@ main (int argc, const char *argv[])
     assemble_recursive (db, files, ref_chr, ref_start, ref_end, ref, kmers, nkmers);
   }
 
-  if (prefetch_db || prefetch_seq) {
-    delete_scouts ();
+  if (prefetch_db) {
+    gt4_delete_scout (&db_scout);
+  }
+  if (prefetch_seq) {
+    //gt4_delete_scout (&seq_scout);
   }
 
   return 0;
@@ -1098,6 +1104,9 @@ assemble_recursive (GT4GmerDB *db, SeqFile *files, unsigned int ref_chr, unsigne
   strncpy (dup, ref, len);
   adata->ref = dup;
   adata->cblock = (CallBlock *) malloc (sizeof (CallBlock));
+  memset (adata->cblock, 0, sizeof (CallBlock));
+  adata->cblock->calls = (Call *) malloc (MAX_REFERENCE_LENGTH * 2 * sizeof (Call));
+  memset (adata->cblock->calls, 0, MAX_REFERENCE_LENGTH * 2 * sizeof (Call));
   adata->cblock->chr = adata->chr;
   adata->cblock->start = adata->start;
   adata->cblock->end = adata->end;
@@ -2471,8 +2480,9 @@ load_db_or_die (const char *db_name, const char *seq_dir, const char *id)
     exit (1);
   }
   if (prefetch_db) {
-    scout_mmap (cdata, csize);
-    sleep (10);
+    db_scout.cdata = cdata;
+    db_scout.csize = csize;
+    gt4_scout_mmap (&db_scout);
   }
   db = gt4_gmer_db_new_from_binary (cdata, csize);
   if (!db) {
@@ -2533,7 +2543,7 @@ map_sequences (GT4GmerDB *db, const char *seq_dir)
         return NULL;
       }
       if (prefetch_seq) {
-        scout_mmap (files[i].cdata, files[i].csize);
+        //scout_mmap (files[i].cdata, files[i].csize);
       }
     }
   }


=====================================
src/glistcompare.c
=====================================
@@ -341,8 +341,13 @@ int main (int argc, const char *argv[])
     exit (1);
   }
   
-  if (!find_intrsec && (rule == RULE_SUBTRACT || rule == RULE_MIN || rule == RULE_FIRST || rule == RULE_SECOND)) {
-    fprintf (stderr, "Error: Rules min, subtract, fist and second can only be used with finding the intersection.\n");
+  if (!find_intrsec && (rule == RULE_MIN || rule == RULE_FIRST || rule == RULE_SECOND)) {
+    fprintf (stderr, "Error: Rules min, fist and second can only be used with finding the intersection.\n");
+    exit (1);
+  }
+
+  if ((!find_intrsec && !find_diff) && (rule == RULE_SUBTRACT)) {
+    fprintf (stderr, "Error: Rule subtract can only be used with intersection and difference.\n");
     exit (1);
   }
 


=====================================
src/glistmaker.c
=====================================
@@ -278,6 +278,15 @@ main (int argc, const char *argv[])
     exit (1);
   }
 
+  if (debug && create_index) {
+    for (i = 0; i < mq.n_sources; i++) {
+      fprintf (stderr, "%u: %s start %llu subseqs %u\n", i, i_files[mq.sources[i].file_idx].name, mq.sources[i].start, mq.sources[i].n_subseqs);
+      //for (j = 0; j < mq.sources[i].n_subseqs; j++) {
+      //  fprintf (stderr, "%llu %u\n", i_files[i].subseqs[j]->name_pos, i_files[i].subseqs[j]->name_len);
+      //}
+    }
+  }
+
   /* Do work */
   process (&mq.queue, 0, &mq);
   gt4_queue_lock (&mq.queue);
@@ -626,6 +635,27 @@ write_index (FILE *ofs, const char *loc_files[], unsigned int n_loc_files, GT4Li
   unsigned int version;
   unsigned long long n_words_loc, n_words, n_locations_loc, n_locs, file_block_loc, file_block_pos, kmer_list_loc, kmer_list_pos, locations_loc, locations_pos;
   unsigned char zero[16] = { 0 };
+
+  if (debug) {
+    for (i = 0; i < mq->n_sources; i++) {
+      unsigned int j;
+      GT4LMQSource *src = &mq->sources[i];
+      for (j = 0; j < src->n_subseqs; j++) {
+        GT4SubSequence *ss = &src->subseqs[j];
+        fprintf (stderr, "Src %u subseq %u np %llu nl %u ss %u sl %u\n", i, j, ss->name_pos, ss->name_len, ss->seq_pos, ss->seq_len);
+        unsigned long long np = src->start + ss->name_pos;
+        IFile *ifile = &i_files[src->file_idx];
+        FILE *f = fopen(ifile->name, "r");
+        fseek (f, np, SEEK_SET);
+        char b[256];
+        fread (b, ss->name_len, 1, f);
+        b[ss->name_len] = 0;
+        fclose (f);
+        fprintf (stderr, "%s\n", b);
+      }
+    }
+  }
+
   /* Determine file data */
   for (i = 0; i < n_i_files; i++) {
     GT4LMQSource *sources[1024];
@@ -652,10 +682,34 @@ write_index (FILE *ofs, const char *loc_files[], unsigned int n_loc_files, GT4Li
     for (j = 0; j < n_sources; j++) {
       unsigned int k;
       for (k = 0; k < sources[j]->n_subseqs; k++) {
-        i_files[i].subseqs[sources[j]->first_subseq + k] = &sources[j]->subseqs[k];
+        GT4SubSequence *ss = &sources[j]->subseqs[k];
+        i_files[i].subseqs[sources[j]->first_subseq + k] = ss;
+        // Adjust positions
+        ss->name_pos += sources[j]->start;
+        //fprintf (stderr, "%u %u start %llu np %llu\n", j, k, sources[j]->start, ss->name_pos);
+      }
+    }
+  }
+
+  if (debug) {
+    for (i = 0; i < n_i_files; i++) {
+      unsigned int j;
+      IFile *ifile = &i_files[i];
+      FILE *f = fopen(ifile->name, "r");
+      for (j = 0; j < ifile->n_subseqs; j++) {
+        GT4SubSequence *ss = ifile->subseqs[j];
+        fprintf (stderr, "Src %u subseq %u np %llu nl %u ss %u sl %u\n", i, j, ss->name_pos, ss->name_len, ss->seq_pos, ss->seq_len);
+        unsigned long long np = ss->name_pos;
+        fseek (f, np, SEEK_SET);
+        char b[256];
+        fread (b, ss->name_len, 1, f);
+        b[ss->name_len] = 0;
+        fprintf (stderr, "%s\n", b);
       }
+      fclose (f);
     }
   }
+
   /* Determine bitsizes */
   for (i = 0; i < mq->n_sources; i++) {
     unsigned int last_subseq = mq->sources[i].first_subseq + mq->sources[i].n_subseqs - 1;


=====================================
src/glistquery.c
=====================================
@@ -33,6 +33,7 @@
 #include "utils.h"
 #include "sequence.h"
 #include "sequence-stream.h"
+#include "set-operations.h"
 #include "fasta.h"
 #include "version.h"
 #include "index-map.h"
@@ -70,6 +71,39 @@ enum {
   FILES,
   SEQUENCES
 };
+
+typedef struct _DumpData DumpData;
+
+struct _DumpData {
+  unsigned int n_lists;
+  unsigned int wlen;
+};
+
+static unsigned int
+dump_callback (uint64_t word, uint32_t *counts, void *data)
+{
+  DumpData *dd = (DumpData *) data;
+  unsigned int i;
+  fprintf (stdout, "%s", word_to_string (word, dd->wlen));
+  for (i = 0; i < dd->n_lists; i++) {
+    fprintf (stdout, "\t%u", counts[i]);
+  }
+  fprintf (stdout, "\n");
+  return 0;
+}
+
+static void
+dump_lists (AZObject *objs[], unsigned int n_objs, unsigned int wlen, unsigned int is_union)
+{
+  DumpData dd;
+  dd.n_lists = n_objs;
+  dd.wlen = wlen;
+  if (is_union) {
+    gt4_is_union (objs, n_objs, dump_callback, &dd);
+  } else {
+    gt4_union (objs, n_objs, dump_callback, &dd);
+  }
+}
   
 int main (int argc, const char *argv[])
 {
@@ -80,11 +114,12 @@ int main (int argc, const char *argv[])
   unsigned int nmm = 0;
   unsigned int pm3 = 0;
   char *end;
-  int printall = 0;
+  int printall = 0, print_header = 0;
   unsigned int minfreq = 0, maxfreq = UINT_MAX;
   unsigned int distro = 0;
   unsigned int bloom = 0;
   unsigned int command = QUERY;
+  unsigned int is_union = 0;
 
   for (argidx = 1; argidx < argc; argidx++) {
     if (!strcmp (argv[argidx], "-v") || !strcmp (argv[argidx], "--version")) {
@@ -199,8 +234,12 @@ int main (int argc, const char *argv[])
       use_3p = 1;
     } else if (!strcmp(argv[argidx], "--5p")) {
       use_5p = 1;
+    } else if (!strcmp(argv[argidx], "--header")) {
+      print_header = 1;
     } else if (!strcmp(argv[argidx], "--bloom")) {
       bloom = 1;
+    } else if (!strcmp(argv[argidx], "--is_union")) {  
+      is_union = 1;
     } else if (!strcmp(argv[argidx], "--disable_scouts")) {  
       use_scouts = 0;
     } else if (argv[argidx][0] != '-') {
@@ -337,8 +376,19 @@ int main (int argc, const char *argv[])
 
   /* If no options is given print all lists/indices */
   if (!seqfilename && !querylistfilename && !queryfilename && !querystring) {
-    for (i = 0; i < n_lists; i++) {
-      print_full_map (maps[i], locations);
+    if (n_lists > 1) {
+      if (print_header) {
+        fprintf (stdout, "KMER");
+        for (i = 0; i < n_lists; i++) {
+          fprintf (stdout, "\t%s", lists[i]);
+        }
+        fprintf (stdout, "\n");
+      }
+      dump_lists (maps, n_lists, wlen, is_union);
+    } else {
+      for (i = 0; i < n_lists; i++) {
+        print_full_map (maps[i], locations);
+      }
     }
     exit (0);
   }
@@ -463,6 +513,8 @@ typedef struct _QueryData QueryData;
 struct _QueryData {
   GT4WordDictImplementation *dict_impl;
   GT4WordDictInstance *dict_inst;
+  GT4WordSListImplementation *slist_impl;
+  GT4WordSListInstance *slist_inst;
   GT4WordIndexImplementation *index_impl;
   GT4WordIndexInstance *index_inst;
   unsigned int n_mm;
@@ -647,6 +699,23 @@ search_fasta (AZObject *obj, const char *fname, unsigned int n_mm, unsigned int
   return result;
 }
 
+static unsigned int
+search_list_zipper (GT4WordSListImplementation *l_impl, GT4WordSListInstance *l_inst, QueryData *qd)
+{
+  gt4_word_slist_get_first_word (l_impl, l_inst);
+  gt4_word_slist_get_first_word (qd->slist_impl, qd->slist_inst);
+  while ((qd->slist_inst->idx < qd->slist_inst->num_words) && (l_inst->idx < l_inst->num_words)) {
+    while (qd->slist_inst->word < l_inst->word) {
+      gt4_word_slist_get_next_word (qd->slist_impl, qd->slist_inst);
+    }
+    if (qd->slist_inst->word == l_inst->word) {
+      cb_print (l_inst->word, l_inst->count, qd);
+    }
+    gt4_word_slist_get_next_word (l_impl, l_inst);
+  }
+  return 0;
+}
+
 static unsigned int
 search_list (AZObject *obj, const char *fname, unsigned int n_mm, unsigned int pm_3, unsigned int min_freq, unsigned int max_freq, int print_all_words)
 {
@@ -658,6 +727,7 @@ search_list (AZObject *obj, const char *fname, unsigned int n_mm, unsigned int p
   unsigned int code = 0;
 
   qd.dict_impl = (GT4WordDictImplementation *) az_object_get_interface (obj, GT4_TYPE_WORD_DICT, (void **) &qd.dict_inst);
+  qd.slist_impl = (GT4WordSListImplementation *) az_object_get_interface (obj, GT4_TYPE_WORD_SLIST, (void **) &qd.slist_inst);
   if (GT4_IS_INDEX_MAP (obj) && locations) {
     qd.index_impl = (GT4WordIndexImplementation *) az_object_get_interface (obj, GT4_TYPE_WORD_INDEX, (void **) &qd.index_inst);
   }
@@ -688,12 +758,16 @@ search_list (AZObject *obj, const char *fname, unsigned int n_mm, unsigned int p
     az_object_shutdown (s_obj);
     return GT_INCOMPATIBLE_WORDLENGTH_ERROR;
   }
-  
-  gt4_word_slist_get_first_word (s_impl, s_inst);
-  while (s_inst->idx < s_inst->num_words) {
-    uint64_t word = s_inst->word;
-    search_one_word (&qd, word);
-    if (!gt4_word_slist_get_next_word (s_impl, s_inst)) break;
+
+  if (!n_mm) {
+    search_list_zipper (s_impl, s_inst, &qd);
+  } else {
+    gt4_word_slist_get_first_word (s_impl, s_inst);
+    while (s_inst->idx < s_inst->num_words) {
+      uint64_t word = s_inst->word;
+      search_one_word (&qd, word);
+      if (!gt4_word_slist_get_next_word (s_impl, s_inst)) break;
+    }
   }
   az_object_shutdown (s_obj);
   return 0;
@@ -846,12 +920,15 @@ print_gc (AZObject *obj)
     unsigned long long word = inst->word;
     unsigned int j;
     for (j = 0; j < inst->word_length; j++) {
+      //unsigned int n = (unsigned int) (word & 3);
+      //n = (n ^ (n >> 1)) & 1;
+      //if ((n == 1) || (n == 2)) count += inst->count;
       count += inst->count * ((word ^ (word >> 1)) & 1);
       word = word >> 2;
     }
     gt4_word_slist_get_next_word (impl, inst);
   }
-  printf ("GC\t%g\n", (double) count / (inst->num_words * inst->word_length));
+  printf ("GC\t%g\n", (double) count / (inst->sum_counts * inst->word_length));
 }
 
 void print_help (int exit_value)


=====================================
src/gmer_counter.c
=====================================
@@ -39,6 +39,9 @@ struct _SNPTable {
   unsigned long long *words;
   unsigned int *alleles;
   /* Stats */
+  /* Number of N-s */
+  unsigned long long n_n;
+  /* Nucleotide length (excludes N) */
   unsigned long long n_nucl;
   unsigned long long n_gc;
   /* Index */
@@ -69,6 +72,9 @@ struct _SNPQueue {
   unsigned int n_full_tables;
   SNPTable **full_tables;
   /* Stats */
+  /* Sequence length (includes N) */
+  unsigned long long n_seq;
+  /* Nucleotide length (excludes N) */
   unsigned long long n_nucl;
   unsigned long long n_gc;
   unsigned long long n_kmers_total;
@@ -86,6 +92,7 @@ static void print_counts (SNPQueue *snpq, GT4GmerDB *db);
 static void process (GT4Queue *queue, unsigned int idx, void *arg);
 static int start_sequence (GT4FastaReader *reader, void *data);
 static int end_sequence (GT4FastaReader *reader, void *data);
+static int read_character (GT4FastaReader *reader, unsigned int ch, void *data);
 static int read_nucleotide (GT4FastaReader *reader, unsigned int nucleotide, void *data);
 static int read_word (GT4FastaReader *reader, unsigned long long word, void *data);
 static int compare_counts (const void *lhs, const void *rhs);
@@ -403,7 +410,8 @@ main (int argc, const char *argv[])
       }
 
       if (stats) {
-        fprintf (stdout, "#LENGTH\t%llu\n", snpq.n_nucl);
+        fprintf (stdout, "#LENGTH\t%llu\n", snpq.n_seq);
+        fprintf (stdout, "#LENGTH_ACGT\t%llu\n", snpq.n_nucl);
         fprintf (stdout, "#GC\t%.3f\n", (double) snpq.n_gc / snpq.n_nucl);
         fprintf (stdout, "#TOTAL_KMERS\t%llu\n", snpq.n_kmers_total);
         fprintf (stdout, "#LIST_KMERS\t%llu\n", snpq.n_kmers);
@@ -713,11 +721,12 @@ read_file (SNPQueue *snpq, TaskRead *tr)
   SNPTable *tbl = snpq->free_tables[--snpq->n_free_tables];
   gt4_queue_unlock (&snpq->lmq.queue);
   tbl->nwords = 0;
+  tbl->n_n = 0;
   tbl->n_nucl = 0;
   tbl->n_gc = 0;
   tr->data = tbl;
   /* if (debug > 0) fprintf (stderr, "Thread %d: reading file %s from %llu\n", idx, tt->_seqfile->path, tf->task_read.reader.cpos); */
-  result = fasta_reader_read_nwords (&tr->reader, BLOCK_SIZE, start_sequence, end_sequence, NULL, (stats) ? read_nucleotide : NULL, read_word, tr);
+  result = fasta_reader_read_nwords (&tr->reader, BLOCK_SIZE, start_sequence, end_sequence, (stats) ? read_character : NULL, (stats) ? read_nucleotide : NULL, read_word, tr);
   if (result) {
     fprintf (stderr, "read_file: Fasta reader %s returned %u\n", tr->reader.id, result);
     if (!recover) exit (1);
@@ -756,6 +765,8 @@ process_table (SNPQueue *snpq, TaskTable *tt, unsigned int thread_idx)
   gt4_queue_lock (&snpq->lmq.queue);
   /* fixme: Create separate task / mutex */
   if (stats) {
+    snpq->n_seq += tbl->n_nucl;
+    snpq->n_seq += tbl->n_n;
     snpq->n_nucl += tbl->n_nucl;
     snpq->n_gc += tbl->n_gc;
     snpq->n_kmers_total += tbl->nwords;
@@ -914,6 +925,16 @@ read_nucleotide (GT4FastaReader *reader, unsigned int nucl, void *data)
   return 0;
 }
 
+static int
+read_character (GT4FastaReader *reader, unsigned int ch, void *data)
+{
+  TaskRead *tt = (TaskRead *) data;
+  SNPTable *tbl = (SNPTable *) tt->data;
+
+  if ((ch == 'N') || (ch == 'n')) tbl->n_n += 1;
+  return 0;
+}
+
 static int
 compare_counts (const void *lhs, const void *rhs) {
   if (*((unsigned int *) lhs) < *((unsigned int *) rhs)) return -1;


=====================================
src/set-operations.c
=====================================
@@ -127,3 +127,103 @@ gt4_write_union (AZObject *arrays[], unsigned int n_arrays, unsigned int cutoff,
   
   return 0;
 }
+
+unsigned int
+gt4_union (AZObject *objs[], unsigned int n_objs, unsigned int (*callback) (uint64_t, uint32_t *, void *), void *data)
+{
+  GT4WordSListImplementation *impls[GT4_MAX_SETS];
+  GT4WordSListInstance *insts[GT4_MAX_SETS];
+  unsigned int n_sources;
+  unsigned long long total = 0;
+  unsigned int j;
+
+  arikkei_return_val_if_fail (n_objs > 0, 1);
+  arikkei_return_val_if_fail (n_objs <= GT4_MAX_SETS, 1);
+  
+  n_sources = 0;
+  for (j = 0; j < n_objs; j++) {
+    impls[n_sources] = (GT4WordSListImplementation *) az_object_get_interface (AZ_OBJECT(objs[j]), GT4_TYPE_WORD_SLIST, (void **) &insts[n_sources]);
+    if (insts[n_sources]->num_words) {
+      gt4_word_slist_get_first_word (impls[n_sources], insts[n_sources]);
+      total += insts[n_sources]->num_words;
+      n_sources += 1;
+    }
+  }
+
+  if (n_sources) {
+    unsigned long long word;
+    uint32_t counts[GT4_MAX_SETS];
+    unsigned int result;
+    /* Find first word */
+    word = 0xffffffffffffffffULL;
+    for (j = 0; j < n_objs; j++) if ((insts[j]->idx < insts[j]->num_words) && (insts[j]->word < word)) word = insts[j]->word;
+    /* Iterate until all lists are exhausted */
+    while (n_sources) {
+      unsigned long long next = 0xffffffffffffffffULL;
+      for (j = 0; j < n_objs; j++) {
+        counts[j] = 0;
+        if (insts[j]->idx < insts[j]->num_words) {
+          if (insts[j]->word == word) {
+            counts[j] = insts[j]->count;
+            if (!gt4_word_slist_get_next_word (impls[j], insts[j])) {
+              n_sources -= 1;
+            }
+          }
+          if (insts[j]->word < next) next = insts[j]->word;
+        }
+      }
+      /* Now we have all freqs */
+      result = callback (word, counts, data);
+      if (result) return result;
+      word = next;
+    }
+  }
+  
+  return 0;
+}
+
+unsigned int
+gt4_is_union (AZObject *objs[], unsigned int n_objs, unsigned int (*callback) (uint64_t, uint32_t *, void *), void *data)
+{
+  GT4WordSListImplementation *impls[GT4_MAX_SETS];
+  GT4WordSListInstance *insts[GT4_MAX_SETS];
+  unsigned int n_sources;
+  unsigned long long total = 0;
+  unsigned int j;
+
+  arikkei_return_val_if_fail (n_objs > 0, 1);
+  arikkei_return_val_if_fail (n_objs <= GT4_MAX_SETS, 1);
+  
+  n_sources = 0;
+  for (j = 0; j < n_objs; j++) {
+    impls[n_sources] = (GT4WordSListImplementation *) az_object_get_interface (AZ_OBJECT(objs[j]), GT4_TYPE_WORD_SLIST, (void **) &insts[n_sources]);
+    if (insts[n_sources]->num_words) {
+      gt4_word_slist_get_first_word (impls[n_sources], insts[n_sources]);
+      total += insts[n_sources]->num_words;
+      n_sources += 1;
+    }
+  }
+
+  while (insts[0]->idx < insts[0]->num_words) {
+    unsigned long long word;
+    uint32_t counts[GT4_MAX_SETS];
+    unsigned int result;
+    /* Find first word */
+    word = insts[0]->word;
+    counts[0] = insts[0]->count;
+    for (j = 1; j < n_objs; j++) {
+      counts[j] = 0;
+      while ((insts[j]->idx < insts[j]->num_words) && (insts[j]->word < word)) gt4_word_slist_get_next_word (impls[j], insts[j]);
+      if ((insts[j]->idx < insts[j]->num_words) && (insts[j]->word == word)) {
+        counts[j] = insts[j]->count;
+      }
+    }
+    /* Now we have all freqs */
+    result = callback (word, counts, data);
+    if (result) return result;
+    gt4_word_slist_get_next_word (impls[0], insts[0]);
+  }
+  
+  return 0;
+}
+


=====================================
src/set-operations.h
=====================================
@@ -33,4 +33,9 @@
 
 unsigned int gt4_write_union (AZObject *arrays[], unsigned int n_arrays, unsigned int cutoff, int ofile, GT4ListHeader *header);
 
+/* Executes callback for each unique kmer */
+/* If any callback returns non-zero, reading stops and that result is returned */
+unsigned int gt4_union (AZObject *objs[], unsigned int n_objs, unsigned int (*callback) (uint64_t, uint32_t *, void *), void *data);
+unsigned int gt4_is_union (AZObject *objs[], unsigned int n_objs, unsigned int (*callback) (uint64_t, uint32_t *, void *), void *data);
+
 #endif


=====================================
src/utils.c
=====================================
@@ -78,6 +78,7 @@ scout_map (void *arg)
     val += scout->cdata[i];
     if (!scout->running) break;
   }
+  pthread_exit (NULL);
   return (void *) val;
 }
 
@@ -93,8 +94,8 @@ void
 gt4_delete_scout (GT4Scout *scout)
 {
   if (!scout->running) return;
-  pthread_join (scout->thread, (void **) NULL);
   scout->running = 0;
+  pthread_join (scout->thread, (void **) NULL);
 }
 
 /* this implementation is based on:


=====================================
src/version.h
=====================================
@@ -26,7 +26,7 @@
 
 #define VERSION_MAJOR 4
 #define VERSION_MINOR 2
-#define VERSION_MICRO 7
+#define VERSION_MICRO 16
 #define VERSION_QUALIFIER "stable"
 
 #endif


=====================================
src/word-map.c
=====================================
@@ -197,7 +197,7 @@ gt4_word_map_new (const char *listfilename, unsigned int major_version, unsigned
   wmap->file_size = csize;
   if (hdr->version_minor == 0) {
     memcpy (&wmap->header, hdr, sizeof (struct _GT4ListHeader_4_0));
-    wmap->header.list_start = sizeof (GT4ListHeader);
+    wmap->header.list_start = sizeof (struct _GT4ListHeader_4_0);
     wmap->header.word_bytes = 8;
     wmap->header.count_bytes = 4;
   } else if (hdr->version_minor <= 2) {
@@ -309,21 +309,3 @@ gt4_word_map_lookup (GT4WordMap *wmap, unsigned long long query)
   return 0;
 }
 
-uint64_t
-gt4_word_map_get_word (const GT4WordMap *wmap, uint64_t idx)
-{
-  return *((uint64_t *) (wmap->wordlist + 12 * idx));
-}
-
-uint32_t
-gt4_word_map_get_count (const GT4WordMap *wmap, uint64_t idx)
-{
-  return *((uint32_t *) (wmap->wordlist + 12 * idx + 8));
-}
-
-uint64_t *
-gt4_word_map_get_word_ptr (const GT4WordMap *wmap, uint64_t idx)
-{
-  return (uint64_t *) (wmap->wordlist + 12 * idx);
-}
-


=====================================
src/word-map.h
=====================================
@@ -86,19 +86,19 @@ struct _GT4WordMapClass {
 
 unsigned int gt4_word_map_get_type (void);
 
-inline extern uint64_t
+static __inline__ uint64_t
 gt4_word_map_get_word (const GT4WordMap *wmap, uint64_t idx)
 {
   return *((uint64_t *) (wmap->wordlist + 12 * idx));
 }
 
-inline extern uint32_t
+static __inline__ uint32_t
 gt4_word_map_get_count (const GT4WordMap *wmap, uint64_t idx)
 {
   return *((uint32_t *) (wmap->wordlist + 12 * idx + 8));
 }
 
-inline extern uint64_t *
+static __inline__ uint64_t *
 gt4_word_map_get_word_ptr (const GT4WordMap *wmap, uint64_t idx)
 {
   return (uint64_t *) (wmap->wordlist + 12 * idx);



View it on GitLab: https://salsa.debian.org/med-team/genometester/-/commit/aa4cc7b11bf166cb484e0700dc06e6037ab02bda

-- 
View it on GitLab: https://salsa.debian.org/med-team/genometester/-/commit/aa4cc7b11bf166cb484e0700dc06e6037ab02bda
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20251211/833feb88/attachment-0001.htm>


More information about the debian-med-commit mailing list