[med-svn] [Git][med-team/genometester][master] 8 commits: Version=5

Andreas Tille (@tille) gitlab at salsa.debian.org
Thu Dec 11 20:32:55 GMT 2025



Andreas Tille pushed to branch master at Debian Med / genometester


Commits:
d69ace17 by Andreas Tille at 2025-12-11T14:48:22+01:00
Version=5

- - - - -
f71423a6 by Andreas Tille at 2025-12-11T14:49:37+01:00
New upstream version

- - - - -
aa4cc7b1 by Andreas Tille at 2025-12-11T14:49:38+01:00
New upstream version 4.0+git20221122.71e6625
- - - - -
ee7f38b2 by Andreas Tille at 2025-12-11T14:49:38+01:00
Update upstream source from tag 'upstream/4.0+git20221122.71e6625'

Update to upstream version '4.0+git20221122.71e6625'
with Debian dir cd315fe16a0fbff35585b472153baf4b607cb26b
- - - - -
78ed819e by Andreas Tille at 2025-12-11T14:49:39+01:00
Standards-Version: 4.7.2 (routine-update)

- - - - -
4c1ab384 by Andreas Tille at 2025-12-11T14:49:42+01:00
Remove duplicate line from changelog.

Changes-By: lintian-brush

- - - - -
40feacd6 by Andreas Tille at 2025-12-11T21:26:23+01:00
Refresh patches

- - - - -
4238f5cc by Andreas Tille at 2025-12-11T21:32:22+01:00
Fix C vs C++ confusion by rather calling CC for C code Closes: #1107684

- - - - -


22 changed files:

- debian/changelog
- debian/control
- debian/patches/add_debug_symbols.patch
- − debian/patches/avoid_redefinition.patch
- + debian/patches/cross.patch
- debian/patches/hardening.patch
- debian/patches/series
- debian/watch
- src/Makefile
- src/database.c
- src/fasta.c
- src/gassembler.c
- src/glistcompare.c
- src/glistmaker.c
- src/glistquery.c
- src/gmer_counter.c
- src/set-operations.c
- src/set-operations.h
- src/utils.c
- src/version.h
- src/word-map.c
- src/word-map.h


Changes:

=====================================
debian/changelog
=====================================
@@ -1,13 +1,21 @@
-genometester (4.0+git20211112.9030deb-1) UNRELEASED; urgency=medium
+genometester (4.0+git20221122.71e6625-1) UNRELEASED; urgency=medium
 
-  * Fix watchfile to detect new versions on github
+  [ Andreas Tille ]
   * New upstream version
+  * d/watch:
+     - detect new versions on github
+     - version=5
   * Standards-Version: 4.6.0 (routine-update)
   * debhelper-compat 13 (routine-update)
   * Add missing build dependency on dh addon.
   * Build-Depends: zlib1g-dev
+  * Standards-Version: 4.7.2 (routine-update)
 
- -- Andreas Tille <tille at debian.org>  Sun, 16 Jan 2022 17:13:40 +0100
+  [ Helmut Grohne ]
+  * Fix C vs C++ confusion by rather calling CC for C code
+    Closes: #1107684
+
+ -- Andreas Tille <tille at debian.org>  Thu, 11 Dec 2025 14:49:37 +0100
 
 genometester (4.0+git20200511.91cecb5+dfsg-1) unstable; urgency=medium
 


=====================================
debian/control
=====================================
@@ -7,7 +7,7 @@ Priority: optional
 Build-Depends: debhelper-compat (= 13),
                debhelper,
                zlib1g-dev
-Standards-Version: 4.6.0
+Standards-Version: 4.7.2
 Vcs-Browser: https://salsa.debian.org/med-team/genometester
 Vcs-Git: https://salsa.debian.org/med-team/genometester.git
 Homepage: https://github.com/bioinfo-ut/GenomeTester4


=====================================
debian/patches/add_debug_symbols.patch
=====================================
@@ -4,7 +4,7 @@ Description: Add debug symbols
 
 --- a/src/Makefile
 +++ b/src/Makefile
-@@ -164,7 +164,7 @@ AZ_SOURCES = \
+@@ -165,7 +165,7 @@ AZ_SOURCES = \
  	az/serialization.c az/serialization.h \
  	az/types.c az/types.h
  


=====================================
debian/patches/avoid_redefinition.patch deleted
=====================================
@@ -1,31 +0,0 @@
-Author: Andreas Tille <tille at debian.org>
-        Aaron M. Ucko <ucko at debian.org>
-Last-Update: Sun, 16 Jan 2022 21:32:08 -0500
-Origin: https://lists.debian.org/debian-med/2022/01/msg00030.html
-Description: Avoid "previous definition of ..."
-
---- a/src/word-map.h
-+++ b/src/word-map.h
-@@ -86,19 +86,19 @@ struct _GT4WordMapClass {
- 
- unsigned int gt4_word_map_get_type (void);
- 
--inline extern uint64_t
-+static inline uint64_t
- gt4_word_map_get_word (const GT4WordMap *wmap, uint64_t idx)
- {
-   return *((uint64_t *) (wmap->wordlist + 12 * idx));
- }
- 
--inline extern uint32_t
-+static inline uint32_t
- gt4_word_map_get_count (const GT4WordMap *wmap, uint64_t idx)
- {
-   return *((uint32_t *) (wmap->wordlist + 12 * idx + 8));
- }
- 
--inline extern uint64_t *
-+static inline uint64_t *
- gt4_word_map_get_word_ptr (const GT4WordMap *wmap, uint64_t idx)
- {
-   return (uint64_t *) (wmap->wordlist + 12 * idx);


=====================================
debian/patches/cross.patch
=====================================
@@ -0,0 +1,101 @@
+Author: Helmut Grohne <helmut at subdivi.de>
+Last-Update: 2025-12-11
+Bug-Debian: https://bugs.debian.org/1107684
+Description: Fix C vs C++ confusion by rather calling CC for C code
+
+--- a/src/Makefile
++++ b/src/Makefile
+@@ -1,7 +1,7 @@
+ # Project: GenomeTester v4.0
+ 
+ VERSION = 4.0
+-CXX  = gcc
++CC  = gcc
+ 
+ # C Files
+ 
+@@ -172,39 +172,39 @@ LIBS = -lm -lpthread -lrt -lz
+ INCS = -I.
+ BINS  = glistmaker glistquery glistcompare
+ 
+-#CXXFLAGS = $(INCS) $(DEBUGFLAGS) -Wall 
+-CXXFLAGS += $(INCS) $(RELEASEFLAGS) -Wall -std=c99 -D_DEFAULT_SOURCE -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L -D_XOPEN_SOURCE
++#CFLAGS = $(INCS) $(DEBUGFLAGS) -Wall 
++CFLAGS += $(INCS) $(RELEASEFLAGS) -Wall -std=c99 -D_DEFAULT_SOURCE -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L -D_XOPEN_SOURCE
+ 
+ .PHONY: all all-before all-after clean clean-custom
+ 
+ all: all-before $(BINS) all-after
+ 
+ glistmaker: $(LISTMAKER_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES)
+-	$(CXX) $(LISTMAKER_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES) -o glistmaker $(LIBS) $(CXXFLAGS) $(AZ_FLAGS)
++	$(CC) $(LISTMAKER_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES) -o glistmaker $(LIBS) $(CXXFLAGS) $(AZ_FLAGS)
+ 
+ glistquery: $(LISTQUERY_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES)
+-	$(CXX) $(LISTQUERY_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES) -o glistquery $(LIBS) $(CXXFLAGS) $(AZ_FLAGS)
++	$(CC) $(LISTQUERY_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES) -o glistquery $(LIBS) $(CXXFLAGS) $(AZ_FLAGS)
+ 
+ glistcompare: $(LISTCOMPARE_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES)
+-	$(CXX) $(LISTCOMPARE_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES) -o glistcompare $(LIBS) $(CXXFLAGS) $(AZ_FLAGS)
++	$(CC) $(LISTCOMPARE_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES) -o glistcompare $(LIBS) $(CXXFLAGS) $(AZ_FLAGS)
+ 
+ gindexer: $(GINDEXER_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES)
+-	$(CXX) $(GINDEXER_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES) -o gindexer $(LIBS) $(CXXFLAGS) $(AZ_FLAGS)
++	$(CC) $(GINDEXER_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES) -o gindexer $(LIBS) $(CXXFLAGS) $(AZ_FLAGS)
+ 
+ gmer_counter: $(GMER_COUNTER_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES)
+-	$(CXX) $(GMER_COUNTER_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES) -o gmer_counter $(LIBS) $(CXXFLAGS) $(AZ_FLAGS)
++	$(CC) $(GMER_COUNTER_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES) -o gmer_counter $(LIBS) $(CXXFLAGS) $(AZ_FLAGS)
+ 
+ gmer_caller: $(GMER_CALLER_SOURCES)
+-	$(CXX) $(GMER_CALLER_SOURCES) -o gmer_caller $(LIBS) $(CXXFLAGS) -Wall
++	$(CC) $(GMER_CALLER_SOURCES) -o gmer_caller $(LIBS) $(CXXFLAGS) -Wall
+ 
+ gassembler: $(GASSEMBLER_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES)
+-	$(CXX) $(GASSEMBLER_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES) -o gassembler $(LIBS) $(CXXFLAGS) $(AZ_FLAGS)
++	$(CC) $(GASSEMBLER_SOURCES) $(AZ_SOURCES) $(ARIKKEI_SOURCES) -o gassembler $(LIBS) $(CXXFLAGS) $(AZ_FLAGS)
+ 
+ aleq: $(ALEQ_SOURCES)
+-	$(CXX) $(ALEQ_SOURCES) -o aleq $(LIBS) $(CXXFLAGS) -Wall
++	$(CC) $(ALEQ_SOURCES) -o aleq $(LIBS) $(CXXFLAGS) -Wall
+ 
+ kmer_predictor: $(KMER_PREDICTOR_SOURCES)
+-	$(CXX) $(KMER_PREDICTOR_SOURCES)  $(AZ_SOURCES) $(ARIKKEI_SOURCES) -o kmer_predictor $(LIBS) $(AZ_FLAGS) $(CXXFLAGS)
++	$(CC) $(KMER_PREDICTOR_SOURCES)  $(AZ_SOURCES) $(ARIKKEI_SOURCES) -o kmer_predictor $(LIBS) $(AZ_FLAGS) $(CXXFLAGS)
+ 	
+ clean: clean-custom
+ 	rm -f *.o $(BINS)
+--- a/src/Makefile.gmer
++++ b/src/Makefile.gmer
+@@ -1,7 +1,7 @@
+ # Project: GMER Caller
+ 
+ VERSION = 1.0
+-CXX  = gcc
++CC  = gcc
+ 
+ # C Files
+ 
+@@ -37,18 +37,18 @@ LIBS = -lm -lpthread -lrt
+ INCS = -I.
+ BINS  = gmer_counter gmer_caller
+ 
+-#CXXFLAGS = $(INCS) $(DEBUGFLAGS) -Wall 
+-CXXFLAGS = $(INCS) $(RELEASEFLAGS) -Wall 
++#CFLAGS = $(INCS) $(DEBUGFLAGS) -Wall 
++CFLAGS = $(INCS) $(RELEASEFLAGS) -Wall 
+ 
+ .PHONY: all all-before all-after clean clean-custom
+ 
+ all: all-before $(BINS) all-after
+ 
+ gmer_counter: $(GMERCOUNTER_SOURCES)
+-	$(CXX) $(GMERCOUNTER_SOURCES) -o gmer_counter $(LIBS) $(CXXFLAGS) -Wall
++	$(CC) $(GMERCOUNTER_SOURCES) -o gmer_counter $(LIBS) $(CFLAGS) -Wall
+ 
+ gmer_caller: $(GMER_CALLER_SOURCES)
+-	$(CXX) $(GMER_CALLER_SOURCES) -o gmer_caller $(LIBS) $(CXXFLAGS) -Wall
++	$(CC) $(GMER_CALLER_SOURCES) -o gmer_caller $(LIBS) $(CFLAGS) -Wall
+ 
+ dist: $(GMERCOUNTER_SOURCES) $(GMER_CALLER_SOURCES)
+ 	mkdir fastgt_$(VERSION);


=====================================
debian/patches/hardening.patch
=====================================
@@ -4,12 +4,12 @@ Description: Propagate hardening options
 
 --- a/src/Makefile
 +++ b/src/Makefile
-@@ -172,7 +172,7 @@ INCS = -I.
+@@ -173,7 +173,7 @@ INCS = -I.
  BINS  = glistmaker glistquery glistcompare
  
  #CXXFLAGS = $(INCS) $(DEBUGFLAGS) -Wall 
--CXXFLAGS = $(INCS) $(RELEASEFLAGS) -Wall 
-+CXXFLAGS += $(INCS) $(RELEASEFLAGS) -Wall $(LDFLAGS)
+-CXXFLAGS = $(INCS) $(RELEASEFLAGS) -Wall -std=c99 -D_DEFAULT_SOURCE -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L -D_XOPEN_SOURCE
++CXXFLAGS += $(INCS) $(RELEASEFLAGS) -Wall -std=c99 -D_DEFAULT_SOURCE -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L -D_XOPEN_SOURCE
  
  .PHONY: all all-before all-after clean clean-custom
  


=====================================
debian/patches/series
=====================================
@@ -1,3 +1,3 @@
 add_debug_symbols.patch
 hardening.patch
-avoid_redefinition.patch
+cross.patch


=====================================
debian/watch
=====================================
@@ -1,6 +1,6 @@
-version=4
+Version: 5
 
-opts="mode=git,pretty=4.0+git%cd.%h,repacksuffix=+dfsg,dversionmangle=auto,uversionmangle=s/_/./g,repack,compression=xz" \
-   https://github.com/bioinfo-ut/GenomeTester4.git HEAD
-
-#  https://github.com/bioinfo-ut/GenomeTester4/releases .*/Version_(\d[_.\d]+)@ARCHIVE_EXT@
+Source: https://github.com/bioinfo-ut/GenomeTester4.git
+Matching-Pattern: HEAD
+Mode: git
+Git-Pretty: 4.0+git%cd.%h


=====================================
src/Makefile
=====================================
@@ -46,6 +46,7 @@ LISTQUERY_SOURCES = \
 	sequence-source.c sequence-source.h \
 	sequence-stream.c sequence-stream.h \
 	sequence-zstream.c sequence-zstream.h \
+	set-operations.c set-operations.h \
 	common.c common.h \
 	queue.c queue.h \
 	utils.c utils.h \
@@ -172,7 +173,7 @@ INCS = -I.
 BINS  = glistmaker glistquery glistcompare
 
 #CXXFLAGS = $(INCS) $(DEBUGFLAGS) -Wall 
-CXXFLAGS = $(INCS) $(RELEASEFLAGS) -Wall 
+CXXFLAGS = $(INCS) $(RELEASEFLAGS) -Wall -std=c99 -D_DEFAULT_SOURCE -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L -D_XOPEN_SOURCE
 
 .PHONY: all all-before all-after clean clean-custom
 


=====================================
src/database.c
=====================================
@@ -42,6 +42,12 @@ count_lines_from_text (const unsigned char *cdata, size_t csize, unsigned int *w
     unsigned int lengths[4];
     unsigned int ntokenz;
 
+    /* Skip comments */
+    if (cdata[cpos] == '#') {
+      while ((cpos < csize) && (cdata[cpos] != '\n')) cpos += 1;
+      if (cpos < csize) cpos += 1;
+      continue;
+    }
     ntokenz = split_line (cdata + cpos, csize - cpos, tokenz, lengths, 3);
     if (ntokenz < 2) {
       fprintf (stderr, "Line %u has <2 (%u) tokens\n", n_lines, ntokenz);
@@ -115,7 +121,7 @@ gt4_gmer_db_new_from_text (const unsigned char *cdata, unsigned long long csize,
   node_bits = get_bits (nlines + 1);
   kmer_bits = get_bits (max_kmers);
   if ((node_bits + kmer_bits) > 31) {
-    fprintf (stderr, "Too many nodes and kmers (%u (%u bits), %u (%u bits)\n", nlines + 1, max_kmers, node_bits, kmer_bits);
+    fprintf (stderr, "Too many nodes and kmers (%u (%u bits), %u (%u bits)\n", nlines + 1, node_bits, max_kmers, kmer_bits);
     return 0;
   }
   /* Set up DB */
@@ -162,6 +168,12 @@ gt4_gmer_db_new_from_text (const unsigned char *cdata, unsigned long long csize,
     unsigned int lengths[65536];
     unsigned int ntokenz, n_kmers;
     unsigned int i;
+    /* Skip comments */
+    if (cdata[cpos] == '#') {
+      while ((cpos < csize) && (cdata[cpos] != '\n')) cpos += 1;
+      if (cpos < csize) cpos += 1;
+      continue;
+    }
     /* Initialize */
     memset (db->nodes + idx, 0, sizeof (Node));
     /* Parse ID + number of kmers */
@@ -213,7 +225,7 @@ gt4_gmer_db_new_from_text (const unsigned char *cdata, unsigned long long csize,
           kmer2 = code2 & ((1 << db->kmer_bits) - 1);
           fprintf (stderr, "KMer already present (current node %u (%s) kmer %u/%u (%s) code %u) previous %u (%s) kmer %u/%u code %u\n",
             idx, db->names + db->nodes[idx].name, i, (dir != 0), word_to_string (word, db->wordsize), code,
-            idx2, db->names + db->nodes[idx2].name, kmer2, ((code2 & 0x7fffffff) != 0), code2);
+            idx2, db->names + db->nodes[idx2].name, kmer2, ((code2 & 0x80000000) != 0), code2);
           break;
         }
       }


=====================================
src/fasta.c
=====================================
@@ -252,13 +252,15 @@ fasta_reader_read_nwords (GT4FastaReader *reader, unsigned long long maxwords,
 	    nwords += 1;
 	  }
 	  /* We increase nucleotide position for N too */
-	  if (cval > ' ') {
-	    reader->seq_npos += 1;
-	  }
+	  //if (cval > ' ') {
+	  reader->seq_npos += 1;
+	  //}
 	} else if (cval >= ' ') {
 	  reader->wordfw = 0;
 	  reader->wordrv = 0;
 	  reader->currentlength = 0;
+	  /* We increase nucleotide position for N too */
+	  reader->seq_npos += 1;
 	}
       }
       reader->cpos += 1;


=====================================
src/gassembler.c
=====================================
@@ -698,6 +698,9 @@ print_usage (FILE *ofs, unsigned int advanced, int exit_value)
 static unsigned int only_chr = CHR_1;
 static unsigned int only_pos = 0;
 
+GT4Scout db_scout;
+GT4Scout seq_scout;
+
 int
 main (int argc, const char *argv[])
 {
@@ -1076,8 +1079,11 @@ main (int argc, const char *argv[])
     assemble_recursive (db, files, ref_chr, ref_start, ref_end, ref, kmers, nkmers);
   }
 
-  if (prefetch_db || prefetch_seq) {
-    delete_scouts ();
+  if (prefetch_db) {
+    gt4_delete_scout (&db_scout);
+  }
+  if (prefetch_seq) {
+    //gt4_delete_scout (&seq_scout);
   }
 
   return 0;
@@ -1098,6 +1104,9 @@ assemble_recursive (GT4GmerDB *db, SeqFile *files, unsigned int ref_chr, unsigne
   strncpy (dup, ref, len);
   adata->ref = dup;
   adata->cblock = (CallBlock *) malloc (sizeof (CallBlock));
+  memset (adata->cblock, 0, sizeof (CallBlock));
+  adata->cblock->calls = (Call *) malloc (MAX_REFERENCE_LENGTH * 2 * sizeof (Call));
+  memset (adata->cblock->calls, 0, MAX_REFERENCE_LENGTH * 2 * sizeof (Call));
   adata->cblock->chr = adata->chr;
   adata->cblock->start = adata->start;
   adata->cblock->end = adata->end;
@@ -2471,8 +2480,9 @@ load_db_or_die (const char *db_name, const char *seq_dir, const char *id)
     exit (1);
   }
   if (prefetch_db) {
-    scout_mmap (cdata, csize);
-    sleep (10);
+    db_scout.cdata = cdata;
+    db_scout.csize = csize;
+    gt4_scout_mmap (&db_scout);
   }
   db = gt4_gmer_db_new_from_binary (cdata, csize);
   if (!db) {
@@ -2533,7 +2543,7 @@ map_sequences (GT4GmerDB *db, const char *seq_dir)
         return NULL;
       }
       if (prefetch_seq) {
-        scout_mmap (files[i].cdata, files[i].csize);
+        //scout_mmap (files[i].cdata, files[i].csize);
       }
     }
   }


=====================================
src/glistcompare.c
=====================================
@@ -341,8 +341,13 @@ int main (int argc, const char *argv[])
     exit (1);
   }
   
-  if (!find_intrsec && (rule == RULE_SUBTRACT || rule == RULE_MIN || rule == RULE_FIRST || rule == RULE_SECOND)) {
-    fprintf (stderr, "Error: Rules min, subtract, fist and second can only be used with finding the intersection.\n");
+  if (!find_intrsec && (rule == RULE_MIN || rule == RULE_FIRST || rule == RULE_SECOND)) {
+    fprintf (stderr, "Error: Rules min, fist and second can only be used with finding the intersection.\n");
+    exit (1);
+  }
+
+  if ((!find_intrsec && !find_diff) && (rule == RULE_SUBTRACT)) {
+    fprintf (stderr, "Error: Rule subtract can only be used with intersection and difference.\n");
     exit (1);
   }
 


=====================================
src/glistmaker.c
=====================================
@@ -278,6 +278,15 @@ main (int argc, const char *argv[])
     exit (1);
   }
 
+  if (debug && create_index) {
+    for (i = 0; i < mq.n_sources; i++) {
+      fprintf (stderr, "%u: %s start %llu subseqs %u\n", i, i_files[mq.sources[i].file_idx].name, mq.sources[i].start, mq.sources[i].n_subseqs);
+      //for (j = 0; j < mq.sources[i].n_subseqs; j++) {
+      //  fprintf (stderr, "%llu %u\n", i_files[i].subseqs[j]->name_pos, i_files[i].subseqs[j]->name_len);
+      //}
+    }
+  }
+
   /* Do work */
   process (&mq.queue, 0, &mq);
   gt4_queue_lock (&mq.queue);
@@ -626,6 +635,27 @@ write_index (FILE *ofs, const char *loc_files[], unsigned int n_loc_files, GT4Li
   unsigned int version;
   unsigned long long n_words_loc, n_words, n_locations_loc, n_locs, file_block_loc, file_block_pos, kmer_list_loc, kmer_list_pos, locations_loc, locations_pos;
   unsigned char zero[16] = { 0 };
+
+  if (debug) {
+    for (i = 0; i < mq->n_sources; i++) {
+      unsigned int j;
+      GT4LMQSource *src = &mq->sources[i];
+      for (j = 0; j < src->n_subseqs; j++) {
+        GT4SubSequence *ss = &src->subseqs[j];
+        fprintf (stderr, "Src %u subseq %u np %llu nl %u ss %u sl %u\n", i, j, ss->name_pos, ss->name_len, ss->seq_pos, ss->seq_len);
+        unsigned long long np = src->start + ss->name_pos;
+        IFile *ifile = &i_files[src->file_idx];
+        FILE *f = fopen(ifile->name, "r");
+        fseek (f, np, SEEK_SET);
+        char b[256];
+        fread (b, ss->name_len, 1, f);
+        b[ss->name_len] = 0;
+        fclose (f);
+        fprintf (stderr, "%s\n", b);
+      }
+    }
+  }
+
   /* Determine file data */
   for (i = 0; i < n_i_files; i++) {
     GT4LMQSource *sources[1024];
@@ -652,10 +682,34 @@ write_index (FILE *ofs, const char *loc_files[], unsigned int n_loc_files, GT4Li
     for (j = 0; j < n_sources; j++) {
       unsigned int k;
       for (k = 0; k < sources[j]->n_subseqs; k++) {
-        i_files[i].subseqs[sources[j]->first_subseq + k] = &sources[j]->subseqs[k];
+        GT4SubSequence *ss = &sources[j]->subseqs[k];
+        i_files[i].subseqs[sources[j]->first_subseq + k] = ss;
+        // Adjust positions
+        ss->name_pos += sources[j]->start;
+        //fprintf (stderr, "%u %u start %llu np %llu\n", j, k, sources[j]->start, ss->name_pos);
+      }
+    }
+  }
+
+  if (debug) {
+    for (i = 0; i < n_i_files; i++) {
+      unsigned int j;
+      IFile *ifile = &i_files[i];
+      FILE *f = fopen(ifile->name, "r");
+      for (j = 0; j < ifile->n_subseqs; j++) {
+        GT4SubSequence *ss = ifile->subseqs[j];
+        fprintf (stderr, "Src %u subseq %u np %llu nl %u ss %u sl %u\n", i, j, ss->name_pos, ss->name_len, ss->seq_pos, ss->seq_len);
+        unsigned long long np = ss->name_pos;
+        fseek (f, np, SEEK_SET);
+        char b[256];
+        fread (b, ss->name_len, 1, f);
+        b[ss->name_len] = 0;
+        fprintf (stderr, "%s\n", b);
       }
+      fclose (f);
     }
   }
+
   /* Determine bitsizes */
   for (i = 0; i < mq->n_sources; i++) {
     unsigned int last_subseq = mq->sources[i].first_subseq + mq->sources[i].n_subseqs - 1;


=====================================
src/glistquery.c
=====================================
@@ -33,6 +33,7 @@
 #include "utils.h"
 #include "sequence.h"
 #include "sequence-stream.h"
+#include "set-operations.h"
 #include "fasta.h"
 #include "version.h"
 #include "index-map.h"
@@ -70,6 +71,39 @@ enum {
   FILES,
   SEQUENCES
 };
+
+typedef struct _DumpData DumpData;
+
+struct _DumpData {
+  unsigned int n_lists;
+  unsigned int wlen;
+};
+
+static unsigned int
+dump_callback (uint64_t word, uint32_t *counts, void *data)
+{
+  DumpData *dd = (DumpData *) data;
+  unsigned int i;
+  fprintf (stdout, "%s", word_to_string (word, dd->wlen));
+  for (i = 0; i < dd->n_lists; i++) {
+    fprintf (stdout, "\t%u", counts[i]);
+  }
+  fprintf (stdout, "\n");
+  return 0;
+}
+
+static void
+dump_lists (AZObject *objs[], unsigned int n_objs, unsigned int wlen, unsigned int is_union)
+{
+  DumpData dd;
+  dd.n_lists = n_objs;
+  dd.wlen = wlen;
+  if (is_union) {
+    gt4_is_union (objs, n_objs, dump_callback, &dd);
+  } else {
+    gt4_union (objs, n_objs, dump_callback, &dd);
+  }
+}
   
 int main (int argc, const char *argv[])
 {
@@ -80,11 +114,12 @@ int main (int argc, const char *argv[])
   unsigned int nmm = 0;
   unsigned int pm3 = 0;
   char *end;
-  int printall = 0;
+  int printall = 0, print_header = 0;
   unsigned int minfreq = 0, maxfreq = UINT_MAX;
   unsigned int distro = 0;
   unsigned int bloom = 0;
   unsigned int command = QUERY;
+  unsigned int is_union = 0;
 
   for (argidx = 1; argidx < argc; argidx++) {
     if (!strcmp (argv[argidx], "-v") || !strcmp (argv[argidx], "--version")) {
@@ -199,8 +234,12 @@ int main (int argc, const char *argv[])
       use_3p = 1;
     } else if (!strcmp(argv[argidx], "--5p")) {
       use_5p = 1;
+    } else if (!strcmp(argv[argidx], "--header")) {
+      print_header = 1;
     } else if (!strcmp(argv[argidx], "--bloom")) {
       bloom = 1;
+    } else if (!strcmp(argv[argidx], "--is_union")) {  
+      is_union = 1;
     } else if (!strcmp(argv[argidx], "--disable_scouts")) {  
       use_scouts = 0;
     } else if (argv[argidx][0] != '-') {
@@ -337,8 +376,19 @@ int main (int argc, const char *argv[])
 
   /* If no options is given print all lists/indices */
   if (!seqfilename && !querylistfilename && !queryfilename && !querystring) {
-    for (i = 0; i < n_lists; i++) {
-      print_full_map (maps[i], locations);
+    if (n_lists > 1) {
+      if (print_header) {
+        fprintf (stdout, "KMER");
+        for (i = 0; i < n_lists; i++) {
+          fprintf (stdout, "\t%s", lists[i]);
+        }
+        fprintf (stdout, "\n");
+      }
+      dump_lists (maps, n_lists, wlen, is_union);
+    } else {
+      for (i = 0; i < n_lists; i++) {
+        print_full_map (maps[i], locations);
+      }
     }
     exit (0);
   }
@@ -463,6 +513,8 @@ typedef struct _QueryData QueryData;
 struct _QueryData {
   GT4WordDictImplementation *dict_impl;
   GT4WordDictInstance *dict_inst;
+  GT4WordSListImplementation *slist_impl;
+  GT4WordSListInstance *slist_inst;
   GT4WordIndexImplementation *index_impl;
   GT4WordIndexInstance *index_inst;
   unsigned int n_mm;
@@ -647,6 +699,23 @@ search_fasta (AZObject *obj, const char *fname, unsigned int n_mm, unsigned int
   return result;
 }
 
+static unsigned int
+search_list_zipper (GT4WordSListImplementation *l_impl, GT4WordSListInstance *l_inst, QueryData *qd)
+{
+  gt4_word_slist_get_first_word (l_impl, l_inst);
+  gt4_word_slist_get_first_word (qd->slist_impl, qd->slist_inst);
+  while ((qd->slist_inst->idx < qd->slist_inst->num_words) && (l_inst->idx < l_inst->num_words)) {
+    while (qd->slist_inst->word < l_inst->word) {
+      gt4_word_slist_get_next_word (qd->slist_impl, qd->slist_inst);
+    }
+    if (qd->slist_inst->word == l_inst->word) {
+      cb_print (l_inst->word, l_inst->count, qd);
+    }
+    gt4_word_slist_get_next_word (l_impl, l_inst);
+  }
+  return 0;
+}
+
 static unsigned int
 search_list (AZObject *obj, const char *fname, unsigned int n_mm, unsigned int pm_3, unsigned int min_freq, unsigned int max_freq, int print_all_words)
 {
@@ -658,6 +727,7 @@ search_list (AZObject *obj, const char *fname, unsigned int n_mm, unsigned int p
   unsigned int code = 0;
 
   qd.dict_impl = (GT4WordDictImplementation *) az_object_get_interface (obj, GT4_TYPE_WORD_DICT, (void **) &qd.dict_inst);
+  qd.slist_impl = (GT4WordSListImplementation *) az_object_get_interface (obj, GT4_TYPE_WORD_SLIST, (void **) &qd.slist_inst);
   if (GT4_IS_INDEX_MAP (obj) && locations) {
     qd.index_impl = (GT4WordIndexImplementation *) az_object_get_interface (obj, GT4_TYPE_WORD_INDEX, (void **) &qd.index_inst);
   }
@@ -688,12 +758,16 @@ search_list (AZObject *obj, const char *fname, unsigned int n_mm, unsigned int p
     az_object_shutdown (s_obj);
     return GT_INCOMPATIBLE_WORDLENGTH_ERROR;
   }
-  
-  gt4_word_slist_get_first_word (s_impl, s_inst);
-  while (s_inst->idx < s_inst->num_words) {
-    uint64_t word = s_inst->word;
-    search_one_word (&qd, word);
-    if (!gt4_word_slist_get_next_word (s_impl, s_inst)) break;
+
+  if (!n_mm) {
+    search_list_zipper (s_impl, s_inst, &qd);
+  } else {
+    gt4_word_slist_get_first_word (s_impl, s_inst);
+    while (s_inst->idx < s_inst->num_words) {
+      uint64_t word = s_inst->word;
+      search_one_word (&qd, word);
+      if (!gt4_word_slist_get_next_word (s_impl, s_inst)) break;
+    }
   }
   az_object_shutdown (s_obj);
   return 0;
@@ -846,12 +920,15 @@ print_gc (AZObject *obj)
     unsigned long long word = inst->word;
     unsigned int j;
     for (j = 0; j < inst->word_length; j++) {
+      //unsigned int n = (unsigned int) (word & 3);
+      //n = (n ^ (n >> 1)) & 1;
+      //if ((n == 1) || (n == 2)) count += inst->count;
       count += inst->count * ((word ^ (word >> 1)) & 1);
       word = word >> 2;
     }
     gt4_word_slist_get_next_word (impl, inst);
   }
-  printf ("GC\t%g\n", (double) count / (inst->num_words * inst->word_length));
+  printf ("GC\t%g\n", (double) count / (inst->sum_counts * inst->word_length));
 }
 
 void print_help (int exit_value)


=====================================
src/gmer_counter.c
=====================================
@@ -39,6 +39,9 @@ struct _SNPTable {
   unsigned long long *words;
   unsigned int *alleles;
   /* Stats */
+  /* Number of N-s */
+  unsigned long long n_n;
+  /* Nucleotide length (excludes N) */
   unsigned long long n_nucl;
   unsigned long long n_gc;
   /* Index */
@@ -69,6 +72,9 @@ struct _SNPQueue {
   unsigned int n_full_tables;
   SNPTable **full_tables;
   /* Stats */
+  /* Sequence length (includes N) */
+  unsigned long long n_seq;
+  /* Nucleotide length (excludes N) */
   unsigned long long n_nucl;
   unsigned long long n_gc;
   unsigned long long n_kmers_total;
@@ -86,6 +92,7 @@ static void print_counts (SNPQueue *snpq, GT4GmerDB *db);
 static void process (GT4Queue *queue, unsigned int idx, void *arg);
 static int start_sequence (GT4FastaReader *reader, void *data);
 static int end_sequence (GT4FastaReader *reader, void *data);
+static int read_character (GT4FastaReader *reader, unsigned int ch, void *data);
 static int read_nucleotide (GT4FastaReader *reader, unsigned int nucleotide, void *data);
 static int read_word (GT4FastaReader *reader, unsigned long long word, void *data);
 static int compare_counts (const void *lhs, const void *rhs);
@@ -403,7 +410,8 @@ main (int argc, const char *argv[])
       }
 
       if (stats) {
-        fprintf (stdout, "#LENGTH\t%llu\n", snpq.n_nucl);
+        fprintf (stdout, "#LENGTH\t%llu\n", snpq.n_seq);
+        fprintf (stdout, "#LENGTH_ACGT\t%llu\n", snpq.n_nucl);
         fprintf (stdout, "#GC\t%.3f\n", (double) snpq.n_gc / snpq.n_nucl);
         fprintf (stdout, "#TOTAL_KMERS\t%llu\n", snpq.n_kmers_total);
         fprintf (stdout, "#LIST_KMERS\t%llu\n", snpq.n_kmers);
@@ -713,11 +721,12 @@ read_file (SNPQueue *snpq, TaskRead *tr)
   SNPTable *tbl = snpq->free_tables[--snpq->n_free_tables];
   gt4_queue_unlock (&snpq->lmq.queue);
   tbl->nwords = 0;
+  tbl->n_n = 0;
   tbl->n_nucl = 0;
   tbl->n_gc = 0;
   tr->data = tbl;
   /* if (debug > 0) fprintf (stderr, "Thread %d: reading file %s from %llu\n", idx, tt->_seqfile->path, tf->task_read.reader.cpos); */
-  result = fasta_reader_read_nwords (&tr->reader, BLOCK_SIZE, start_sequence, end_sequence, NULL, (stats) ? read_nucleotide : NULL, read_word, tr);
+  result = fasta_reader_read_nwords (&tr->reader, BLOCK_SIZE, start_sequence, end_sequence, (stats) ? read_character : NULL, (stats) ? read_nucleotide : NULL, read_word, tr);
   if (result) {
     fprintf (stderr, "read_file: Fasta reader %s returned %u\n", tr->reader.id, result);
     if (!recover) exit (1);
@@ -756,6 +765,8 @@ process_table (SNPQueue *snpq, TaskTable *tt, unsigned int thread_idx)
   gt4_queue_lock (&snpq->lmq.queue);
   /* fixme: Create separate task / mutex */
   if (stats) {
+    snpq->n_seq += tbl->n_nucl;
+    snpq->n_seq += tbl->n_n;
     snpq->n_nucl += tbl->n_nucl;
     snpq->n_gc += tbl->n_gc;
     snpq->n_kmers_total += tbl->nwords;
@@ -914,6 +925,16 @@ read_nucleotide (GT4FastaReader *reader, unsigned int nucl, void *data)
   return 0;
 }
 
+static int
+read_character (GT4FastaReader *reader, unsigned int ch, void *data)
+{
+  TaskRead *tt = (TaskRead *) data;
+  SNPTable *tbl = (SNPTable *) tt->data;
+
+  if ((ch == 'N') || (ch == 'n')) tbl->n_n += 1;
+  return 0;
+}
+
 static int
 compare_counts (const void *lhs, const void *rhs) {
   if (*((unsigned int *) lhs) < *((unsigned int *) rhs)) return -1;


=====================================
src/set-operations.c
=====================================
@@ -127,3 +127,103 @@ gt4_write_union (AZObject *arrays[], unsigned int n_arrays, unsigned int cutoff,
   
   return 0;
 }
+
+unsigned int
+gt4_union (AZObject *objs[], unsigned int n_objs, unsigned int (*callback) (uint64_t, uint32_t *, void *), void *data)
+{
+  GT4WordSListImplementation *impls[GT4_MAX_SETS];
+  GT4WordSListInstance *insts[GT4_MAX_SETS];
+  unsigned int n_sources;
+  unsigned long long total = 0;
+  unsigned int j;
+
+  arikkei_return_val_if_fail (n_objs > 0, 1);
+  arikkei_return_val_if_fail (n_objs <= GT4_MAX_SETS, 1);
+  
+  n_sources = 0;
+  for (j = 0; j < n_objs; j++) {
+    impls[n_sources] = (GT4WordSListImplementation *) az_object_get_interface (AZ_OBJECT(objs[j]), GT4_TYPE_WORD_SLIST, (void **) &insts[n_sources]);
+    if (insts[n_sources]->num_words) {
+      gt4_word_slist_get_first_word (impls[n_sources], insts[n_sources]);
+      total += insts[n_sources]->num_words;
+      n_sources += 1;
+    }
+  }
+
+  if (n_sources) {
+    unsigned long long word;
+    uint32_t counts[GT4_MAX_SETS];
+    unsigned int result;
+    /* Find first word */
+    word = 0xffffffffffffffffULL;
+    for (j = 0; j < n_objs; j++) if ((insts[j]->idx < insts[j]->num_words) && (insts[j]->word < word)) word = insts[j]->word;
+    /* Iterate until all lists are exhausted */
+    while (n_sources) {
+      unsigned long long next = 0xffffffffffffffffULL;
+      for (j = 0; j < n_objs; j++) {
+        counts[j] = 0;
+        if (insts[j]->idx < insts[j]->num_words) {
+          if (insts[j]->word == word) {
+            counts[j] = insts[j]->count;
+            if (!gt4_word_slist_get_next_word (impls[j], insts[j])) {
+              n_sources -= 1;
+            }
+          }
+          if (insts[j]->word < next) next = insts[j]->word;
+        }
+      }
+      /* Now we have all freqs */
+      result = callback (word, counts, data);
+      if (result) return result;
+      word = next;
+    }
+  }
+  
+  return 0;
+}
+
+unsigned int
+gt4_is_union (AZObject *objs[], unsigned int n_objs, unsigned int (*callback) (uint64_t, uint32_t *, void *), void *data)
+{
+  GT4WordSListImplementation *impls[GT4_MAX_SETS];
+  GT4WordSListInstance *insts[GT4_MAX_SETS];
+  unsigned int n_sources;
+  unsigned long long total = 0;
+  unsigned int j;
+
+  arikkei_return_val_if_fail (n_objs > 0, 1);
+  arikkei_return_val_if_fail (n_objs <= GT4_MAX_SETS, 1);
+  
+  n_sources = 0;
+  for (j = 0; j < n_objs; j++) {
+    impls[n_sources] = (GT4WordSListImplementation *) az_object_get_interface (AZ_OBJECT(objs[j]), GT4_TYPE_WORD_SLIST, (void **) &insts[n_sources]);
+    if (insts[n_sources]->num_words) {
+      gt4_word_slist_get_first_word (impls[n_sources], insts[n_sources]);
+      total += insts[n_sources]->num_words;
+      n_sources += 1;
+    }
+  }
+
+  while (insts[0]->idx < insts[0]->num_words) {
+    unsigned long long word;
+    uint32_t counts[GT4_MAX_SETS];
+    unsigned int result;
+    /* Find first word */
+    word = insts[0]->word;
+    counts[0] = insts[0]->count;
+    for (j = 1; j < n_objs; j++) {
+      counts[j] = 0;
+      while ((insts[j]->idx < insts[j]->num_words) && (insts[j]->word < word)) gt4_word_slist_get_next_word (impls[j], insts[j]);
+      if ((insts[j]->idx < insts[j]->num_words) && (insts[j]->word == word)) {
+        counts[j] = insts[j]->count;
+      }
+    }
+    /* Now we have all freqs */
+    result = callback (word, counts, data);
+    if (result) return result;
+    gt4_word_slist_get_next_word (impls[0], insts[0]);
+  }
+  
+  return 0;
+}
+


=====================================
src/set-operations.h
=====================================
@@ -33,4 +33,9 @@
 
 unsigned int gt4_write_union (AZObject *arrays[], unsigned int n_arrays, unsigned int cutoff, int ofile, GT4ListHeader *header);
 
+/* Executes callback for each unique kmer */
+/* If any callback returns not 0, reading stops and result is returned */
+unsigned int gt4_union (AZObject *objs[], unsigned int n_objs, unsigned int (*callback) (uint64_t, uint32_t *, void *), void *data);
+unsigned int gt4_is_union (AZObject *objs[], unsigned int n_objs, unsigned int (*callback) (uint64_t, uint32_t *, void *), void *data);
+
 #endif


=====================================
src/utils.c
=====================================
@@ -78,6 +78,7 @@ scout_map (void *arg)
     val += scout->cdata[i];
     if (!scout->running) break;
   }
+  pthread_exit (NULL);
   return (void *) val;
 }
 
@@ -93,8 +94,8 @@ void
 gt4_delete_scout (GT4Scout *scout)
 {
   if (!scout->running) return;
-  pthread_join (scout->thread, (void **) NULL);
   scout->running = 0;
+  pthread_join (scout->thread, (void **) NULL);
 }
 
 /* this implementation is based on:


=====================================
src/version.h
=====================================
@@ -26,7 +26,7 @@
 
 #define VERSION_MAJOR 4
 #define VERSION_MINOR 2
-#define VERSION_MICRO 7
+#define VERSION_MICRO 16
 #define VERSION_QUALIFIER "stable"
 
 #endif


=====================================
src/word-map.c
=====================================
@@ -197,7 +197,7 @@ gt4_word_map_new (const char *listfilename, unsigned int major_version, unsigned
   wmap->file_size = csize;
   if (hdr->version_minor == 0) {
     memcpy (&wmap->header, hdr, sizeof (struct _GT4ListHeader_4_0));
-    wmap->header.list_start = sizeof (GT4ListHeader);
+    wmap->header.list_start = sizeof (struct _GT4ListHeader_4_0);
     wmap->header.word_bytes = 8;
     wmap->header.count_bytes = 4;
   } else if (hdr->version_minor <= 2) {
@@ -309,21 +309,3 @@ gt4_word_map_lookup (GT4WordMap *wmap, unsigned long long query)
   return 0;
 }
 
-uint64_t
-gt4_word_map_get_word (const GT4WordMap *wmap, uint64_t idx)
-{
-  return *((uint64_t *) (wmap->wordlist + 12 * idx));
-}
-
-uint32_t
-gt4_word_map_get_count (const GT4WordMap *wmap, uint64_t idx)
-{
-  return *((uint32_t *) (wmap->wordlist + 12 * idx + 8));
-}
-
-uint64_t *
-gt4_word_map_get_word_ptr (const GT4WordMap *wmap, uint64_t idx)
-{
-  return (uint64_t *) (wmap->wordlist + 12 * idx);
-}
-


=====================================
src/word-map.h
=====================================
@@ -86,19 +86,19 @@ struct _GT4WordMapClass {
 
 unsigned int gt4_word_map_get_type (void);
 
-inline extern uint64_t
+static __inline__ uint64_t
 gt4_word_map_get_word (const GT4WordMap *wmap, uint64_t idx)
 {
   return *((uint64_t *) (wmap->wordlist + 12 * idx));
 }
 
-inline extern uint32_t
+static __inline__ uint32_t
 gt4_word_map_get_count (const GT4WordMap *wmap, uint64_t idx)
 {
   return *((uint32_t *) (wmap->wordlist + 12 * idx + 8));
 }
 
-inline extern uint64_t *
+static __inline__ uint64_t *
 gt4_word_map_get_word_ptr (const GT4WordMap *wmap, uint64_t idx)
 {
   return (uint64_t *) (wmap->wordlist + 12 * idx);



View it on GitLab: https://salsa.debian.org/med-team/genometester/-/compare/fefb8db39b02d173b496bb830a1c0f6bc4fe0f1a...4238f5cc1ffbea489fd0022f9b85014dba76066b

-- 
View it on GitLab: https://salsa.debian.org/med-team/genometester/-/compare/fefb8db39b02d173b496bb830a1c0f6bc4fe0f1a...4238f5cc1ffbea489fd0022f9b85014dba76066b
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20251211/baddaa44/attachment-0001.htm>


More information about the debian-med-commit mailing list