[med-svn] [Git][med-team/genometester][upstream] New upstream version 4.0+git20221122.71e6625
Andreas Tille (@tille)
gitlab at salsa.debian.org
Thu Dec 11 20:33:04 GMT 2025
Andreas Tille pushed to branch upstream at Debian Med / genometester
Commits:
aa4cc7b1 by Andreas Tille at 2025-12-11T14:49:38+01:00
New upstream version 4.0+git20221122.71e6625
- - - - -
14 changed files:
- src/Makefile
- src/database.c
- src/fasta.c
- src/gassembler.c
- src/glistcompare.c
- src/glistmaker.c
- src/glistquery.c
- src/gmer_counter.c
- src/set-operations.c
- src/set-operations.h
- src/utils.c
- src/version.h
- src/word-map.c
- src/word-map.h
Changes:
=====================================
src/Makefile
=====================================
@@ -46,6 +46,7 @@ LISTQUERY_SOURCES = \
sequence-source.c sequence-source.h \
sequence-stream.c sequence-stream.h \
sequence-zstream.c sequence-zstream.h \
+ set-operations.c set-operations.h \
common.c common.h \
queue.c queue.h \
utils.c utils.h \
@@ -172,7 +173,7 @@ INCS = -I.
BINS = glistmaker glistquery glistcompare
#CXXFLAGS = $(INCS) $(DEBUGFLAGS) -Wall
-CXXFLAGS = $(INCS) $(RELEASEFLAGS) -Wall
+CXXFLAGS = $(INCS) $(RELEASEFLAGS) -Wall -std=c99 -D_DEFAULT_SOURCE -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L -D_XOPEN_SOURCE
.PHONY: all all-before all-after clean clean-custom
=====================================
src/database.c
=====================================
@@ -42,6 +42,12 @@ count_lines_from_text (const unsigned char *cdata, size_t csize, unsigned int *w
unsigned int lengths[4];
unsigned int ntokenz;
+ /* Skip comments */
+ if (cdata[cpos] == '#') {
+ while ((cpos < csize) && (cdata[cpos] != '\n')) cpos += 1;
+ if (cpos < csize) cpos += 1;
+ continue;
+ }
ntokenz = split_line (cdata + cpos, csize - cpos, tokenz, lengths, 3);
if (ntokenz < 2) {
fprintf (stderr, "Line %u has <2 (%u) tokens\n", n_lines, ntokenz);
@@ -115,7 +121,7 @@ gt4_gmer_db_new_from_text (const unsigned char *cdata, unsigned long long csize,
node_bits = get_bits (nlines + 1);
kmer_bits = get_bits (max_kmers);
if ((node_bits + kmer_bits) > 31) {
- fprintf (stderr, "Too many nodes and kmers (%u (%u bits), %u (%u bits)\n", nlines + 1, max_kmers, node_bits, kmer_bits);
+ fprintf (stderr, "Too many nodes and kmers (%u (%u bits), %u (%u bits)\n", nlines + 1, node_bits, max_kmers, kmer_bits);
return 0;
}
/* Set up DB */
@@ -162,6 +168,12 @@ gt4_gmer_db_new_from_text (const unsigned char *cdata, unsigned long long csize,
unsigned int lengths[65536];
unsigned int ntokenz, n_kmers;
unsigned int i;
+ /* Skip comments */
+ if (cdata[cpos] == '#') {
+ while ((cpos < csize) && (cdata[cpos] != '\n')) cpos += 1;
+ if (cpos < csize) cpos += 1;
+ continue;
+ }
/* Initialize */
memset (db->nodes + idx, 0, sizeof (Node));
/* Parse ID + number of kmers */
@@ -213,7 +225,7 @@ gt4_gmer_db_new_from_text (const unsigned char *cdata, unsigned long long csize,
kmer2 = code2 & ((1 << db->kmer_bits) - 1);
fprintf (stderr, "KMer already present (current node %u (%s) kmer %u/%u (%s) code %u) previous %u (%s) kmer %u/%u code %u\n",
idx, db->names + db->nodes[idx].name, i, (dir != 0), word_to_string (word, db->wordsize), code,
- idx2, db->names + db->nodes[idx2].name, kmer2, ((code2 & 0x7fffffff) != 0), code2);
+ idx2, db->names + db->nodes[idx2].name, kmer2, ((code2 & 0x80000000) != 0), code2);
break;
}
}
=====================================
src/fasta.c
=====================================
@@ -252,13 +252,15 @@ fasta_reader_read_nwords (GT4FastaReader *reader, unsigned long long maxwords,
nwords += 1;
}
/* We increase nucleotide position for N too */
- if (cval > ' ') {
- reader->seq_npos += 1;
- }
+ //if (cval > ' ') {
+ reader->seq_npos += 1;
+ //}
} else if (cval >= ' ') {
reader->wordfw = 0;
reader->wordrv = 0;
reader->currentlength = 0;
+ /* We increase nucleotide position for N too */
+ reader->seq_npos += 1;
}
}
reader->cpos += 1;
=====================================
src/gassembler.c
=====================================
@@ -698,6 +698,9 @@ print_usage (FILE *ofs, unsigned int advanced, int exit_value)
static unsigned int only_chr = CHR_1;
static unsigned int only_pos = 0;
+/* Scout state for the database mapping: load_db_or_die fills in cdata/csize and passes it to gt4_scout_mmap when prefetch_db is set; main hands it to gt4_delete_scout */
+GT4Scout db_scout;
+/* Scout state intended for sequence prefetching; in this change it is only referenced from commented-out code */
+GT4Scout seq_scout;
+
int
main (int argc, const char *argv[])
{
@@ -1076,8 +1079,11 @@ main (int argc, const char *argv[])
assemble_recursive (db, files, ref_chr, ref_start, ref_end, ref, kmers, nkmers);
}
- if (prefetch_db || prefetch_seq) {
- delete_scouts ();
+ if (prefetch_db) {
+ gt4_delete_scout (&db_scout);
+ }
+ if (prefetch_seq) {
+ //gt4_delete_scout (&seq_scout);
}
return 0;
@@ -1098,6 +1104,9 @@ assemble_recursive (GT4GmerDB *db, SeqFile *files, unsigned int ref_chr, unsigne
strncpy (dup, ref, len);
adata->ref = dup;
adata->cblock = (CallBlock *) malloc (sizeof (CallBlock));
+ memset (adata->cblock, 0, sizeof (CallBlock));
+ adata->cblock->calls = (Call *) malloc (MAX_REFERENCE_LENGTH * 2 * sizeof (Call));
+ memset (adata->cblock->calls, 0, MAX_REFERENCE_LENGTH * 2 * sizeof (Call));
adata->cblock->chr = adata->chr;
adata->cblock->start = adata->start;
adata->cblock->end = adata->end;
@@ -2471,8 +2480,9 @@ load_db_or_die (const char *db_name, const char *seq_dir, const char *id)
exit (1);
}
if (prefetch_db) {
- scout_mmap (cdata, csize);
- sleep (10);
+ db_scout.cdata = cdata;
+ db_scout.csize = csize;
+ gt4_scout_mmap (&db_scout);
}
db = gt4_gmer_db_new_from_binary (cdata, csize);
if (!db) {
@@ -2533,7 +2543,7 @@ map_sequences (GT4GmerDB *db, const char *seq_dir)
return NULL;
}
if (prefetch_seq) {
- scout_mmap (files[i].cdata, files[i].csize);
+ //scout_mmap (files[i].cdata, files[i].csize);
}
}
}
=====================================
src/glistcompare.c
=====================================
@@ -341,8 +341,13 @@ int main (int argc, const char *argv[])
exit (1);
}
- if (!find_intrsec && (rule == RULE_SUBTRACT || rule == RULE_MIN || rule == RULE_FIRST || rule == RULE_SECOND)) {
- fprintf (stderr, "Error: Rules min, subtract, fist and second can only be used with finding the intersection.\n");
+ if (!find_intrsec && (rule == RULE_MIN || rule == RULE_FIRST || rule == RULE_SECOND)) {
+ fprintf (stderr, "Error: Rules min, fist and second can only be used with finding the intersection.\n");
+ exit (1);
+ }
+
+ if ((!find_intrsec && !find_diff) && (rule == RULE_SUBTRACT)) {
+ fprintf (stderr, "Error: Rule subtract can only be used with intersection and difference.\n");
exit (1);
}
=====================================
src/glistmaker.c
=====================================
@@ -278,6 +278,15 @@ main (int argc, const char *argv[])
exit (1);
}
+ if (debug && create_index) {
+ for (i = 0; i < mq.n_sources; i++) {
+ fprintf (stderr, "%u: %s start %llu subseqs %u\n", i, i_files[mq.sources[i].file_idx].name, mq.sources[i].start, mq.sources[i].n_subseqs);
+ //for (j = 0; j < mq.sources[i].n_subseqs; j++) {
+ // fprintf (stderr, "%llu %u\n", i_files[i].subseqs[j]->name_pos, i_files[i].subseqs[j]->name_len);
+ //}
+ }
+ }
+
/* Do work */
process (&mq.queue, 0, &mq);
gt4_queue_lock (&mq.queue);
@@ -626,6 +635,27 @@ write_index (FILE *ofs, const char *loc_files[], unsigned int n_loc_files, GT4Li
unsigned int version;
unsigned long long n_words_loc, n_words, n_locations_loc, n_locs, file_block_loc, file_block_pos, kmer_list_loc, kmer_list_pos, locations_loc, locations_pos;
unsigned char zero[16] = { 0 };
+
+ if (debug) {
+ for (i = 0; i < mq->n_sources; i++) {
+ unsigned int j;
+ GT4LMQSource *src = &mq->sources[i];
+ for (j = 0; j < src->n_subseqs; j++) {
+ GT4SubSequence *ss = &src->subseqs[j];
+ fprintf (stderr, "Src %u subseq %u np %llu nl %u ss %u sl %u\n", i, j, ss->name_pos, ss->name_len, ss->seq_pos, ss->seq_len);
+ unsigned long long np = src->start + ss->name_pos;
+ IFile *ifile = &i_files[src->file_idx];
+ FILE *f = fopen(ifile->name, "r");
+ fseek (f, np, SEEK_SET);
+ char b[256];
+ fread (b, ss->name_len, 1, f);
+ b[ss->name_len] = 0;
+ fclose (f);
+ fprintf (stderr, "%s\n", b);
+ }
+ }
+ }
+
/* Determine file data */
for (i = 0; i < n_i_files; i++) {
GT4LMQSource *sources[1024];
@@ -652,10 +682,34 @@ write_index (FILE *ofs, const char *loc_files[], unsigned int n_loc_files, GT4Li
for (j = 0; j < n_sources; j++) {
unsigned int k;
for (k = 0; k < sources[j]->n_subseqs; k++) {
- i_files[i].subseqs[sources[j]->first_subseq + k] = &sources[j]->subseqs[k];
+ GT4SubSequence *ss = &sources[j]->subseqs[k];
+ i_files[i].subseqs[sources[j]->first_subseq + k] = ss;
+ // Adjust positions
+ ss->name_pos += sources[j]->start;
+ //fprintf (stderr, "%u %u start %llu np %llu\n", j, k, sources[j]->start, ss->name_pos);
+ }
+ }
+ }
+
+ if (debug) {
+ for (i = 0; i < n_i_files; i++) {
+ unsigned int j;
+ IFile *ifile = &i_files[i];
+ FILE *f = fopen(ifile->name, "r");
+ for (j = 0; j < ifile->n_subseqs; j++) {
+ GT4SubSequence *ss = ifile->subseqs[j];
+ fprintf (stderr, "Src %u subseq %u np %llu nl %u ss %u sl %u\n", i, j, ss->name_pos, ss->name_len, ss->seq_pos, ss->seq_len);
+ unsigned long long np = ss->name_pos;
+ fseek (f, np, SEEK_SET);
+ char b[256];
+ fread (b, ss->name_len, 1, f);
+ b[ss->name_len] = 0;
+ fprintf (stderr, "%s\n", b);
}
+ fclose (f);
}
}
+
/* Determine bitsizes */
for (i = 0; i < mq->n_sources; i++) {
unsigned int last_subseq = mq->sources[i].first_subseq + mq->sources[i].n_subseqs - 1;
=====================================
src/glistquery.c
=====================================
@@ -33,6 +33,7 @@
#include "utils.h"
#include "sequence.h"
#include "sequence-stream.h"
+#include "set-operations.h"
#include "fasta.h"
#include "version.h"
#include "index-map.h"
@@ -70,6 +71,39 @@ enum {
FILES,
SEQUENCES
};
+
+/* Context handed to dump_callback through its opaque data pointer */
+typedef struct _DumpData DumpData;
+
+struct _DumpData {
+ /* Number of entries dump_callback reads from its counts array */
+ unsigned int n_lists;
+ /* Word length passed to word_to_string when a word is printed */
+ unsigned int wlen;
+};
+
+/*
+ * Print one row to stdout: the word rendered by word_to_string, then one
+ * tab-prefixed count per list, then a newline.
+ *
+ * word   - word to print
+ * counts - array holding at least n_lists entries
+ * data   - must point to a DumpData
+ *
+ * Always returns 0.
+ */
+static unsigned int
+dump_callback (uint64_t word, uint32_t *counts, void *data)
+{
+	const DumpData *ctx = (const DumpData *) data;
+	unsigned int col = 0;
+
+	fprintf (stdout, "%s", word_to_string (word, ctx->wlen));
+	while (col < ctx->n_lists) {
+		fprintf (stdout, "\t%u", counts[col]);
+		col += 1;
+	}
+	fprintf (stdout, "\n");
+	return 0;
+}
+
+/*
+ * Dump the per-word counts of n_objs lists to stdout through dump_callback.
+ *
+ * objs     - the lists to walk
+ * n_objs   - number of lists; also the number of count columns printed
+ * wlen     - word length used when rendering words
+ * is_union - non-zero walks the lists with gt4_is_union, zero with gt4_union
+ */
+static void
+dump_lists (AZObject *objs[], unsigned int n_objs, unsigned int wlen, unsigned int is_union)
+{
+	DumpData ctx = { .n_lists = n_objs, .wlen = wlen };
+
+	if (!is_union) {
+		gt4_union (objs, n_objs, dump_callback, &ctx);
+		return;
+	}
+	gt4_is_union (objs, n_objs, dump_callback, &ctx);
+}
int main (int argc, const char *argv[])
{
@@ -80,11 +114,12 @@ int main (int argc, const char *argv[])
unsigned int nmm = 0;
unsigned int pm3 = 0;
char *end;
- int printall = 0;
+ int printall = 0, print_header = 0;
unsigned int minfreq = 0, maxfreq = UINT_MAX;
unsigned int distro = 0;
unsigned int bloom = 0;
unsigned int command = QUERY;
+ unsigned int is_union = 0;
for (argidx = 1; argidx < argc; argidx++) {
if (!strcmp (argv[argidx], "-v") || !strcmp (argv[argidx], "--version")) {
@@ -199,8 +234,12 @@ int main (int argc, const char *argv[])
use_3p = 1;
} else if (!strcmp(argv[argidx], "--5p")) {
use_5p = 1;
+ } else if (!strcmp(argv[argidx], "--header")) {
+ print_header = 1;
} else if (!strcmp(argv[argidx], "--bloom")) {
bloom = 1;
+ } else if (!strcmp(argv[argidx], "--is_union")) {
+ is_union = 1;
} else if (!strcmp(argv[argidx], "--disable_scouts")) {
use_scouts = 0;
} else if (argv[argidx][0] != '-') {
@@ -337,8 +376,19 @@ int main (int argc, const char *argv[])
/* If no options is given print all lists/indices */
if (!seqfilename && !querylistfilename && !queryfilename && !querystring) {
- for (i = 0; i < n_lists; i++) {
- print_full_map (maps[i], locations);
+ if (n_lists > 1) {
+ if (print_header) {
+ fprintf (stdout, "KMER");
+ for (i = 0; i < n_lists; i++) {
+ fprintf (stdout, "\t%s", lists[i]);
+ }
+ fprintf (stdout, "\n");
+ }
+ dump_lists (maps, n_lists, wlen, is_union);
+ } else {
+ for (i = 0; i < n_lists; i++) {
+ print_full_map (maps[i], locations);
+ }
}
exit (0);
}
@@ -463,6 +513,8 @@ typedef struct _QueryData QueryData;
struct _QueryData {
GT4WordDictImplementation *dict_impl;
GT4WordDictInstance *dict_inst;
+ GT4WordSListImplementation *slist_impl;
+ GT4WordSListInstance *slist_inst;
GT4WordIndexImplementation *index_impl;
GT4WordIndexInstance *index_inst;
unsigned int n_mm;
@@ -647,6 +699,23 @@ search_fasta (AZObject *obj, const char *fname, unsigned int n_mm, unsigned int
return result;
}
+/*
+ * Walk a query list and the database list held in qd side by side and call
+ * cb_print for every word that occurs in both.
+ *
+ * l_impl, l_inst - sorted-list interface of the query list
+ * qd             - query data; its slist_impl/slist_inst give the database list
+ *
+ * Always returns 0.
+ *
+ * NOTE(review): the walk only works if both lists yield words in ascending
+ * order - confirm against the SList implementations.
+ * NOTE(review): the count passed to cb_print is the query list's count
+ * (l_inst->count), not the database list's - confirm this is intended.
+ */
+static unsigned int
+search_list_zipper (GT4WordSListImplementation *l_impl, GT4WordSListInstance *l_inst, QueryData *qd)
+{
+	GT4WordSListImplementation *db_impl = qd->slist_impl;
+	GT4WordSListInstance *db_inst = qd->slist_inst;
+
+	gt4_word_slist_get_first_word (l_impl, l_inst);
+	gt4_word_slist_get_first_word (db_impl, db_inst);
+	while ((db_inst->idx < db_inst->num_words) && (l_inst->idx < l_inst->num_words)) {
+		if (db_inst->word < l_inst->word) {
+			/* Catch the database list up one word at a time; the outer condition
+			 * re-checks its bounds so an exhausted list ends the walk instead of spinning */
+			if (!gt4_word_slist_get_next_word (db_impl, db_inst)) break;
+			continue;
+		}
+		if (db_inst->word == l_inst->word) {
+			cb_print (l_inst->word, l_inst->count, qd);
+		}
+		if (!gt4_word_slist_get_next_word (l_impl, l_inst)) break;
+	}
+	return 0;
+}
+
static unsigned int
search_list (AZObject *obj, const char *fname, unsigned int n_mm, unsigned int pm_3, unsigned int min_freq, unsigned int max_freq, int print_all_words)
{
@@ -658,6 +727,7 @@ search_list (AZObject *obj, const char *fname, unsigned int n_mm, unsigned int p
unsigned int code = 0;
qd.dict_impl = (GT4WordDictImplementation *) az_object_get_interface (obj, GT4_TYPE_WORD_DICT, (void **) &qd.dict_inst);
+ qd.slist_impl = (GT4WordSListImplementation *) az_object_get_interface (obj, GT4_TYPE_WORD_SLIST, (void **) &qd.slist_inst);
if (GT4_IS_INDEX_MAP (obj) && locations) {
qd.index_impl = (GT4WordIndexImplementation *) az_object_get_interface (obj, GT4_TYPE_WORD_INDEX, (void **) &qd.index_inst);
}
@@ -688,12 +758,16 @@ search_list (AZObject *obj, const char *fname, unsigned int n_mm, unsigned int p
az_object_shutdown (s_obj);
return GT_INCOMPATIBLE_WORDLENGTH_ERROR;
}
-
- gt4_word_slist_get_first_word (s_impl, s_inst);
- while (s_inst->idx < s_inst->num_words) {
- uint64_t word = s_inst->word;
- search_one_word (&qd, word);
- if (!gt4_word_slist_get_next_word (s_impl, s_inst)) break;
+
+ if (!n_mm) {
+ search_list_zipper (s_impl, s_inst, &qd);
+ } else {
+ gt4_word_slist_get_first_word (s_impl, s_inst);
+ while (s_inst->idx < s_inst->num_words) {
+ uint64_t word = s_inst->word;
+ search_one_word (&qd, word);
+ if (!gt4_word_slist_get_next_word (s_impl, s_inst)) break;
+ }
}
az_object_shutdown (s_obj);
return 0;
@@ -846,12 +920,15 @@ print_gc (AZObject *obj)
unsigned long long word = inst->word;
unsigned int j;
for (j = 0; j < inst->word_length; j++) {
+ //unsigned int n = (unsigned int) (word & 3);
+ //n = (n ^ (n >> 1)) & 1;
+ //if ((n == 1) || (n == 2)) count += inst->count;
count += inst->count * ((word ^ (word >> 1)) & 1);
word = word >> 2;
}
gt4_word_slist_get_next_word (impl, inst);
}
- printf ("GC\t%g\n", (double) count / (inst->num_words * inst->word_length));
+ printf ("GC\t%g\n", (double) count / (inst->sum_counts * inst->word_length));
}
void print_help (int exit_value)
=====================================
src/gmer_counter.c
=====================================
@@ -39,6 +39,9 @@ struct _SNPTable {
unsigned long long *words;
unsigned int *alleles;
/* Stats */
+ /* Number of N-s */
+ unsigned long long n_n;
+ /* Nucleotide length (excludes N) */
unsigned long long n_nucl;
unsigned long long n_gc;
/* Index */
@@ -69,6 +72,9 @@ struct _SNPQueue {
unsigned int n_full_tables;
SNPTable **full_tables;
/* Stats */
+ /* Sequence length (includes N) */
+ unsigned long long n_seq;
+ /* Nucleotide length (excludes N) */
unsigned long long n_nucl;
unsigned long long n_gc;
unsigned long long n_kmers_total;
@@ -86,6 +92,7 @@ static void print_counts (SNPQueue *snpq, GT4GmerDB *db);
static void process (GT4Queue *queue, unsigned int idx, void *arg);
static int start_sequence (GT4FastaReader *reader, void *data);
static int end_sequence (GT4FastaReader *reader, void *data);
+static int read_character (GT4FastaReader *reader, unsigned int ch, void *data);
static int read_nucleotide (GT4FastaReader *reader, unsigned int nucleotide, void *data);
static int read_word (GT4FastaReader *reader, unsigned long long word, void *data);
static int compare_counts (const void *lhs, const void *rhs);
@@ -403,7 +410,8 @@ main (int argc, const char *argv[])
}
if (stats) {
- fprintf (stdout, "#LENGTH\t%llu\n", snpq.n_nucl);
+ fprintf (stdout, "#LENGTH\t%llu\n", snpq.n_seq);
+ fprintf (stdout, "#LENGTH_ACGT\t%llu\n", snpq.n_nucl);
fprintf (stdout, "#GC\t%.3f\n", (double) snpq.n_gc / snpq.n_nucl);
fprintf (stdout, "#TOTAL_KMERS\t%llu\n", snpq.n_kmers_total);
fprintf (stdout, "#LIST_KMERS\t%llu\n", snpq.n_kmers);
@@ -713,11 +721,12 @@ read_file (SNPQueue *snpq, TaskRead *tr)
SNPTable *tbl = snpq->free_tables[--snpq->n_free_tables];
gt4_queue_unlock (&snpq->lmq.queue);
tbl->nwords = 0;
+ tbl->n_n = 0;
tbl->n_nucl = 0;
tbl->n_gc = 0;
tr->data = tbl;
/* if (debug > 0) fprintf (stderr, "Thread %d: reading file %s from %llu\n", idx, tt->_seqfile->path, tf->task_read.reader.cpos); */
- result = fasta_reader_read_nwords (&tr->reader, BLOCK_SIZE, start_sequence, end_sequence, NULL, (stats) ? read_nucleotide : NULL, read_word, tr);
+ result = fasta_reader_read_nwords (&tr->reader, BLOCK_SIZE, start_sequence, end_sequence, (stats) ? read_character : NULL, (stats) ? read_nucleotide : NULL, read_word, tr);
if (result) {
fprintf (stderr, "read_file: Fasta reader %s returned %u\n", tr->reader.id, result);
if (!recover) exit (1);
@@ -756,6 +765,8 @@ process_table (SNPQueue *snpq, TaskTable *tt, unsigned int thread_idx)
gt4_queue_lock (&snpq->lmq.queue);
/* fixme: Create separate task / mutex */
if (stats) {
+ snpq->n_seq += tbl->n_nucl;
+ snpq->n_seq += tbl->n_n;
snpq->n_nucl += tbl->n_nucl;
snpq->n_gc += tbl->n_gc;
snpq->n_kmers_total += tbl->nwords;
@@ -914,6 +925,16 @@ read_nucleotide (GT4FastaReader *reader, unsigned int nucl, void *data)
return 0;
}
+/*
+ * Callback handed to fasta_reader_read_nwords when statistics are enabled.
+ * Adds one to the n_n counter of the current table for every 'N' or 'n'.
+ *
+ * reader - unused
+ * ch     - character being reported
+ * data   - must be the TaskRead whose data member points to the SNPTable
+ *
+ * Always returns 0.
+ */
+static int
+read_character (GT4FastaReader *reader, unsigned int ch, void *data)
+{
+	SNPTable *table = (SNPTable *) ((TaskRead *) data)->data;
+
+	switch (ch) {
+	case 'N':
+	case 'n':
+		table->n_n += 1;
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
static int
compare_counts (const void *lhs, const void *rhs) {
if (*((unsigned int *) lhs) < *((unsigned int *) rhs)) return -1;
=====================================
src/set-operations.c
=====================================
@@ -127,3 +127,103 @@ gt4_write_union (AZObject *arrays[], unsigned int n_arrays, unsigned int cutoff,
return 0;
}
+
+/*
+ * Merge-walk n_objs word lists and report every distinct word once.
+ *
+ * objs     - objects providing the GT4_TYPE_WORD_SLIST interface
+ * n_objs   - number of objects; must be in 1..GT4_MAX_SETS
+ * callback - called as callback (word, counts, data); counts[j] is the count of
+ *            word in objs[j], or 0 when objs[j] does not contain it
+ * data     - passed to callback unchanged
+ *
+ * Returns the first non-zero callback result (the walk stops there), or 0 once
+ * all lists are exhausted. A failed argument check yields 1 (assuming
+ * arikkei_return_val_if_fail returns its second argument - confirm).
+ *
+ * NOTE(review): the merge relies on every list yielding words in ascending
+ * order - confirm against the SList implementations.
+ */
+unsigned int
+gt4_union (AZObject *objs[], unsigned int n_objs, unsigned int (*callback) (uint64_t, uint32_t *, void *), void *data)
+{
+	GT4WordSListImplementation *impls[GT4_MAX_SETS];
+	GT4WordSListInstance *insts[GT4_MAX_SETS];
+	/* active[j] is non-zero while objs[j] still has an unreported current word */
+	unsigned char active[GT4_MAX_SETS];
+	uint32_t counts[GT4_MAX_SETS];
+	unsigned int n_active = 0;
+	unsigned long long word = 0xffffffffffffffffULL;
+	unsigned int j;
+
+	arikkei_return_val_if_fail (n_objs > 0, 1);
+	arikkei_return_val_if_fail (n_objs <= GT4_MAX_SETS, 1);
+
+	/* Slot j always belongs to objs[j], so counts[] stays aligned with the
+	 * caller's list order even when some of the lists are empty */
+	for (j = 0; j < n_objs; j++) {
+		impls[j] = (GT4WordSListImplementation *) az_object_get_interface (AZ_OBJECT(objs[j]), GT4_TYPE_WORD_SLIST, (void **) &insts[j]);
+		active[j] = 0;
+		if (!insts[j]->num_words) continue;
+		gt4_word_slist_get_first_word (impls[j], insts[j]);
+		if (insts[j]->idx >= insts[j]->num_words) continue;
+		active[j] = 1;
+		n_active += 1;
+		/* The smallest first word is where the merge starts */
+		if (insts[j]->word < word) word = insts[j]->word;
+	}
+
+	/* Iterate until all lists are exhausted */
+	while (n_active) {
+		unsigned long long next = 0xffffffffffffffffULL;
+		unsigned int result;
+		for (j = 0; j < n_objs; j++) {
+			counts[j] = 0;
+			if (!active[j]) continue;
+			if (insts[j]->word == word) {
+				counts[j] = insts[j]->count;
+				if (!gt4_word_slist_get_next_word (impls[j], insts[j]) || (insts[j]->idx >= insts[j]->num_words)) {
+					/* Exhausted: its stale word must not be a candidate for next */
+					active[j] = 0;
+					n_active -= 1;
+					continue;
+				}
+			}
+			if (insts[j]->word < next) next = insts[j]->word;
+		}
+		/* Now we have all freqs */
+		result = callback (word, counts, data);
+		if (result) return result;
+		word = next;
+	}
+
+	return 0;
+}
+
+/*
+ * Walk the words of the first list (objs[0]) and report, for each of them,
+ * its count in every list.
+ *
+ * objs     - objects providing the GT4_TYPE_WORD_SLIST interface
+ * n_objs   - number of objects; must be in 1..GT4_MAX_SETS
+ * callback - called as callback (word, counts, data); counts[0] is the count in
+ *            objs[0], counts[j] the count in objs[j] or 0 when absent
+ * data     - passed to callback unchanged
+ *
+ * Returns the first non-zero callback result (the walk stops there), or 0 once
+ * the first list is exhausted. A failed argument check yields 1 (assuming
+ * arikkei_return_val_if_fail returns its second argument - confirm).
+ *
+ * NOTE(review): relies on every list yielding words in ascending order -
+ * confirm against the SList implementations.
+ */
+unsigned int
+gt4_is_union (AZObject *objs[], unsigned int n_objs, unsigned int (*callback) (uint64_t, uint32_t *, void *), void *data)
+{
+	GT4WordSListImplementation *impls[GT4_MAX_SETS];
+	GT4WordSListInstance *insts[GT4_MAX_SETS];
+	uint32_t counts[GT4_MAX_SETS];
+	unsigned int j;
+
+	arikkei_return_val_if_fail (n_objs > 0, 1);
+	arikkei_return_val_if_fail (n_objs <= GT4_MAX_SETS, 1);
+
+	/* Slot j always belongs to objs[j]; empty lists keep their slot and simply
+	 * never satisfy idx < num_words below */
+	for (j = 0; j < n_objs; j++) {
+		impls[j] = (GT4WordSListImplementation *) az_object_get_interface (AZ_OBJECT(objs[j]), GT4_TYPE_WORD_SLIST, (void **) &insts[j]);
+		if (insts[j]->num_words) {
+			gt4_word_slist_get_first_word (impls[j], insts[j]);
+		}
+	}
+
+	while (insts[0]->idx < insts[0]->num_words) {
+		unsigned long long word = insts[0]->word;
+		unsigned int result;
+		counts[0] = insts[0]->count;
+		for (j = 1; j < n_objs; j++) {
+			counts[j] = 0;
+			/* Catch list j up to word; give up if it cannot advance any further */
+			while ((insts[j]->idx < insts[j]->num_words) && (insts[j]->word < word)) {
+				if (!gt4_word_slist_get_next_word (impls[j], insts[j])) break;
+			}
+			if ((insts[j]->idx < insts[j]->num_words) && (insts[j]->word == word)) {
+				counts[j] = insts[j]->count;
+			}
+		}
+		/* Now we have all freqs */
+		result = callback (word, counts, data);
+		if (result) return result;
+		if (!gt4_word_slist_get_next_word (impls[0], insts[0])) break;
+	}
+
+	return 0;
+}
+
=====================================
src/set-operations.h
=====================================
@@ -33,4 +33,9 @@
unsigned int gt4_write_union (AZObject *arrays[], unsigned int n_arrays, unsigned int cutoff, int ofile, GT4ListHeader *header);
+/* Executes callback for each unique kmer found in the given lists; counts has one entry per list */
+/* If any callback returns non-zero, reading stops and that result is returned */
+unsigned int gt4_union (AZObject *objs[], unsigned int n_objs, unsigned int (*callback) (uint64_t, uint32_t *, void *), void *data);
+/* Same callback contract, but only the kmers of the leading list drive the iteration */
+unsigned int gt4_is_union (AZObject *objs[], unsigned int n_objs, unsigned int (*callback) (uint64_t, uint32_t *, void *), void *data);
+
#endif
=====================================
src/utils.c
=====================================
@@ -78,6 +78,7 @@ scout_map (void *arg)
val += scout->cdata[i];
if (!scout->running) break;
}
+ pthread_exit (NULL);
return (void *) val;
}
@@ -93,8 +94,8 @@ void
gt4_delete_scout (GT4Scout *scout)
{
if (!scout->running) return;
- pthread_join (scout->thread, (void **) NULL);
scout->running = 0;
+ pthread_join (scout->thread, (void **) NULL);
}
/* this implementation is based on:
=====================================
src/version.h
=====================================
@@ -26,7 +26,7 @@
#define VERSION_MAJOR 4
#define VERSION_MINOR 2
-#define VERSION_MICRO 7
+#define VERSION_MICRO 16
#define VERSION_QUALIFIER "stable"
#endif
=====================================
src/word-map.c
=====================================
@@ -197,7 +197,7 @@ gt4_word_map_new (const char *listfilename, unsigned int major_version, unsigned
wmap->file_size = csize;
if (hdr->version_minor == 0) {
memcpy (&wmap->header, hdr, sizeof (struct _GT4ListHeader_4_0));
- wmap->header.list_start = sizeof (GT4ListHeader);
+ wmap->header.list_start = sizeof (struct _GT4ListHeader_4_0);
wmap->header.word_bytes = 8;
wmap->header.count_bytes = 4;
} else if (hdr->version_minor <= 2) {
@@ -309,21 +309,3 @@ gt4_word_map_lookup (GT4WordMap *wmap, unsigned long long query)
return 0;
}
-uint64_t
-gt4_word_map_get_word (const GT4WordMap *wmap, uint64_t idx)
-{
- return *((uint64_t *) (wmap->wordlist + 12 * idx));
-}
-
-uint32_t
-gt4_word_map_get_count (const GT4WordMap *wmap, uint64_t idx)
-{
- return *((uint32_t *) (wmap->wordlist + 12 * idx + 8));
-}
-
-uint64_t *
-gt4_word_map_get_word_ptr (const GT4WordMap *wmap, uint64_t idx)
-{
- return (uint64_t *) (wmap->wordlist + 12 * idx);
-}
-
=====================================
src/word-map.h
=====================================
@@ -86,19 +86,19 @@ struct _GT4WordMapClass {
unsigned int gt4_word_map_get_type (void);
-inline extern uint64_t
+static __inline__ uint64_t
gt4_word_map_get_word (const GT4WordMap *wmap, uint64_t idx)
{
return *((uint64_t *) (wmap->wordlist + 12 * idx));
}
-inline extern uint32_t
+static __inline__ uint32_t
gt4_word_map_get_count (const GT4WordMap *wmap, uint64_t idx)
{
return *((uint32_t *) (wmap->wordlist + 12 * idx + 8));
}
-inline extern uint64_t *
+static __inline__ uint64_t *
gt4_word_map_get_word_ptr (const GT4WordMap *wmap, uint64_t idx)
{
return (uint64_t *) (wmap->wordlist + 12 * idx);
View it on GitLab: https://salsa.debian.org/med-team/genometester/-/commit/aa4cc7b11bf166cb484e0700dc06e6037ab02bda
--
View it on GitLab: https://salsa.debian.org/med-team/genometester/-/commit/aa4cc7b11bf166cb484e0700dc06e6037ab02bda
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20251211/833feb88/attachment-0001.htm>
More information about the debian-med-commit
mailing list