[med-svn] [htslib] 02/06: synced_reader: Improved regions/targets to be faster with large number of sequences; More userfriendly regions_overlap() function

Charles Plessy plessy at moszumanska.debian.org
Tue Dec 10 10:06:47 UTC 2013


This is an automated email from the git hooks/post-receive script.

plessy pushed a commit to branch debian/unstable
in repository htslib.

commit 0908d4ce7aac6ef8ad00783ce62c2b8db6b6379b
Author: Petr Danecek <pd3 at sanger.ac.uk>
Date:   Thu Dec 5 16:01:42 2013 +0000

    synced_reader: Improved regions/targets to be faster with large number of sequences; More userfriendly regions_overlap() function
---
 Makefile                   |   2 +-
 htslib/khash_str2int.h     |  98 +++++++++++++++++++
 htslib/synced_bcf_reader.h |  28 +++---
 synced_bcf_reader.c        | 232 ++++++++++++++++++++++-----------------------
 4 files changed, 229 insertions(+), 131 deletions(-)

diff --git a/Makefile b/Makefile
index 4a57d82..bce92b6 100644
--- a/Makefile
+++ b/Makefile
@@ -173,7 +173,7 @@ sam.o sam.pico: sam.c $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) hfile.h htslib/
 tbx.o tbx.pico: tbx.c $(htslib_tbx_h) $(htslib_bgzf_h) htslib/khash.h
 faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) htslib/khash.h htslib/knetfile.h
 razf.o razf.pico: razf.c $(htslib_razf_h)
-synced_bcf_reader.o synced_bcf_reader.pico: synced_bcf_reader.c $(htslib_synced_bcf_reader_h) htslib/kseq.h
+synced_bcf_reader.o synced_bcf_reader.pico: synced_bcf_reader.c $(htslib_synced_bcf_reader_h) htslib/kseq.h htslib/khash_str2int.h
 vcf_sweep.o vcf_sweep.pico: vcf_sweep.c $(htslib_vcf_sweep_h) $(htslib_bgzf_h)
 vcfutils.o vcfutils.pico: vcfutils.c $(htslib_vcfutils_h)
 kfunc.o kfunc.pico: kfunc.c htslib/kfunc.h
diff --git a/htslib/khash_str2int.h b/htslib/khash_str2int.h
new file mode 100644
index 0000000..d9c7ac3
--- /dev/null
+++ b/htslib/khash_str2int.h
@@ -0,0 +1,98 @@
+#ifndef __KHASH_UTILS_H__
+#define __KHASH_UTILS_H__
+
+#include <htslib/khash.h>
+
+KHASH_MAP_INIT_STR(str2int, int)
+
+/*
+ *  Wrappers for khash dictionaries used by mpileup. 
+ */
+
+static inline void *khash_str2int_init()
+{
+    return kh_init(str2int);
+}
+
+/*
+ *  Destroy the hash structure, but not the keys
+ */ 
+static inline void khash_str2int_destroy(void *_hash)
+{
+    khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+    if (hash) kh_destroy(str2int, hash); // Note that strings are not freed.
+}
+
+/*
+ *  Destroys both the hash structure and the keys
+ */ 
+static inline void khash_str2int_destroy_free(void *_hash)
+{
+    khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+    khint_t k;
+    if (hash == 0) return;
+    for (k = 0; k < kh_end(hash); ++k)
+        if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
+    kh_destroy(str2int, hash);
+}
+
+/*
+ *  Returns 1 if key exists or 0 if not
+ */
+static inline int khash_str2int_has_key(void *_hash, const char *str)
+{
+    khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+    khint_t k = kh_get(str2int, hash, str);
+    if ( k == kh_end(hash) ) return 0;
+    return 1;
+}
+
+/*
+ *  Returns 0 on success and -1 when the key is not present. On success,
+ *  *value is set, unless NULL is passed.
+ */
+static inline int khash_str2int_get(void *_hash, const char *str, int *value)
+{
+    khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+    khint_t k;
+    if ( !hash ) return -1;
+    k = kh_get(str2int, hash, str);
+    if ( k == kh_end(hash) ) return -1;
+    if ( !value ) return 0;
+    *value = kh_val(hash, k);
+    return 0;
+}
+
+/*
+ *  Add a new string to the dictionary, auto-incrementing the value.
+ *  On success returns the newly inserted integer id, on error -1
+ *  is returned.
+ */
+static inline int khash_str2int_inc(void *_hash, const char *str)
+{
+    khint_t k;
+    int ret;
+    khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+    if ( !hash ) return -1;
+    k = kh_put(str2int, hash, str, &ret);
+    if (ret == 0) return kh_val(hash, k);
+    kh_val(hash, k) = kh_size(hash) - 1;
+    return kh_val(hash, k);
+}
+
+/*
+ *  Set a new key,value pair. On success returns the bin index, on
+ *  error -1 is returned.
+ */
+static inline int khash_str2int_set(void *_hash, const char *str, int value)
+{
+    khint_t k;
+    int ret;
+    khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+    if ( !hash ) return -1;
+    k = kh_put(str2int, hash, str, &ret);
+    kh_val(hash,k) = value;
+    return k;
+}
+
+#endif
diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h
index 9168e52..05de019 100644
--- a/htslib/synced_bcf_reader.h
+++ b/htslib/synced_bcf_reader.h
@@ -55,14 +55,13 @@ typedef struct
 
     // for in-memory regions (small data)
     struct _region_t *regs; // the regions
-    int nregs, ireg;        // number of regions; current region
 
     // shared by both tabix-index and in-memory regions
-    char **snames;          // sequence names
+    void *seq_hash;         // keys: sequence names, values: index to seqs
+    char **seq_names;       // sequence names
     int nseqs;              // number of sequences (chromosomes) in the file
-    char *seq;              // current position: chr name
+    int iseq;               // current position: chr name, index to snames
     int start, end;         // current position: start, end of the region (0-based)
-    int done;               // finished the region or the whole file if streaming
 }
 bcf_sr_regions_t;
 
@@ -103,8 +102,6 @@ typedef struct
 	char **samples;	// List of samples 
     bcf_sr_regions_t *regions, *targets;    // see bcf_sr_set_[targets|regions] for description
     int targets_als;    // subset to targets not only by position but also by alleles? (todo)
-    char *cseq;         // current sequence for targets, when regions are present
-    int crid;           // current sequence for targets, with single VCF
     kstring_t tmps;
 	int n_smpl;
 }
@@ -186,12 +183,15 @@ int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions);
 /*
  *  bcf_sr_regions_init() 
  *  @regions:   regions can be either a comma-separated list of regions
- *              (chr,chr:from-to) or a name of a tabix indexed file with a list
- *              of regions (<chr,pos> or <chr,from,to>).  Coordinates are
- *              one-based and inclusive.
+ *              (chr|chr:pos|chr:from-to|chr:from-) or VCF, BED, or
+ *              tab-delimited file (the default). The columns of the
+ *              tab-delimited file are: CHROM, POS, and, optionally, POS_TO,
+ *              where positions are 1-based and inclusive. Uncompressed files
+ *              are stored in memory, while bgzip-compressed and tabix-indexed
+ *              region files are streamed.
  */
 bcf_sr_regions_t *bcf_sr_regions_init(const char *regions);
-void bcf_sr_regions_destroy(bcf_sr_regions_t *reg);
+void bcf_sr_regions_destroy(bcf_sr_regions_t *regions);
 
 /*
  *  bcf_sr_regions_seek() - seek to the chromosome block
@@ -211,14 +211,14 @@ int bcf_sr_regions_seek(bcf_sr_regions_t *regions, const char *chr);
 int bcf_sr_regions_next(bcf_sr_regions_t *reg);
 
 /*
- *  bcf_sr_regions_query() - checks if the interval <start,end> overlaps any of
- *  the regions.  The sequence name is set by bcf_sr_regions_seek(). The
- *  coordinate queries must come in ascending order.
+ *  bcf_sr_regions_overlap() - checks if the interval <start,end> overlaps any of
+ *  the regions, the coordinates are 0-based, inclusive. The coordinate queries
+ *  must come in ascending order.
  *
  *  Returns 0 if the position is in regions; -1 if the position is not in the
  *  regions and more regions exist; -2 if not in the regions and there are no more
  *  regions left.
  */
-int bcf_sr_regions_query(bcf_sr_regions_t *reg, int start, int end);
+int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end);
 
 #endif
diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c
index f28e3e7..6ec258b 100644
--- a/synced_bcf_reader.c
+++ b/synced_bcf_reader.c
@@ -7,11 +7,18 @@
 #include <sys/stat.h>
 #include "htslib/synced_bcf_reader.h"
 #include "htslib/kseq.h"
+#include "htslib/khash_str2int.h"
+
+typedef struct
+{
+    uint32_t start, end;
+}
+region1_t;
 
 typedef struct _region_t 
 {
-    char *chr;
-    int start, end;
+    region1_t *regs;
+    int nregs, mregs, creg;
 }
 region_t;
 
@@ -167,7 +174,6 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
 bcf_srs_t *bcf_sr_init(void)
 {
     bcf_srs_t *files = (bcf_srs_t*) calloc(1,sizeof(bcf_srs_t));
-    files->crid = -1;
     return files;
 }
 
@@ -313,13 +319,13 @@ static int _readers_next_region(bcf_srs_t *files)
         bcf_sr_t *reader = &files->readers[i];
         if ( reader->tbx_idx )
         {
-            int tid = tbx_name2id(reader->tbx_idx, files->regions->seq);
+            int tid = tbx_name2id(reader->tbx_idx, files->regions->seq_names[files->regions->iseq]);
             if ( tid==-1 ) continue;    // the sequence not present in this file
             reader->itr = tbx_itr_queryi(reader->tbx_idx,tid,files->regions->start,files->regions->end+1);
         }
         else
         {
-            int tid = bcf_hdr_name2id(reader->header,files->regions->seq);
+            int tid = bcf_hdr_name2id(reader->header, files->regions->seq_names[files->regions->iseq]);
             if ( tid==-1 ) continue;    // the sequence not present in this file
             reader->itr = bcf_itr_queryi(reader->bcf_idx,tid,files->regions->start,files->regions->end+1);
         }
@@ -505,13 +511,18 @@ int _reader_next_line(bcf_srs_t *files)
         if ( files->regions && _readers_next_region(files)<0 ) break;
 
         // Fill buffers
+        const char *chr = NULL;
         for (i=0; i<files->nreaders; i++)
         {
             _reader_fill_buffer(files, &files->readers[i]);
 
             // Update the minimum coordinate
             if ( !files->readers[i].nbuffer ) continue;
-            if ( min_pos > files->readers[i].buffer[1]->pos ) min_pos = files->readers[i].buffer[1]->pos; 
+            if ( min_pos > files->readers[i].buffer[1]->pos ) 
+            {
+                min_pos = files->readers[i].buffer[1]->pos; 
+                chr = bcf_seqname(files->readers[i].header, files->readers[i].buffer[1]);
+            }
         }
         if ( min_pos==INT_MAX ) 
         {
@@ -522,27 +533,7 @@ int _reader_next_line(bcf_srs_t *files)
         // Skip this position if not present in targets
         if ( files->targets )
         {
-            if  ( files->regions )
-            {
-                if ( !files->cseq || files->cseq!=files->regions->seq )
-                {
-                    bcf_sr_regions_seek(files->targets, files->regions->seq);
-                    files->cseq = files->regions->seq;  // set the current sequence
-                }
-            }
-            else
-            {
-                // If here, we must be streaming a single VCF, otherwise either explicit
-                // or implicit regions would be set. We can safely use rid as a unique sequence
-                // identifier
-                int rid = files->readers[0].buffer[1]->rid;
-                if ( files->crid<0 || files->crid!=rid )
-                {
-                    bcf_sr_regions_seek(files->targets, files->readers[0].header->id[BCF_DT_CTG][rid].key);
-                    files->crid = rid;
-                }
-            }
-            if ( bcf_sr_regions_query(files->targets, min_pos, min_pos)<0 ) 
+            if ( bcf_sr_regions_overlap(files->targets, chr, min_pos, min_pos)<0 ) 
             {
                 // Remove all lines with this position from the buffer
                 for (i=0; i<files->nreaders; i++)
@@ -785,7 +776,8 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname)
 }
 
 
-// Add a new region into a list sorted by start,end (1-based coordinates)
+// Add a new region into a list sorted by start,end. On input the coordinates
+// are 1-based, stored 0-based, inclusive.
 static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end)
 {
     if ( start==-1 && end==-1 )
@@ -797,48 +789,53 @@ static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int
         start--; end--; // store 0-based coordinates
     }
 
-    int i;
-    for (i=0; i<reg->nregs; i++)
-        if ( !strcmp(reg->regs[i].chr,chr) ) break;
-    if ( i<reg->nregs && !strcmp(reg->regs[i].chr,chr) ) // the chromosome block already exists
+    if ( !reg->seq_hash )
+         reg->seq_hash = khash_str2int_init();
+
+    int iseq;
+    if ( khash_str2int_get(reg->seq_hash, chr, &iseq)<0 )
     {
-        for (; i<reg->nregs; i++)
-            if ( strcmp(reg->regs[i].chr,chr) || reg->regs[i].start >= start ) break;
+        // the chromosome block does not exist
+        iseq = reg->nseqs++;
+        reg->seq_names = (char**) realloc(reg->seq_names,sizeof(char*)*reg->nseqs);
+        reg->regs = (region_t*) realloc(reg->regs,sizeof(region_t)*reg->nseqs);
+        memset(&reg->regs[reg->nseqs-1],0,sizeof(region_t));
+        reg->seq_names[iseq] = strdup(chr);
+        reg->regs[iseq].creg = -1;
+        khash_str2int_set(reg->seq_hash,reg->seq_names[iseq],iseq);
+    }
 
-        // return if the region already exists
-        if ( i<reg->nregs && !strcmp(reg->regs[i].chr,chr) && reg->regs[i].start==start && reg->regs[i].end==end ) return;
+    region_t *creg = &reg->regs[iseq];
 
-        reg->regs = (region_t*) realloc(reg->regs,sizeof(region_t)*(reg->nregs+1));
-        if ( i<reg->nregs )
-            memmove(&reg->regs[i+1],&reg->regs[i],(reg->nregs - i)*sizeof(region_t));
+    // the regions may not be sorted on input: binary search
+    int i, min = 0, max = creg->nregs - 1;
+    while ( min<=max )
+    {
+        i = (max+min)/2;
+        if ( start < creg->regs[i].start ) max = i - 1;
+        else if ( start > creg->regs[i].start ) min = i + 1;
+        else break;
     }
-    else
-        reg->regs = (region_t*) realloc(reg->regs,sizeof(region_t)*(reg->nregs+1));
-
-    // Check if a new sequence name has to be added
-    int j;
-    for (j=0; j<reg->nseqs; j++)
-        if ( !strcmp(chr,reg->snames[j]) ) break;
-    if ( j==reg->nseqs )
+    if ( min>max || creg->regs[i].start!=start || creg->regs[i].end!=end )
     {
-        reg->nseqs++;
-        reg->snames = (char**) realloc(reg->snames,sizeof(char*)*reg->nseqs);
-        reg->snames[j] = strdup(chr);
+        // no such region, insert a new one just after max
+        hts_expand(region1_t,creg->nregs+1,creg->mregs,creg->regs);
+        if ( ++max < creg->nregs )
+            memmove(&creg->regs[max+1],&creg->regs[max],(creg->nregs - max)*sizeof(region1_t));
+        creg->regs[max].start = start;
+        creg->regs[max].end   = end;
+        creg->nregs++;
     }
-
-    reg->nregs++;
-    reg->regs[i].chr   = reg->snames[j];
-    reg->regs[i].start = start;
-    reg->regs[i].end   = end;
 }
 
-// File name or a list of genomic locations
+// File name or a list of genomic locations. If file name, NULL is returned.
 static bcf_sr_regions_t *_regions_init_string(const char *str)
 {
     struct stat sbuf;
     if ( stat(str, &sbuf)==0 ) return NULL; // it's a file
 
     bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
+    reg->start = reg->end = -1;
 
     kstring_t tmp = {0,0,0};
     const char *sp = str, *ep = str;
@@ -889,11 +886,10 @@ static bcf_sr_regions_t *_regions_init_string(const char *str)
         }
     }
     free(tmp.s);
-    reg->ireg = -1;
     return reg;
 }
 
-// ichr,ifrom,ito are 0-based; line will be modified so that the *chr pointer is 0-terminated
+// ichr,ifrom,ito are 0-based; line will be modified so that the *chr pointer is 0-terminated.
 // returns -1 on error, 0 if the line is a comment line, 1 on success
 static int _regions_parse_line(char *line, int ichr,int ifrom,int ito, char **chr,int *from,int *to)
 {
@@ -958,7 +954,7 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions)
     if ( reg ) return reg;
 
     reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
-    reg->ireg = -1;
+    reg->start = reg->end = -1;
 
     reg->file = hts_open(regions, "rb");
     if ( !reg->file )
@@ -995,10 +991,16 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions)
             _regions_add(reg, chr, from, to);
         }
         hts_close(reg->file); reg->file = NULL;
-        if ( !reg->nregs ) { free(reg); return NULL; }
+        if ( !reg->nseqs ) { free(reg); return NULL; }
         return reg;
     }
-    reg->snames = (char**) tbx_seqnames(reg->tbx, &reg->nseqs);
+
+    reg->seq_names = (char**) tbx_seqnames(reg->tbx, &reg->nseqs);
+    if ( !reg->seq_hash )
+        reg->seq_hash = khash_str2int_init();
+    int i;
+    for (i=0; i<reg->nseqs; i++)
+        khash_str2int_set(reg->seq_hash,reg->seq_names[i],i);
     reg->fname  = strdup(regions);
     reg->is_bin = 1;
     return reg;
@@ -1007,67 +1009,66 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions)
 void bcf_sr_regions_destroy(bcf_sr_regions_t *reg)
 {
     int i;
-    free(reg->regs);
     free(reg->fname);
     if ( reg->itr ) tbx_itr_destroy(reg->itr);
     if ( reg->tbx ) tbx_destroy(reg->tbx);
     if ( reg->file ) hts_close(reg->file);
     if ( reg->als ) free(reg->als);
     free(reg->line.s);
-    if (reg->regs) 
-        for (i=0; i<reg->nseqs; i++) free(reg->snames[i]);  // free only in-memory names, tbx names are const
-    free(reg->snames);
+    if ( reg->regs ) 
+    {
+         // free only in-memory names, tbx names are const
+        for (i=0; i<reg->nseqs; i++) 
+        {   
+            free(reg->seq_names[i]);
+            free(reg->regs[i].regs);
+        }
+    }
+    free(reg->regs);
+    free(reg->seq_names);
+    khash_str2int_destroy(reg->seq_hash);
     free(reg);
 }
 
 int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq)
 {
-    reg->done = 1;
-    reg->start = reg->end = -1;
+    reg->iseq = reg->start = reg->end = -1;
+    if ( khash_str2int_get(reg->seq_hash, seq, &reg->iseq) < 0 ) return -1;  // sequence seq not in regions
 
-    int i;
-    if ( reg->regs )    // using in-memory regions
-    {
-        for (i=0; i<reg->nregs; i++)
-            if ( !strcmp(seq,reg->regs[i].chr) ) break;
-        reg->ireg = i-1;
-        if ( i==reg->nregs ) return -1;
-        reg->seq  = reg->snames[i]; 
-        reg->done = 0;
-        return 0;
-    }
+    // using in-memory regions
+    if ( reg->regs ) return 0;
 
     // reading regions from tabix
     if ( reg->itr ) tbx_itr_destroy(reg->itr);
     reg->itr = tbx_itr_querys(reg->tbx, seq);
-    if ( reg->itr )
-    {
-        for (i=0; i<reg->nseqs; i++) 
-            if (!strcmp(seq,reg->snames[i]) ) break;
-        if ( i==reg->nseqs ) return -1;
-        reg->seq  = reg->snames[i];
-        reg->done = 0;
-        return 0;
-    }
+    if ( reg->itr ) return 0;
+
     return -1;
 }
 
 int bcf_sr_regions_next(bcf_sr_regions_t *reg)
 {
-    if ( reg->done ) return -1;
-    reg->seq  = NULL; reg->start = reg->end = -1;
+    if ( reg->iseq<0 ) return -1;
+    reg->start = reg->end = -1;
     reg->nals = 0;
 
-    if ( reg->regs )    // using in-memory regions
+    // using in-memory regions
+    if ( reg->regs )
     {
-        reg->ireg++;
-        if ( reg->ireg>=reg->nregs ) { reg->done = 1; return -1; } // no more regions left
-        reg->seq   = reg->regs[reg->ireg].chr;
-        reg->start = reg->regs[reg->ireg].start;
-        reg->end   = reg->regs[reg->ireg].end;
+        while ( reg->iseq < reg->nseqs )
+        {
+            reg->regs[reg->iseq].creg++;
+            if ( reg->regs[reg->iseq].creg < reg->regs[reg->iseq].nregs ) break;
+            reg->iseq++;
+        }
+        if ( reg->iseq >= reg->nseqs ) { reg->iseq = -1; return -1; } // no more regions left
+        region1_t *creg = &reg->regs[reg->iseq].regs[reg->regs[reg->iseq].creg];
+        reg->start = creg->start;
+        reg->end   = creg->end;
         return 0;
     }
 
+    // reading from tabix
     char *chr;
     int ichr = 0, ifrom = 1, ito = 2, is_bed = 0, from, to;
     if ( reg->tbx )
@@ -1086,7 +1087,7 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg)
         {
             // tabix index present, reading a chromosome block
             ret = tbx_itr_next(reg->file, reg->tbx, reg->itr, &reg->line);
-            if ( ret<0 ) { reg->done = 1; return -1; }
+            if ( ret<0 ) { reg->iseq = -1; return -1; }
         }
         else
         {
@@ -1108,7 +1109,7 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg)
 
             // tabix index absent, reading the whole file
             ret = hts_getline(reg->file, KS_SEP_LINE, &reg->line);
-            if ( ret<0 ) { reg->done = 1; return -1; }
+            if ( ret<0 ) { reg->iseq = -1; return -1; }
         }
         ret = _regions_parse_line(reg->line.s, ichr,ifrom,ito, &chr,&from,&to);
         if ( ret<0 ) 
@@ -1119,21 +1120,19 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg)
     }
     if ( is_bed ) from++;
 
-    // find the chromosome name: assuming small number of chromosomes!
-    int i;
-    for (i=0; i<reg->nseqs; i++) 
-        if (!strcmp(chr,reg->snames[i]) ) break;
-    if ( !(i<reg->nseqs) ) fprintf(stderr,"i=%d nseq=%d  [%s][%s] [%s]\n", i,reg->nseqs,chr,reg->line.s,reg->snames[0]);
-    assert( i<reg->nseqs );
+    if ( khash_str2int_get(reg->seq_hash, chr, &reg->iseq)<0 )
+    {
+        fprintf(stderr,"Broken tabix index? The sequence \"%s\" not in dictionary [%s]\n", chr,reg->line.s);
+        exit(1);
+    }
 
     // This is a bit hacky: unset the chr-terminating 0 set by _regions_parse_line, or
     //  otherwise _regions_match_alleles will be confused.
     int len = strlen(chr);
     if ( len < reg->line.l ) chr[len] = '\t';
 
-    reg->seq    = reg->snames[i];
-    reg->start  = from - 1;
-    reg->end    = to - 1;
+    reg->start = from - 1;
+    reg->end   = to - 1;
     return 0;
 }
 
@@ -1170,21 +1169,22 @@ static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *re
     return !(type & VCF_INDEL) ? 1 : 0;
 }
 
-int bcf_sr_regions_query(bcf_sr_regions_t *reg, int start, int end)
+int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end)
 {
-    if ( reg->done ) return -2;     // no more regions left
+    if ( reg->iseq < 0 ) return -2;     // no more regions left
 
-    // init regions if it was not done already
-    if ( reg->start==-1 )
-        if ( bcf_sr_regions_next(reg) < 0 ) return -2;  // no more regions left
+    int iseq;
+    if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1;    // no such sequence
+
+    // new sequence
+    if ( iseq!=reg->iseq ) { reg->start = reg->end = -1; reg->iseq = iseq; }
 
-    char *seq = reg->seq;
-    while ( reg->seq==seq && reg->end < start )
+    while ( iseq==reg->iseq && reg->end < start )
     {
         if ( bcf_sr_regions_next(reg) < 0 ) return -2;  // no more regions left
+        if ( reg->iseq != iseq ) return -1; // does not overlap any regions
     }
-    if ( reg->seq != seq ) return -2;   // different chromosome, new seek is necessary
-    if ( reg->start <= end ) return 0;  // is in a region
-    return -1;  // does not overlap any region 
+    if ( reg->start <= end ) return 0;    // region overlap
+    return -1;  // no overlap
 }
 

-- 
Alioth's /git/debian-med/git-commit-notice on /srv/git.debian.org/git/debian-med/htslib.git



More information about the debian-med-commit mailing list