[med-svn] [htslib] 01/10: Added regidx API which should replace bcf_sr_regions at some point

Charles Plessy plessy at moszumanska.debian.org
Sun Oct 5 00:54:09 UTC 2014


This is an automated email from the git hooks/post-receive script.

plessy pushed a commit to branch debian/unstable
in repository htslib.

commit 5b98adcd6c52cdabcf6f71877ca94d370edf5eb6
Author: Petr Danecek <pd3 at sanger.ac.uk>
Date:   Thu Sep 25 10:18:44 2014 +0100

    Added regidx API which should replace bcf_sr_regions at some point
---
 Makefile               |  12 +-
 htslib/khash_str2int.h |   9 ++
 htslib/regidx.h        | 136 ++++++++++++++++++++++
 htslib_vars.mk         |   1 +
 regidx.c               | 306 +++++++++++++++++++++++++++++++++++++++++++++++++
 test/test-regidx.c     | 116 +++++++++++++++++++
 6 files changed, 578 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index ed9c9ce..cbc5360 100644
--- a/Makefile
+++ b/Makefile
@@ -60,7 +60,8 @@ BUILT_TEST_PROGRAMS = \
 	test/sam \
 	test/test_view \
 	test/test-vcf-api \
-	test/test-vcf-sweep
+	test/test-vcf-sweep \
+	test/test-regidx
 
 all: lib-static lib-shared $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS)
 
@@ -154,7 +155,8 @@ LIBHTS_OBJS = \
 	cram/string_alloc.o \
 	cram/thread_pool.o \
 	cram/vlen.o \
-	cram/zfio.o
+	cram/zfio.o \
+	regidx.o
 
 
 libhts.a: $(LIBHTS_OBJS)
@@ -204,6 +206,7 @@ synced_bcf_reader.o synced_bcf_reader.pico: synced_bcf_reader.c $(htslib_synced_
 vcf_sweep.o vcf_sweep.pico: vcf_sweep.c $(htslib_vcf_sweep_h) $(htslib_bgzf_h)
 vcfutils.o vcfutils.pico: vcfutils.c $(htslib_vcfutils_h)
 kfunc.o kfunc.pico: kfunc.c htslib/kfunc.h
+regidx.o regidx.pico: regidx.c $(htslib_regidx_h)
 
 cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c $(cram_h)
 cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c $(cram_h) cram/os.h cram/md5.h
@@ -237,6 +240,7 @@ tabix.o: tabix.c $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) htslib/kseq.h $
 # For tests that might use it, set $REF_PATH explicitly to use only reference
 # areas within the test suite (or set it to ':' to use no reference areas).
 check test: $(BUILT_TEST_PROGRAMS)
+	test/test-regidx
 	test/fieldarith test/fieldarith.sam
 	test/hfile
 	test/sam
@@ -261,12 +265,16 @@ test/test-vcf-api: test/test-vcf-api.o libhts.a
 test/test-vcf-sweep: test/test-vcf-sweep.o libhts.a
 	$(CC) -pthread $(LDFLAGS) -o $@ test/test-vcf-sweep.o libhts.a $(LDLIBS) -lz
 
+test/test-regidx: test/test-regidx.o libhts.a
+	$(CC) -pthread $(LDFLAGS) -o $@ test/test-regidx.o libhts.a $(LDLIBS) -lz
+
 test/fieldarith.o: test/fieldarith.c $(htslib_sam_h)
 test/hfile.o: test/hfile.c $(htslib_hfile_h) $(htslib_hts_defs_h)
 test/sam.o: test/sam.c $(htslib_sam_h) htslib/kstring.h
 test/test_view.o: test/test_view.c $(cram_h) $(htslib_sam_h)
 test/test-vcf-api.o: test/test-vcf-api.c $(htslib_hts_h) $(htslib_vcf_h) htslib/kstring.h
 test/test-vcf-sweep.o: test/test-vcf-sweep.c $(htslib_vcf_sweep_h)
+test/test-regidx.o: test/test-regidx.c $(htslib_regidx_h)
 
 
 install: libhts.a $(BUILT_PROGRAMS) installdirs install-$(SHLIB_FLAVOUR) install-pkgconfig
diff --git a/htslib/khash_str2int.h b/htslib/khash_str2int.h
index 8c4f5a6..4bbc100 100644
--- a/htslib/khash_str2int.h
+++ b/htslib/khash_str2int.h
@@ -121,4 +121,13 @@ static inline int khash_str2int_set(void *_hash, const char *str, int value)
     return k;
 }
 
+/*
+ *  Return the number of keys in the hash table.
+ */
+static inline int khash_str2int_size(void *_hash)
+{
+    khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+    return kh_size(hash);
+}
+
 #endif
diff --git a/htslib/regidx.h b/htslib/regidx.h
new file mode 100644
index 0000000..74dc8c7
--- /dev/null
+++ b/htslib/regidx.h
@@ -0,0 +1,136 @@
+/* 
+    Copyright (C) 2014 Genome Research Ltd.
+
+    Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+    
+    The above copyright notice and this permission notice shall be included in
+    all copies or substantial portions of the Software.
+    
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+    THE SOFTWARE.
+*/
+
+/*
+    Regions indexing with an optional payload. Inspired by samtools/bedidx.c.
+    This code is intended as future replacement of bcf_sr_regions_t.
+
+    Example of usage:
+
+        // Init the parser and print regions. In this example the payload is a
+        // pointer to a string. For the description of parse_custom and
+        // free_custom functions, see regidx_parse_f and regidx_free_f below,
+        // and for working example see test/test-regidx.c.
+        regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL);
+
+        // Query overlap with chr:from-to
+        regitr_t itr;
+        if ( regidx_overlap(idx, chr,from,to, &itr) ) printf("There is an overlap!\n");
+
+        while ( REGITR_OVERLAP(itr,from,to) )
+        {
+            printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", from,to, 
+                REGITR_START(itr), REGITR_END(itr), REGITR_PAYLOAD(itr,char*));
+            itr.i++;
+        }
+
+        regidx_destroy(regs);
+*/
+
+#ifndef __REGIDX_H__
+#define __REGIDX_H__
+
+#include <inttypes.h>
+
+typedef struct _regidx_t regidx_t;
+typedef struct
+{
+    uint32_t start, end;
+}
+reg_t;
+typedef struct
+{
+    int i, n;
+    reg_t *reg;
+    void *payload;
+}
+regitr_t;
+
+#define REGITR_START(itr) (itr).reg[(itr).i].start
+#define REGITR_END(itr)   (itr).reg[(itr).i].end
+#define REGITR_PAYLOAD(itr,type_t) ((type_t*)(itr).payload)[(itr).i]
+#define REGITR_OVERLAP(itr,from,to) (itr.i < itr.n && REGITR_START(itr)<=to && REGITR_END(itr)>=from )
+
+/*
+ *  regidx_parse_f - Function to parse one input line, such as regidx_parse_bed
+ *  or regidx_parse_tab below. The function is expected to set `chr_from` and
+ *  `chr_to` to point to first and last character of chromosome name and set
+ *  coordinates `reg->start` and `reg->end` (0-based, inclusive). If
+ *  regidx_init() was called with non-zero payload_size, the `payload` points
+ *  to a memory location of the payload_size and `usr` is data passed to
+ *  regidx_init(). Any memory allocated by the function will be freed by
+ *  regidx_free_f on regidx_destroy().
+ *
+ *  Return value: 0 on success, -1 to skip a record, -2 on fatal error.
+ */
+typedef int  (*regidx_parse_f)(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr);
+typedef void (*regidx_free_f)(void *payload);
+
+int regidx_parse_bed(const char*,char**,char**,reg_t*,void*,void*);   // CHROM,FROM,TO (0-based,right-open)
+int regidx_parse_tab(const char*,char**,char**,reg_t*,void*,void*);   // CHROM,POS (1-based, inclusive)
+
+/*
+ *  regidx_init() - creates new index
+ *  @param fname:  input file name or NULL if regions will be added one-by-one via regidx_insert()
+ *  @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f
+ *  @param freef:  NULL or see description of regidx_parse_f
+ *  @param payload_size: 0 with regidx_parse_bed, regidx_parse_tab or see regidx_parse_f
+ *  @param usr:    optional user data passed to regidx_parse_f
+ *
+ *  Returns index on success or NULL on error.
+ */
+regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, ssize_t payload_size, void *usr);
+
+/*
+ *  regidx_destroy() - free memory allocated by regidx_init
+ */
+void regidx_destroy(regidx_t *idx);
+
+/*
+ *  regidx_overlap() - check overlap of the location chr:from-to with regions
+ *  @param start,end:   0-based start, end coordinate (inclusive)
+ *  @param itr:         pointer to iterator, can be NULL if not needed
+ *
+ *  Returns 0 if there is no overlap or 1 if overlap is found. The overlapping
+ *  regions can be iterated as shown in the example above.
+ */
+int regidx_overlap(regidx_t *idx, char *chr, uint32_t start, uint32_t end, regitr_t *itr);
+
+/*
+ *  regidx_insert() - add a new region. 
+ *
+ *  After last region has been added, call regidx_insert(idx,NULL) to
+ *  build the index.
+ *
+ *  Returns 0 on success or -1 on error.
+ */
+int regidx_insert(regidx_t *idx, char *line);
+
+/*
+ *  regidx_seq_names() - return list of all sequence names
+ */
+char **regidx_seq_names(regidx_t *idx, int *n);
+
+#endif
+
diff --git a/htslib_vars.mk b/htslib_vars.mk
index 725e9ee..5a0fcbc 100644
--- a/htslib_vars.mk
+++ b/htslib_vars.mk
@@ -36,3 +36,4 @@ htslib_tbx_h = $(HTSPREFIX)htslib/tbx.h $(htslib_hts_h)
 htslib_vcf_h = $(HTSPREFIX)htslib/vcf.h $(htslib_hts_h) $(HTSPREFIX)htslib/kstring.h
 htslib_vcf_sweep_h = $(HTSPREFIX)htslib/vcf_sweep.h $(htslib_hts_h) $(htslib_vcf_h)
 htslib_vcfutils_h = $(HTSPREFIX)htslib/vcfutils.h $(htslib_vcf_h)
+htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h $(HTSPREFIX)htslib/kstring.h $(HTSPREFIX)htslib/kseq.h  $(HTSPREFIX)htslib/khash_str2int.h
diff --git a/regidx.c b/regidx.c
new file mode 100644
index 0000000..9318ee1
--- /dev/null
+++ b/regidx.c
@@ -0,0 +1,306 @@
+/* 
+    Copyright (C) 2014 Genome Research Ltd.
+
+    Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+    
+    The above copyright notice and this permission notice shall be included in
+    all copies or substantial portions of the Software.
+    
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+    THE SOFTWARE.
+*/
+
+#include "htslib/hts.h"
+#include "htslib/kstring.h"
+#include "htslib/kseq.h"
+#include "htslib/khash_str2int.h"
+#include "htslib/regidx.h"
+
+#define LIDX_SHIFT 13   // number of insignificant index bits
+
+// List of regions for one chromosome
+typedef struct
+{
+    int *idx, nidx;
+    int nregs, mregs;   // n:used, m:alloced
+    reg_t *regs;
+    void *payload;
+}
+reglist_t;
+
+// Container of all sequences
+struct _regidx_t
+{
+    int nseq, mseq;     // n:used, m:alloced
+    reglist_t *seq;     // regions for each sequence
+    void *seq2regs;     // hash for fast lookup from chr name to regions
+    char **seq_names;
+    regidx_free_f free;     // function to free any data allocated by regidx_parse_f
+    regidx_parse_f parse;   // parse one input line
+    void *usr;              // user data to pass to regidx_parse_f
+
+    // temporary data for index initialization
+    kstring_t str;
+    int rid_prev, start_prev, end_prev;
+    int payload_size;
+    void *payload;
+};
+
+char **regidx_seq_names(regidx_t *idx, int *n)
+{
+    *n = idx->nseq;
+    return idx->seq_names;
+}
+
+int _regidx_build_index(regidx_t *idx)
+{
+    int iseq;
+    for (iseq=0; iseq<idx->nseq; iseq++)
+    {
+        reglist_t *list = &idx->seq[iseq];
+        int j,k, imax = 0;   // max index bin
+        for (j=0; j<list->nregs; j++)
+        {
+            int ibeg = list->regs[j].start >> LIDX_SHIFT;
+            int iend = list->regs[j].end >> LIDX_SHIFT;
+            if ( imax < iend + 1 )
+            {
+                int old_imax = imax; 
+                imax = iend + 1;
+                kroundup32(imax);
+                list->idx = (int*) realloc(list->idx, imax*sizeof(int));
+                for (k=old_imax; k<imax; k++) list->idx[k] = -1;
+            }
+            if ( ibeg==iend )
+            {
+                if ( list->idx[ibeg]<0 ) list->idx[ibeg] = j;
+            }
+            else
+            {
+                for (k=ibeg; k<=iend; k++)
+                    if ( list->idx[k]<0 ) list->idx[k] = j;
+            }
+            list->nidx = iend + 1;
+        }
+    }
+    return 0;
+}
+
+int regidx_insert(regidx_t *idx, char *line)
+{
+    if ( !line )
+        return _regidx_build_index(idx);
+
+    char *chr_from, *chr_to;
+    reg_t reg;
+    int ret = idx->parse(line,&chr_from,&chr_to,&reg,idx->payload,idx->usr);
+    if ( ret==-2 ) return -1;   // error
+    if ( ret==-1 ) return 0;    // skip the line
+
+    int rid;
+    idx->str.l = 0;
+    kputsn(chr_from, chr_to-chr_from+1, &idx->str);
+    if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 )
+    {
+        idx->nseq++;
+        int m_prev = idx->mseq;
+        hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq);
+        hts_expand0(char*,idx->nseq,m_prev,idx->seq_names);
+        idx->seq_names[idx->nseq-1] = strdup(idx->str.s);
+        rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]);
+    }
+
+    reglist_t *list = &idx->seq[rid];
+    list->nregs++;
+    int m_prev = list->mregs;
+    hts_expand(reg_t,list->nregs,list->mregs,list->regs);
+    list->regs[list->nregs-1] = reg;
+    if ( idx->payload_size )
+    {
+        if ( m_prev < list->mregs ) list->payload = realloc(list->payload,idx->payload_size*list->mregs);
+        memcpy(list->payload + idx->payload_size*(list->nregs-1), idx->payload, idx->payload_size);
+    }
+
+    if ( idx->rid_prev==rid )
+    {
+        if ( idx->start_prev > reg.start || (idx->start_prev==reg.start && idx->end_prev>reg.end) ) 
+        { 
+            fprintf(stderr,"The regions are not sorted: %s:%d-%d is before %s:%d-%d\n", 
+                idx->str.s,idx->start_prev+1,idx->end_prev+1,idx->str.s,reg.start+1,reg.end+1); 
+            return -1;
+        }
+    }
+    idx->rid_prev = rid;
+    idx->start_prev = reg.start;
+    idx->end_prev = reg.end;
+    return 0;
+}
+
+regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, ssize_t payload_size, void *usr_dat)
+{
+    regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
+    idx->free  = free_f;
+    idx->parse = parser;
+    idx->usr   = usr_dat;
+    idx->seq2regs = khash_str2int_init();
+    idx->rid_prev   = -1;
+    idx->start_prev = -1;
+    idx->end_prev   = -1;
+    idx->payload_size = payload_size;
+    if ( payload_size ) idx->payload = malloc(payload_size);
+
+    if ( !fname ) return idx;
+    
+    htsFile *fp = hts_open(fname,"r");
+    if ( !fp ) return NULL;
+
+    kstring_t str = {0,0,0};
+    while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+    {
+        if ( regidx_insert(idx, str.s) ) goto error;
+    }
+    regidx_insert(idx, NULL);
+    
+    free(str.s);
+    hts_close(fp);
+    return idx;
+
+error:
+    free(str.s);
+    hts_close(fp);
+    regidx_destroy(idx);
+    return NULL;
+}
+
+void regidx_destroy(regidx_t *idx)
+{
+    int i, j;
+    for (i=0; i<idx->nseq; i++)
+    {
+        reglist_t *list = &idx->seq[i];
+        if ( idx->free )
+        {
+            for (j=0; j<list->nregs; j++)
+                idx->free(list->payload + idx->payload_size*j);
+        }
+        free(list->payload);
+        free(list->regs);
+        free(list->idx);
+    }
+    free(idx->seq_names);
+    free(idx->seq);
+    free(idx->str.s);
+    free(idx->payload);
+    khash_str2int_destroy_free(idx->seq2regs);
+    free(idx);
+}
+
+int regidx_overlap(regidx_t *idx, char *chr, uint32_t from, uint32_t to, regitr_t *itr)
+{
+    if ( itr ) itr->i = itr->n = 0;
+
+    int iseq;
+    if ( khash_str2int_get(idx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence
+
+    reglist_t *list = &idx->seq[iseq];
+    if ( !list->nregs ) return 0;
+
+    int i, ibeg = from>>LIDX_SHIFT; 
+    int ireg = ibeg < list->nidx ? list->idx[ibeg] : list->idx[ list->nidx - 1 ];
+    if ( ireg < 0 )
+    {
+        // linear search; if slow, replace with binary search
+        if ( ibeg > list->nidx ) ibeg = list->nidx;
+        for (i=ibeg - 1; i>=0; i--)
+            if ( list->idx[i] >=0 ) break;
+        ireg = i>=0 ? list->idx[i] : 0;
+    }
+    for (i=ireg; i<list->nregs; i++)
+    {
+        if ( list->regs[i].start > to ) return 0;   // no match
+        if ( list->regs[i].end >= from && list->regs[i].start <= to ) break; // found
+    }
+
+    if ( i>=list->nregs ) return 0;   // no match
+
+    if ( !itr ) return 1;
+
+    itr->i = 0;
+    itr->n = list->nregs - i;
+    itr->reg = &idx->seq[iseq].regs[i];
+    if ( idx->payload_size )
+        itr->payload = idx->seq[iseq].payload + i*idx->payload_size;
+    else
+        itr->payload = NULL;
+
+    return 1;
+}
+
+int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+{
+    char *ss = (char*) line;
+    while ( *ss && isspace(*ss) ) ss++;
+    if ( !*ss ) return -1;      // skip blank lines
+    if ( *ss=='#' ) return -1;  // skip comments
+    
+    char *se = ss;
+    while ( *se && !isspace(*se) ) se++;
+    if ( !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+    *chr_beg = ss;
+    *chr_end = se-1;
+
+    ss = se+1;
+    reg->start = strtol(ss, &se, 10);
+    if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+    ss = se+1;
+    reg->end = strtol(ss, &se, 10) - 1;
+    if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+    
+    return 0;
+}
+
+int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+{
+    char *ss = (char*) line;
+    while ( *ss && isspace(*ss) ) ss++;
+    if ( !*ss ) return -1;      // skip blank lines
+    if ( *ss=='#' ) return -1;  // skip comments
+    
+    char *se = ss;
+    while ( *se && !isspace(*se) ) se++;
+    if ( !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+    *chr_beg = ss;
+    *chr_end = se-1;
+
+    ss = se+1;
+    reg->start = strtol(ss, &se, 10) - 1;
+    if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+    if ( !se[0] || !se[1] )
+        reg->end = reg->start;
+    else
+    {
+        ss = se+1;
+        reg->end = strtol(ss, &se, 10);
+        if ( ss==se ) reg->end = reg->start;
+        else reg->end--;
+    }
+    
+    return 0;
+}
+
diff --git a/test/test-regidx.c b/test/test-regidx.c
new file mode 100644
index 0000000..0aea6b8
--- /dev/null
+++ b/test/test-regidx.c
@@ -0,0 +1,116 @@
+/*  test/test-regidx.c -- Regions index test harness.
+
+    Copyright (C) 2014 Genome Research Ltd.
+
+    Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+    
+    The above copyright notice and this permission notice shall be included in
+    all copies or substantial portions of the Software.
+    
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+    THE SOFTWARE.
+*/
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <htslib/regidx.h>
+
+void error(const char *format, ...)
+{
+    va_list ap;
+    va_start(ap, format);
+    vfprintf(stderr, format, ap);
+    va_end(ap);
+    exit(-1);
+}
+
+int custom_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+{
+    // Use the standard parser for CHROM,FROM,TO
+    int i, ret = regidx_parse_tab(line,chr_beg,chr_end,reg,NULL,NULL);
+    if ( ret!=0 ) return ret;
+
+    // Skip the fields that were parsed above
+    char *ss = (char*) line;
+    while ( *ss && isspace(*ss) ) ss++;
+    for (i=0; i<3; i++)
+    {
+        while ( *ss && !isspace(*ss) ) ss++;
+        if ( !*ss ) return -2;  // wrong number of fields
+        while ( *ss && isspace(*ss) ) ss++;
+    }
+    if ( !*ss ) return -2;
+
+    // Parse the payload
+    char *se = ss;
+    while ( *se && !isspace(*se) ) se++;
+    char **dat = (char**) payload;
+    *dat = (char*) malloc(se-ss+1);
+    memcpy(*dat,ss,se-ss+1);
+    (*dat)[se-ss] = 0;
+    return 0;
+}
+void custom_free(void *payload)
+{
+    char **dat = (char**)payload;
+    free(*dat);
+}
+
+int main(int argc, char **argv)
+{
+    // Init index with no file name, we will insert the regions manually
+    regidx_t *idx = regidx_init(NULL,custom_parse,custom_free,sizeof(char*),NULL);
+    if ( !idx ) error("init failed\n");
+
+    // Insert regions
+    char *line;
+    line = "1 10000000 10000000 1:10000000-10000000"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line);
+    line = "1 20000000 20000001 1:20000000-20000001"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line);
+    line = "1 20000002 20000002 1:20000002-20000002"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line);
+    line = "1 30000000 30000000 1:30000000-30000000"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line);
+
+    // Finish initialization
+    regidx_insert(idx,NULL);
+
+    // Test 
+    regitr_t itr;
+    int from, to;
+
+    from = to = 10000000;
+    if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to);
+    if ( strcmp("1:10000000-10000000",REGITR_PAYLOAD(itr,char*)) ) error("query failed: 1:%d-%d vs %s\n", from,to,REGITR_PAYLOAD(itr,char*));
+    if ( !regidx_overlap(idx,"1",from-2,to-1,&itr) ) error("query failed: 1:%d-%d\n",from-1,to);
+    if ( !regidx_overlap(idx,"1",from-2,to+3,&itr) ) error("query failed: 1:%d-%d\n",from-1,to+2);
+    if ( regidx_overlap(idx,"1",from-2,to-2,&itr) ) error("query failed: 1:%d-%d\n",from-1,to-1);
+
+    from = to = 20000000;
+    if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to);
+
+    from = to = 20000002;
+    if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to);
+
+    from = to = 30000000;
+    if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to);
+
+    // Clean up
+    regidx_destroy(idx);
+    
+    return 0;
+}
+
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/htslib.git



More information about the debian-med-commit mailing list