[med-svn] [htslib] 01/10: Added regidx API which should replace bcf_sr_regions at some point
Charles Plessy
plessy at moszumanska.debian.org
Sun Oct 5 00:54:09 UTC 2014
This is an automated email from the git hooks/post-receive script.
plessy pushed a commit to branch debian/unstable
in repository htslib.
commit 5b98adcd6c52cdabcf6f71877ca94d370edf5eb6
Author: Petr Danecek <pd3 at sanger.ac.uk>
Date: Thu Sep 25 10:18:44 2014 +0100
Added regidx API which should replace bcf_sr_regions at some point
---
Makefile | 12 +-
htslib/khash_str2int.h | 9 ++
htslib/regidx.h | 136 ++++++++++++++++++++++
htslib_vars.mk | 1 +
regidx.c | 306 +++++++++++++++++++++++++++++++++++++++++++++++++
test/test-regidx.c | 116 +++++++++++++++++++
6 files changed, 578 insertions(+), 2 deletions(-)
diff --git a/Makefile b/Makefile
index ed9c9ce..cbc5360 100644
--- a/Makefile
+++ b/Makefile
@@ -60,7 +60,8 @@ BUILT_TEST_PROGRAMS = \
test/sam \
test/test_view \
test/test-vcf-api \
- test/test-vcf-sweep
+ test/test-vcf-sweep \
+ test/test-regidx
all: lib-static lib-shared $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS)
@@ -154,7 +155,8 @@ LIBHTS_OBJS = \
cram/string_alloc.o \
cram/thread_pool.o \
cram/vlen.o \
- cram/zfio.o
+ cram/zfio.o \
+ regidx.o
libhts.a: $(LIBHTS_OBJS)
@@ -204,6 +206,7 @@ synced_bcf_reader.o synced_bcf_reader.pico: synced_bcf_reader.c $(htslib_synced_
vcf_sweep.o vcf_sweep.pico: vcf_sweep.c $(htslib_vcf_sweep_h) $(htslib_bgzf_h)
vcfutils.o vcfutils.pico: vcfutils.c $(htslib_vcfutils_h)
kfunc.o kfunc.pico: kfunc.c htslib/kfunc.h
+regidx.o regidx.pico: regidx.c $(htslib_regidx_h)
cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c $(cram_h)
cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c $(cram_h) cram/os.h cram/md5.h
@@ -237,6 +240,7 @@ tabix.o: tabix.c $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) htslib/kseq.h $
# For tests that might use it, set $REF_PATH explicitly to use only reference
# areas within the test suite (or set it to ':' to use no reference areas).
check test: $(BUILT_TEST_PROGRAMS)
+ test/test-regidx
test/fieldarith test/fieldarith.sam
test/hfile
test/sam
@@ -261,12 +265,16 @@ test/test-vcf-api: test/test-vcf-api.o libhts.a
test/test-vcf-sweep: test/test-vcf-sweep.o libhts.a
$(CC) -pthread $(LDFLAGS) -o $@ test/test-vcf-sweep.o libhts.a $(LDLIBS) -lz
+test/test-regidx: test/test-regidx.o libhts.a
+ $(CC) -pthread $(LDFLAGS) -o $@ test/test-regidx.o libhts.a $(LDLIBS) -lz
+
test/fieldarith.o: test/fieldarith.c $(htslib_sam_h)
test/hfile.o: test/hfile.c $(htslib_hfile_h) $(htslib_hts_defs_h)
test/sam.o: test/sam.c $(htslib_sam_h) htslib/kstring.h
test/test_view.o: test/test_view.c $(cram_h) $(htslib_sam_h)
test/test-vcf-api.o: test/test-vcf-api.c $(htslib_hts_h) $(htslib_vcf_h) htslib/kstring.h
test/test-vcf-sweep.o: test/test-vcf-sweep.c $(htslib_vcf_sweep_h)
+test/test-regidx.o: test/test-regidx.c $(htslib_regidx_h)
install: libhts.a $(BUILT_PROGRAMS) installdirs install-$(SHLIB_FLAVOUR) install-pkgconfig
diff --git a/htslib/khash_str2int.h b/htslib/khash_str2int.h
index 8c4f5a6..4bbc100 100644
--- a/htslib/khash_str2int.h
+++ b/htslib/khash_str2int.h
@@ -121,4 +121,13 @@ static inline int khash_str2int_set(void *_hash, const char *str, int value)
return k;
}
+/*
+ * Return the number of keys in the hash table.
+ */
+static inline int khash_str2int_size(void *_hash)
+{
+ khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+ return kh_size(hash);
+}
+
#endif
diff --git a/htslib/regidx.h b/htslib/regidx.h
new file mode 100644
index 0000000..74dc8c7
--- /dev/null
+++ b/htslib/regidx.h
@@ -0,0 +1,136 @@
+/*
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+/*
+ Regions indexing with an optional payload. Inspired by samtools/bedidx.c.
+ This code is intended as future replacement of bcf_sr_regions_t.
+
+ Example of usage:
+
+ // Init the parser and print regions. In this example the payload is a
+ // pointer to a string. For the description of parse_custom and
+ // free_custom functions, see regidx_parse_f and regidx_free_f below,
+ // and for working example see test/test-regidx.c.
+ regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL);
+
+ // Query overlap with chr:from-to
+ regitr_t itr;
+ if ( regidx_overlap(idx, chr,from,to, &itr) ) printf("There is an overlap!\n");
+
+ while ( REGITR_OVERLAP(itr,from,to) )
+ {
+ printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", from,to,
+ REGITR_START(itr), REGITR_END(itr), REGITR_PAYLOAD(itr,char*));
+ itr.i++;
+ }
+
+ regidx_destroy(regs);
+*/
+
+#ifndef __REGIDX_H__
+#define __REGIDX_H__
+
+#include <inttypes.h>
+
+typedef struct _regidx_t regidx_t;
+typedef struct
+{
+ uint32_t start, end;
+}
+reg_t;
+typedef struct
+{
+ int i, n;
+ reg_t *reg;
+ void *payload;
+}
+regitr_t;
+
+#define REGITR_START(itr) (itr).reg[(itr).i].start
+#define REGITR_END(itr) (itr).reg[(itr).i].end
+#define REGITR_PAYLOAD(itr,type_t) ((type_t*)(itr).payload)[(itr).i]
+#define REGITR_OVERLAP(itr,from,to) (itr.i < itr.n && REGITR_START(itr)<=to && REGITR_END(itr)>=from )
+
+/*
+ * regidx_parse_f - Function to parse one input line, such as regidx_parse_bed
+ * or regidx_parse_tab below. The function is expected to set `chr_from` and
+ * `chr_to` to point to first and last character of chromosome name and set
+ * coordinates `reg->start` and `reg->end` (0-based, inclusive). If
+ * regidx_init() was called with non-zero payload_size, the `payload` points
+ * to a memory location of the payload_size and `usr` is data passed to
+ * regidx_init(). Any memory allocated by the function will be freed by
+ * regidx_free_f on regidx_destroy().
+ *
+ * Return value: 0 on success, -1 to skip a record, -2 on fatal error.
+ */
+typedef int (*regidx_parse_f)(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr);
+typedef void (*regidx_free_f)(void *payload);
+
+int regidx_parse_bed(const char*,char**,char**,reg_t*,void*,void*); // CHROM,FROM,TO (0-based,right-open)
+int regidx_parse_tab(const char*,char**,char**,reg_t*,void*,void*); // CHROM,POS (1-based, inclusive)
+
+/*
+ * regidx_init() - creates new index
+ * @param fname: input file name or NULL if regions will be added one-by-one via regidx_insert()
+ * @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f
+ * @param freef: NULL or see description of regidx_parse_f
+ * @param payload_size: 0 with regidx_parse_bed, regidx_parse_tab or see regidx_parse_f
+ * @param usr: optional user data passed to regidx_parse_f
+ *
+ * Returns index on success or NULL on error.
+ */
+regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, ssize_t payload_size, void *usr);
+
+/*
+ * regidx_destroy() - free memory allocated by regidx_init
+ */
+void regidx_destroy(regidx_t *idx);
+
+/*
+ * regidx_overlap() - check overlap of the location chr:from-to with regions
+ * @param start,end: 0-based start, end coordinate (inclusive)
+ * @param itr: pointer to iterator, can be NULL if not needed
+ *
+ * Returns 0 if there is no overlap or 1 if overlap is found. The overlapping
+ * regions can be iterated as shown in the example above.
+ */
+int regidx_overlap(regidx_t *idx, char *chr, uint32_t start, uint32_t end, regitr_t *itr);
+
+/*
+ * regidx_insert() - add a new region.
+ *
+ * After last region has been added, call regidx_insert(idx,NULL) to
+ * build the index.
+ *
+ * Returns 0 on success or -1 on error.
+ */
+int regidx_insert(regidx_t *idx, char *line);
+
+/*
+ * regidx_seq_names() - return list of all sequence names
+ */
+char **regidx_seq_names(regidx_t *idx, int *n);
+
+#endif
+
diff --git a/htslib_vars.mk b/htslib_vars.mk
index 725e9ee..5a0fcbc 100644
--- a/htslib_vars.mk
+++ b/htslib_vars.mk
@@ -36,3 +36,4 @@ htslib_tbx_h = $(HTSPREFIX)htslib/tbx.h $(htslib_hts_h)
htslib_vcf_h = $(HTSPREFIX)htslib/vcf.h $(htslib_hts_h) $(HTSPREFIX)htslib/kstring.h
htslib_vcf_sweep_h = $(HTSPREFIX)htslib/vcf_sweep.h $(htslib_hts_h) $(htslib_vcf_h)
htslib_vcfutils_h = $(HTSPREFIX)htslib/vcfutils.h $(htslib_vcf_h)
+htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h $(HTSPREFIX)htslib/kstring.h $(HTSPREFIX)htslib/kseq.h $(HTSPREFIX)htslib/khash_str2int.h
diff --git a/regidx.c b/regidx.c
new file mode 100644
index 0000000..9318ee1
--- /dev/null
+++ b/regidx.c
@@ -0,0 +1,306 @@
+/*
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include "htslib/hts.h"
+#include "htslib/kstring.h"
+#include "htslib/kseq.h"
+#include "htslib/khash_str2int.h"
+#include "htslib/regidx.h"
+
+#define LIDX_SHIFT 13 // number of insignificant index bits
+
+// List of regions for one chromosome
+typedef struct
+{
+ int *idx, nidx;
+ int nregs, mregs; // n:used, m:alloced
+ reg_t *regs;
+ void *payload;
+}
+reglist_t;
+
+// Container of all sequences
+struct _regidx_t
+{
+ int nseq, mseq; // n:used, m:alloced
+ reglist_t *seq; // regions for each sequence
+ void *seq2regs; // hash for fast lookup from chr name to regions
+ char **seq_names;
+ regidx_free_f free; // function to free any data allocated by regidx_parse_f
+ regidx_parse_f parse; // parse one input line
+ void *usr; // user data to pass to regidx_parse_f
+
+ // temporary data for index initialization
+ kstring_t str;
+ int rid_prev, start_prev, end_prev;
+ int payload_size;
+ void *payload;
+};
+
+char **regidx_seq_names(regidx_t *idx, int *n)
+{
+ *n = idx->nseq;
+ return idx->seq_names;
+}
+
+int _regidx_build_index(regidx_t *idx)
+{
+ int iseq;
+ for (iseq=0; iseq<idx->nseq; iseq++)
+ {
+ reglist_t *list = &idx->seq[iseq];
+ int j,k, imax = 0; // max index bin
+ for (j=0; j<list->nregs; j++)
+ {
+ int ibeg = list->regs[j].start >> LIDX_SHIFT;
+ int iend = list->regs[j].end >> LIDX_SHIFT;
+ if ( imax < iend + 1 )
+ {
+ int old_imax = imax;
+ imax = iend + 1;
+ kroundup32(imax);
+ list->idx = (int*) realloc(list->idx, imax*sizeof(int));
+ for (k=old_imax; k<imax; k++) list->idx[k] = -1;
+ }
+ if ( ibeg==iend )
+ {
+ if ( list->idx[ibeg]<0 ) list->idx[ibeg] = j;
+ }
+ else
+ {
+ for (k=ibeg; k<=iend; k++)
+ if ( list->idx[k]<0 ) list->idx[k] = j;
+ }
+ list->nidx = iend + 1;
+ }
+ }
+ return 0;
+}
+
+int regidx_insert(regidx_t *idx, char *line)
+{
+ if ( !line )
+ return _regidx_build_index(idx);
+
+ char *chr_from, *chr_to;
+ reg_t reg;
+ int ret = idx->parse(line,&chr_from,&chr_to,®,idx->payload,idx->usr);
+ if ( ret==-2 ) return -1; // error
+ if ( ret==-1 ) return 0; // skip the line
+
+ int rid;
+ idx->str.l = 0;
+ kputsn(chr_from, chr_to-chr_from+1, &idx->str);
+ if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 )
+ {
+ idx->nseq++;
+ int m_prev = idx->mseq;
+ hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq);
+ hts_expand0(char*,idx->nseq,m_prev,idx->seq_names);
+ idx->seq_names[idx->nseq-1] = strdup(idx->str.s);
+ rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]);
+ }
+
+ reglist_t *list = &idx->seq[rid];
+ list->nregs++;
+ int m_prev = list->mregs;
+ hts_expand(reg_t,list->nregs,list->mregs,list->regs);
+ list->regs[list->nregs-1] = reg;
+ if ( idx->payload_size )
+ {
+ if ( m_prev < list->mregs ) list->payload = realloc(list->payload,idx->payload_size*list->mregs);
+ memcpy(list->payload + idx->payload_size*(list->nregs-1), idx->payload, idx->payload_size);
+ }
+
+ if ( idx->rid_prev==rid )
+ {
+ if ( idx->start_prev > reg.start || (idx->start_prev==reg.start && idx->end_prev>reg.end) )
+ {
+ fprintf(stderr,"The regions are not sorted: %s:%d-%d is before %s:%d-%d\n",
+ idx->str.s,idx->start_prev+1,idx->end_prev+1,idx->str.s,reg.start+1,reg.end+1);
+ return -1;
+ }
+ }
+ idx->rid_prev = rid;
+ idx->start_prev = reg.start;
+ idx->end_prev = reg.end;
+ return 0;
+}
+
+regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, ssize_t payload_size, void *usr_dat)
+{
+ regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
+ idx->free = free_f;
+ idx->parse = parser;
+ idx->usr = usr_dat;
+ idx->seq2regs = khash_str2int_init();
+ idx->rid_prev = -1;
+ idx->start_prev = -1;
+ idx->end_prev = -1;
+ idx->payload_size = payload_size;
+ if ( payload_size ) idx->payload = malloc(payload_size);
+
+ if ( !fname ) return idx;
+
+ htsFile *fp = hts_open(fname,"r");
+ if ( !fp ) return NULL;
+
+ kstring_t str = {0,0,0};
+ while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+ {
+ if ( regidx_insert(idx, str.s) ) goto error;
+ }
+ regidx_insert(idx, NULL);
+
+ free(str.s);
+ hts_close(fp);
+ return idx;
+
+error:
+ free(str.s);
+ hts_close(fp);
+ regidx_destroy(idx);
+ return NULL;
+}
+
+void regidx_destroy(regidx_t *idx)
+{
+ int i, j;
+ for (i=0; i<idx->nseq; i++)
+ {
+ reglist_t *list = &idx->seq[i];
+ if ( idx->free )
+ {
+ for (j=0; j<list->nregs; j++)
+ idx->free(list->payload + idx->payload_size*j);
+ }
+ free(list->payload);
+ free(list->regs);
+ free(list->idx);
+ }
+ free(idx->seq_names);
+ free(idx->seq);
+ free(idx->str.s);
+ free(idx->payload);
+ khash_str2int_destroy_free(idx->seq2regs);
+ free(idx);
+}
+
+int regidx_overlap(regidx_t *idx, char *chr, uint32_t from, uint32_t to, regitr_t *itr)
+{
+ if ( itr ) itr->i = itr->n = 0;
+
+ int iseq;
+ if ( khash_str2int_get(idx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence
+
+ reglist_t *list = &idx->seq[iseq];
+ if ( !list->nregs ) return 0;
+
+ int i, ibeg = from>>LIDX_SHIFT;
+ int ireg = ibeg < list->nidx ? list->idx[ibeg] : list->idx[ list->nidx - 1 ];
+ if ( ireg < 0 )
+ {
+ // linear search; if slow, replace with binary search
+ if ( ibeg > list->nidx ) ibeg = list->nidx;
+ for (i=ibeg - 1; i>=0; i--)
+ if ( list->idx[i] >=0 ) break;
+ ireg = i>=0 ? list->idx[i] : 0;
+ }
+ for (i=ireg; i<list->nregs; i++)
+ {
+ if ( list->regs[i].start > to ) return 0; // no match
+ if ( list->regs[i].end >= from && list->regs[i].start <= to ) break; // found
+ }
+
+ if ( i>=list->nregs ) return 0; // no match
+
+ if ( !itr ) return 1;
+
+ itr->i = 0;
+ itr->n = list->nregs - i;
+ itr->reg = &idx->seq[iseq].regs[i];
+ if ( idx->payload_size )
+ itr->payload = idx->seq[iseq].payload + i*idx->payload_size;
+ else
+ itr->payload = NULL;
+
+ return 1;
+}
+
+int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+{
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+ if ( !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+ *chr_beg = ss;
+ *chr_end = se-1;
+
+ ss = se+1;
+ reg->start = strtol(ss, &se, 10);
+ if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+ ss = se+1;
+ reg->end = strtol(ss, &se, 10) - 1;
+ if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+ return 0;
+}
+
+int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+{
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+ if ( !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+ *chr_beg = ss;
+ *chr_end = se-1;
+
+ ss = se+1;
+ reg->start = strtol(ss, &se, 10) - 1;
+ if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+ if ( !se[0] || !se[1] )
+ reg->end = reg->start;
+ else
+ {
+ ss = se+1;
+ reg->end = strtol(ss, &se, 10);
+ if ( ss==se ) reg->end = reg->start;
+ else reg->end--;
+ }
+
+ return 0;
+}
+
diff --git a/test/test-regidx.c b/test/test-regidx.c
new file mode 100644
index 0000000..0aea6b8
--- /dev/null
+++ b/test/test-regidx.c
@@ -0,0 +1,116 @@
+/* test/test-regidx.c -- Regions index test harness.
+
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <htslib/regidx.h>
+
+void error(const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ vfprintf(stderr, format, ap);
+ va_end(ap);
+ exit(-1);
+}
+
+int custom_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+{
+ // Use the standard parser for CHROM,FROM,TO
+ int i, ret = regidx_parse_tab(line,chr_beg,chr_end,reg,NULL,NULL);
+ if ( ret!=0 ) return ret;
+
+ // Skip the fields that were parsed above
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ for (i=0; i<3; i++)
+ {
+ while ( *ss && !isspace(*ss) ) ss++;
+ if ( !*ss ) return -2; // wrong number of fields
+ while ( *ss && isspace(*ss) ) ss++;
+ }
+ if ( !*ss ) return -2;
+
+ // Parse the payload
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+ char **dat = (char**) payload;
+ *dat = (char*) malloc(se-ss+1);
+ memcpy(*dat,ss,se-ss+1);
+ (*dat)[se-ss] = 0;
+ return 0;
+}
+void custom_free(void *payload)
+{
+ char **dat = (char**)payload;
+ free(*dat);
+}
+
+int main(int argc, char **argv)
+{
+ // Init index with no file name, we will insert the regions manually
+ regidx_t *idx = regidx_init(NULL,custom_parse,custom_free,sizeof(char*),NULL);
+ if ( !idx ) error("init failed\n");
+
+ // Insert regions
+ char *line;
+ line = "1 10000000 10000000 1:10000000-10000000"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line);
+ line = "1 20000000 20000001 1:20000000-20000001"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line);
+ line = "1 20000002 20000002 1:20000002-20000002"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line);
+ line = "1 30000000 30000000 1:30000000-30000000"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line);
+
+ // Finish initialization
+ regidx_insert(idx,NULL);
+
+ // Test
+ regitr_t itr;
+ int from, to;
+
+ from = to = 10000000;
+ if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to);
+ if ( strcmp("1:10000000-10000000",REGITR_PAYLOAD(itr,char*)) ) error("query failed: 1:%d-%d vs %s\n", from,to,REGITR_PAYLOAD(itr,char*));
+ if ( !regidx_overlap(idx,"1",from-2,to-1,&itr) ) error("query failed: 1:%d-%d\n",from-1,to);
+ if ( !regidx_overlap(idx,"1",from-2,to+3,&itr) ) error("query failed: 1:%d-%d\n",from-1,to+2);
+ if ( regidx_overlap(idx,"1",from-2,to-2,&itr) ) error("query failed: 1:%d-%d\n",from-1,to-1);
+
+ from = to = 20000000;
+ if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to);
+
+ from = to = 20000002;
+ if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to);
+
+ from = to = 30000000;
+ if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to);
+
+ // Clean up
+ regidx_destroy(idx);
+
+ return 0;
+}
+
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/htslib.git
More information about the debian-med-commit
mailing list